# FastText

Authors:
* Aurelien ROUXEL
* Ethan MACHAVOINE
* Jonathan POELGER

In [3]:
import datasets as ds
import fasttext
import numpy as np
import string
import random
from sklearn.model_selection import train_test_split
# Seed Python's global RNG so the random.shuffle calls later in the notebook are repeatable.
random.seed(42)

In [4]:
# Load the predefined 'train' and 'test' splits of the IMDB dataset, in that order.
ds_train, ds_test = (
    ds.load_dataset('imdb', split=split_name) for split_name in ('train', 'test')
)

Found cached dataset imdb (/home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/ethan/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


### 1. Preprocessing

In [5]:
def preprocessing(base_text: str) -> str:
  """
  Preprocess the text before classification.

  The text is lower-cased, every "<br />" tag is replaced by a single space,
  and every character of string.punctuation is replaced by one space.

  Args:
    base_text: the string to preprocess
  Return:
    The preprocessed text
  """
  # Map each punctuation character to a space and apply the mapping in a
  # single pass instead of rebuilding the string character by character.
  punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
  without_tags = base_text.lower().replace("<br />", ' ')
  return without_tags.translate(punctuation_to_space)

In [6]:
def text_label(label):
    """
    Convert a numeric label into its textual name.
    Args:
        label: the numeric label of an example
    Return:
        "negative" when the label equals 0, "positive" for any other value
    """
    return "negative" if label == 0 else "positive"

In [7]:
def _to_fasttext_line(example):
    """Format one dataset example as a '__label__<name> <text>' line ending with a newline."""
    tag = text_label(example['label'])
    body = preprocessing(example['text'])
    return f"__label__{tag} {body}\n"


train_set = list(map(_to_fasttext_line, ds_train))
test_set = list(map(_to_fasttext_line, ds_test))

# Shuffle the training lines first, then the test lines (in place).
random.shuffle(train_set)
random.shuffle(test_set)

In [8]:
# fastText expects UTF-8 input, so write the files with an explicit encoding
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open("imdb.train", "w", encoding="utf-8") as f:
    f.writelines(train_set)
with open("imdb.test", "w", encoding="utf-8") as f:
    f.writelines(test_set)

### 2. Train a FastText classifier

In [9]:
# Baseline classifier: only the training file is passed, so no hyperparameter is set explicitly.
# NOTE(review): `model` is rebound by the autotune cell further down, so this
# baseline is no longer reachable once that cell has run.
model = fasttext.train_supervised(input="imdb.train")

Read 6M words
Number of words:  75900
Number of labels: 2
Progress: 100.0% words/sec/thread: 4541135 lr:  0.000000 avg.loss:  0.388679 ETA:   0h 0m 0s


Results:
* Read 6M words
* Number of words:  75900
* Number of labels: 2
* Progress: 100.0% words/sec/thread: 4541135 lr:  0.000000 avg.loss:  0.388679 ETA:   0h 0m 0s

In [10]:
def get_true_values(model, test_set):
    """
    Count the examples of `test_set` whose label is predicted correctly.

    Args:
        model: object exposing `predict(text)`; the first element of its
            result must be the sequence of predicted labels
        test_set: iterable of lines made of a label, one space, the text and
            an optional trailing newline
    Return:
        The number of lines whose first predicted label equals the label
        written at the start of the line
    """
    values = 0
    for line in test_set:
        # Split on the first space instead of slicing a fixed 17 characters,
        # so labels of any length work and the gold label is not handed to
        # the model together with the text.
        label, _, body = line.rstrip("\n").partition(" ")
        predicted = model.predict(body)[0]
        # An empty prediction counts as a miss instead of raising IndexError.
        if len(predicted) > 0 and predicted[0] == label:
            values += 1
    return values

def compute_accuracy(model, test_set):
    """
    Compute the accuracy of `model` on `test_set`.

    Args:
        model: classifier forwarded to `get_true_values`
        test_set: iterable of labelled lines, forwarded to `get_true_values`
    Return:
        The fraction of correctly predicted examples, or 0.0 when `test_set`
        is empty
    """
    # Materialize once so generators can be both scored and counted.
    examples = list(test_set)
    if not examples:
        return 0.0
    # Divide by the size of the set that was actually scored. The sample count
    # previously came from model.test("imdb.test"), which ran a second full
    # evaluation and gave a wrong ratio for any set other than that file.
    correct = get_true_values(model, examples)
    return correct / len(examples)

In [11]:
# Score the model trained in the previous section on the shuffled test lines.
accuracy = compute_accuracy(model, test_set)
print(f"Accuracy: {accuracy}")

Accuracy: 0.879


Result:
* Accuracy: 0.879

### 3. Use the hyperparameter search functionality

In [12]:
# Hold out 20% of the training lines as a validation set for the hyperparameter search.
training_set, validation_set = train_test_split(train_set, test_size=0.2, random_state=42)
# NOTE(review): train_test_split shuffles by default according to the scikit-learn
# docs, so these extra shuffles look redundant — confirm before removing them,
# since they also advance the global `random` state.
random.shuffle(training_set)
random.shuffle(validation_set)

In [13]:
# fastText expects UTF-8 input, so write the files with an explicit encoding
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open("imdb.training.hyperparameter", "w", encoding="utf-8") as f:
    f.writelines(training_set)
with open("imdb.validation.hyperparameter", "w", encoding="utf-8") as f:
    f.writelines(validation_set)

In [14]:
# Train on the 80% split and let fastText autotune against the validation file.
# This rebinds `model`, replacing the classifier trained earlier.
model = fasttext.train_supervised(
    input='imdb.training.hyperparameter',
    autotuneValidationFile='imdb.validation.hyperparameter',
)

Progress: 100.0% Trials:   11 Best score:  0.899000 ETA:   0h 0m 0s
Training again with best arguments
Read 4M words
Number of words:  69077
Number of labels: 2
Progress: 100.0% words/sec/thread: 1881838 lr:  0.000000 avg.loss:  0.043658 ETA:   0h 0m 0s


Results:
* Progress: 100.0% Trials:   11 Best score:  0.899000 ETA:   0h 0m 0s
* Training again with best arguments
* Read 4M words
* Number of words:  69077
* Number of labels: 2
* Progress: 100.0% words/sec/thread: 1881838 lr:  0.000000 avg.loss:  0.043658 ETA:   0h 0m 0s

In [15]:
# Score the autotuned model on the same test lines; this overwrites the earlier `accuracy` value.
accuracy = compute_accuracy(model, test_set)
print(f"Accuracy: {accuracy}")

Accuracy: 0.89588


Result:
* Accuracy: 0.89588

### 4. Look at the differences between the 2 models