In [1]:
import fasttext
from datasets import load_dataset
import pandas as pd
import csv
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Dataset identifier passed to `load_dataset`; the 'train' and 'test' splits
# of the result are used below.
IMDB_DATASET_ID = 'stanfordnlp/imdb'
dataset = load_dataset(IMDB_DATASET_ID)

In [3]:
# Materialise the train and test splits as pandas DataFrames (train first,
# then test, matching the unpacking order).
df_train, df_test = (
    pd.DataFrame(dataset[split_name]) for split_name in ('train', 'test')
)

In [4]:
def get_fasttext_label(label):
    """Map a numeric sentiment label to its fastText label token.

    Parameters
    ----------
    label : int
        0 for a negative review, 1 for a positive review.

    Returns
    -------
    str
        '__label__neg' for 0, '__label__pos' for 1.

    Raises
    ------
    ValueError
        If ``label`` is neither 0 nor 1. Failing loudly prevents a missing
        label from being written silently into a fastText training file.
    """
    if label == 0:
        return '__label__neg'
    if label == 1:
        return '__label__pos'
    raise ValueError('Unsupported sentiment label: {!r}'.format(label))

In [5]:
# Add the fastText label token column to both splits.
for frame in (df_train, df_test):
    frame['fasttext_label'] = frame['label'].apply(get_fasttext_label)

In [6]:
# Preview the first ten training rows, including the new label column.
df_train.head(n=10)

Unnamed: 0,text,label,fasttext_label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,__label__neg
1,"""I Am Curious: Yellow"" is a risible and preten...",0,__label__neg
2,If only to avoid making this type of film in t...,0,__label__neg
3,This film was probably inspired by Godard's Ma...,0,__label__neg
4,"Oh, brother...after hearing about this ridicul...",0,__label__neg
5,I would put this at the top of my list of film...,0,__label__neg
6,Whoever wrote the screenplay for this movie ob...,0,__label__neg
7,"When I first saw a glimpse of this movie, I qu...",0,__label__neg
8,"Who are these ""They""- the actors? the filmmake...",0,__label__neg
9,This is said to be a personal film for Peter B...,0,__label__neg


In [7]:
def write_fasttext_file(frame, path):
    """Write ``frame`` as a fastText supervised-training file.

    Each row becomes one line of the form ``<fasttext_label> <text>``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'fasttext_label' and 'text'.
    path : str
        Destination file; overwritten if it already exists.
    """
    # Written directly rather than through a CSV writer: fastText expects
    # plain whitespace-separated lines, and CSV escaping would alter the text.
    with open(path, 'w', encoding='utf-8') as handle:
        for label, text in zip(frame['fasttext_label'], frame['text']):
            # fastText treats every line as a separate example, so a newline
            # inside a review must not reach the file.
            single_line = text.replace('\n', ' ')
            handle.write('{} {}\n'.format(label, single_line))


write_fasttext_file(df_train, '../../data/train_fasttext_imdb.txt')
write_fasttext_file(df_test, '../../data/test_fasttext_imdb.txt')

In [8]:
# Train a supervised fastText classifier on the exported training file,
# using 25 epochs and word bigram features.
model = fasttext.train_supervised(
    input='../../data/train_fasttext_imdb.txt',
    epoch=25,
    wordNgrams=2,
)

Read 5M words
Number of words:  281132
Number of labels: 2
Progress: 100.0% words/sec/thread: 1616309 lr:  0.000000 avg.loss:  0.134875 ETA:   0h 0m 0s100.0% words/sec/thread: 1616324 lr: -0.000002 avg.loss:  0.134875 ETA:   0h 0m 0s


In [9]:
def predict_fasttext(text):
    """Predict the numeric sentiment label for one review with ``model``.

    Parameters
    ----------
    text : str
        Raw review text; embedded newlines are allowed.

    Returns
    -------
    int or None
        0 if the top predicted label is '__label__neg', 1 if it is
        '__label__pos', and None for any other label.
    """
    # fastText's predict() handles one line at a time and raises ValueError
    # when the input contains '\n', so flatten line breaks first.
    single_line = text.replace('\n', ' ')
    label = model.predict(single_line)[0][0]
    if label == '__label__neg':
        return 0
    if label == '__label__pos':
        return 1

In [10]:
# Store the model's predicted label for every review in both splits.
for frame in (df_train, df_test):
    frame['predicted_fasttext'] = frame['text'].apply(predict_fasttext)

In [11]:
# Accuracy of the predicted labels against the true labels, per split.
train_accuracy = accuracy_score(
    y_true=df_train['label'], y_pred=df_train['predicted_fasttext']
)
test_accuracy = accuracy_score(
    y_true=df_test['label'], y_pred=df_test['predicted_fasttext']
)
print('Train Accuracy: {}, Test Accuracy: {}'.format(train_accuracy, test_accuracy))

Train Accuracy: 0.99952, Test Accuracy: 0.88652


In [12]:
# F1 score of the predicted labels against the true labels, per split.
train_f1 = f1_score(
    y_true=df_train['label'], y_pred=df_train['predicted_fasttext']
)
test_f1 = f1_score(
    y_true=df_test['label'], y_pred=df_test['predicted_fasttext']
)
print('Train f-score: {}, Test f-score: {}'.format(train_f1, test_f1))

Train f-score: 0.9995198463508322, Test f-score: 0.8853598415969612
