# Part 3 fastText

[Bag of Tricks for Efficient Text Classification](https://arxiv.org/abs/1607.01759)

## Utils

In [1]:
import pandas as pd
import numpy as np

np.random.seed(0)  # reproducibility

In [2]:
def prefix_to_csv(df, prefix='__label__', path='fasttext/train.csv'):
    """Write ``df`` as a tab-separated fastText training file.

    Each output row holds the ``text`` value followed by the label with
    ``prefix`` prepended; no header and no index are written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column and a ``label`` column. It is NOT
        modified: the prefixed label column is built on a copy.
    prefix : str
        String prepended to every label (labels are cast to ``str`` first).
    path : str, path-like or file object
        Destination handed to ``DataFrame.to_csv``. When it is a filesystem
        path, missing parent directories are created first.
    """
    import os

    # Create the target directory only for real filesystem paths; file-like
    # buffers are passed straight through to ``to_csv``.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    # ``df[['text']]`` is a new frame, so adding the column here leaves the
    # caller's DataFrame untouched.
    labelled = df[['text']].assign(prefix_label=prefix + df['label'].astype(str))
    labelled.to_csv(path, sep='\t', header=False, index=False)

In [3]:
def submit_to_scv(pred,
                  sample_csv='data/test_a_sample_submit.csv',
                  path='submissions/sub.csv'):
    """Save predictions using the sample submission file as a template.

    Reads ``sample_csv``, overwrites (or creates) its ``label`` column with
    ``pred`` and writes the result to ``path`` without the index.

    Parameters
    ----------
    pred : sequence
        Values assigned to the ``label`` column.
    sample_csv : str
        Path of the sample submission CSV to read.
    path : str
        Path of the CSV file to write.
    """
    submission = pd.read_csv(sample_csv, index_col=False)
    submission['label'] = pred
    submission.to_csv(path_or_buf=path, index=False)

## Load Data

In [4]:
%%time

# Read the tab-separated competition files. Only the first 50,000 rows of the
# training set are loaded (nrows=50000); the test file is read in full.
# index_col=False keeps pandas from using the first column as the index.
train_df = pd.read_csv('data/train_set.csv', sep='\t', nrows=50000, index_col=False)
test_df = pd.read_csv('data/test_a.csv', sep='\t', index_col=False)

CPU times: user 3.9 s, sys: 394 ms, total: 4.3 s
Wall time: 4.33 s


## Hold-out

In [5]:
HOLD_OUT = 0.2  # passed as test_size to train_test_split: share of rows held out for validation
PREFIX = '__label__'  # prepended to labels in the fastText training file, stripped again from predictions

In [6]:
from sklearn.model_selection import train_test_split

# Inputs are the raw text column; labels are cast to str.
X = train_df['text']
y = train_df['label'].astype(str)
X_test = test_df['text']

# Hold out a HOLD_OUT share of the rows for validation; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=HOLD_OUT, random_state=42)

# Re-attach the labels to the training texts and write the fastText training file.
prefix_to_csv(
    X_train.to_frame().join(y_train),
    prefix=PREFIX,
)

## fastText for Text Classification
* [Text classification tutorial](https://fasttext.cc/docs/en/supervised-tutorial.html)
* [List of options](https://fasttext.cc/docs/en/options.html)

In [7]:
!pip install fasttext



In [8]:
import fasttext
# Print the docstring of train_supervised for reference before training.
help(fasttext.train_supervised)

Help on function train_supervised in module fasttext.FastText:

train_supervised(*kargs, **kwargs)
    Train a supervised model and return a model object.
    
    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html
    
    The input file must must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.



In [9]:
%%time

# Train the supervised classifier on the file written by prefix_to_csv.
# label=PREFIX ties the label marker fastText looks for to the prefix used
# when writing the file, so changing PREFIX cannot silently leave the
# training data without recognised labels.
model = fasttext.train_supervised(
    'fasttext/train.csv',
    label=PREFIX,
    verbose=2,
    minCount=1,     # keep every word, however rare
    wordNgrams=2,   # use word bigrams in addition to unigrams
    lr=1.0,
    epoch=25,
    loss='hs'       # hierarchical softmax
)

CPU times: user 8min 45s, sys: 7.55 s, total: 8min 52s
Wall time: 1min 29s


In [10]:
from sklearn.metrics import f1_score

# Take the top predicted label for every validation text and strip the
# fastText label prefix so it is comparable with y_val.
y_pred = [
    model.predict(text)[0][0].replace(PREFIX, '')
    for text in X_val
]

# Macro-averaged F1 on the hold-out set.
macro_f1 = f1_score(y_val, y_pred, average='macro')
print(macro_f1)

0.883733902957938
