In [1]:
import os
import time

import fasttext
import pandas as pd
import torchtext
from torchtext import datasets
from torchtext.data.utils import get_tokenizer

In [2]:
# Download yelp  polarity
torchtext.utils.download_from_url(datasets.text_classification.URLS['YelpReviewPolarity'])
!tar -C .data -xvf .data/yelp_review_polarity_csv.tar.gz

yelp_review_polarity_csv/
yelp_review_polarity_csv/readme.txt
yelp_review_polarity_csv/test.csv
yelp_review_polarity_csv/train.csv


In [3]:
# The extracted CSVs are read without a header row, so the two columns
# (label, text) are named directly while reading.
train_df = pd.read_csv(
    '.data/yelp_review_polarity_csv/train.csv',
    header=None,
    names=['label', 'text'],
)
test_df = pd.read_csv(
    '.data/yelp_review_polarity_csv/test.csv',
    header=None,
    names=['label', 'text'],
)

In [4]:
# fastText recognises labels by their '__label__' prefix.
def _to_fasttext_label(label):
    """Return `label` as a string carrying exactly one '__label__' prefix.

    Values that already start with the prefix are returned unchanged, so
    re-running this cell does not produce '__label____label__1'.
    """
    label_text = str(label)
    if label_text.startswith('__label__'):
        return label_text
    return '__label__' + label_text


train_df['label'] = train_df['label'].apply(_to_fasttext_label)
test_df['label'] = test_df['label'].apply(_to_fasttext_label)

In [5]:
# Preprocess the review text with torchtext's 'basic_english' tokenizer and
# re-join the tokens with single spaces.
tokenize = get_tokenizer('basic_english')


def _normalize_text(text):
    """Tokenize `text` and glue the tokens back together with single spaces."""
    return ' '.join(tokenize(text))


train_df['text'] = train_df['text'].map(_normalize_text)
test_df['text'] = test_df['text'].map(_normalize_text)

In [6]:
# Save to files: fastText expects a file path, not a DataFrame.
# Every line of the file is read as one example ("__label__X<TAB>text"), so the
# column header row is suppressed -- otherwise "label\ttext" would be written as
# a bogus first line of both files.
os.makedirs('.cache', exist_ok=True)  # to_csv does not create missing directories
train_df.to_csv('.cache/fasttext_train.csv', sep='\t', index=False, header=False)
test_df.to_csv('.cache/fasttext_test.csv', sep='\t', index=False, header=False)

In [7]:
# Train the supervised fastText classifier on the prepared training file and
# measure how long training takes.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
model = fasttext.train_supervised(
    '.cache/fasttext_train.csv',
    dim=10,
    wordNgrams=2,
    minCount=1,
    bucket=10000000,
    epoch=5,
    lr=0.5,
)
secs = time.perf_counter() - start_time
secs

26.17220950126648

In [8]:
def print_results(N, p, r, k=1):
    """Print an evaluation summary as tab-separated lines.

    Args:
        N: number of evaluated examples.
        p: precision value, printed with three decimals.
        r: recall value, printed with three decimals.
        k: cut-off shown in the "P@k" / "R@k" row labels. Defaults to 1, which
           reproduces the previously hard-coded "P@1" / "R@1" output.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

# Evaluate the trained model on the test file and print the returned metrics.
n_examples, precision, recall = model.test('.cache/fasttext_test.csv')
print_results(n_examples, precision, recall)

N	38000
P@1	0.956
R@1	0.956
