# Creating a FastText model with Facebook's fastText library
Source: https://github.com/facebookresearch/fastText

#### Imports and loading data

In [38]:
import pandas as pd
import numpy as np
from gensim import utils
import pickle

def load_data():
    """
    Load the train/validation/test splits and the pickled label encoder.

    Reads ``data/cs_subs_train.csv``, ``data/cs_subs_val.csv`` and
    ``data/cs_subs_test.csv`` (paths relative to the working directory).
    From each file the ``title`` column is used as the features and the
    ``subreddit`` column as the labels.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test, label_encoder)``
        where the X/y entries are pandas Series and ``label_encoder`` is
        whatever object was stored in ``pickles/label_encoder.pkl``.
    """
    train = pd.read_csv('data/cs_subs_train.csv')
    val = pd.read_csv('data/cs_subs_val.csv')
    test = pd.read_csv('data/cs_subs_test.csv')

    X_train, y_train = train['title'], train['subreddit']
    X_val, y_val = val['title'], val['subreddit']
    X_test, y_test = test['title'], test['subreddit']

    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    with open('pickles/label_encoder.pkl', 'rb') as encoder_file:
        label_encoder = pickle.load(encoder_file)

    return X_train, y_train, X_val, y_val, X_test, y_test, label_encoder

# Materialise the three splits and the label encoder as notebook-level names.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test,
 label_encoder) = load_data()

#### Stopwords from Scikit-learn's repository: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/stop_words.py

In [39]:
# Lower-case English stop words; format_for_fastext drops any token found
# in this set before writing a training line. A frozenset gives an immutable
# collection with fast membership tests.
# NOTE(review): the list contains some content-bearing words (e.g. "system",
# "interest", "fire") that may be informative for topic classification --
# confirm that removing them is intended.
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"])

#### Helper functions

In [40]:
def format_for_fastext(X, y, filename):
    """
    Write labelled text to ``data/<filename>`` in FastText's training format.

    By default, FastText looks for text data in the following format:

    __label__0 <text>
    __label__0 <text>
    __label__1 <text>
    ...

    Each title is lowercased, tokenized with gensim's ``simple_preprocess``
    and stripped of every token found in ``ENGLISH_STOP_WORDS``; the remaining
    tokens are joined with single spaces after the ``__label__<label>`` prefix.

    Parameters
    ----------
    X : iterable of str
        The texts (e.g. a pandas Series, numpy array or Python list). Every
        element must be a string; a non-string value such as NaN makes
        ``title.lower()`` raise AttributeError.
    y : iterable
        The labels, paired with ``X`` positionally; each is written via
        ``str(label)``.
    filename : str
        Name of the output file, created (or overwritten) inside the
        ``data`` directory.
    """
    prefix = '__label__'
    path = ''.join(['data/', filename])
    # The context manager closes (and flushes) the file even if a row raises.
    # The encoding is explicit because FastText reads UTF-8 text, whereas the
    # platform default encoding may differ.
    with open(path, 'w', encoding='utf-8') as f:
        for title, label in zip(X, y):
            title = title.lower()
            tokens = utils.simple_preprocess(title)
            tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]
            f.write(''.join([prefix, str(label), ' ', ' '.join(tokens), '\n']))
    
def test_fasttext(y, X, classifier, n=1):
    """
    Returns the top N accuracy for the classifier i.e., if the correct label is
    within the top N most likely labels according to the classifier.
    """
    match = []
    for true, string in zip(y, X):
        predictions = list(classifier.predict(string, n)[0])
        for i in range(n):
            predictions[i] = int(predictions[i].split('__label__')[1])
        match.append(int(true in predictions))
    
    match = np.array(match)
    return match.sum() / y.size

#### Formatting the data for FastText consumption

In [41]:
# Write the training split to data/reddit_fasttext_train.txt.
format_for_fastext(
    X=X_train,
    y=y_train,
    filename='reddit_fasttext_train.txt',
)

#### Building the model

In [42]:
# The bindings were originally importable as ``fastText``; packaged releases
# expose the same API as ``fasttext``. Try the legacy name first so existing
# environments behave exactly as before, then fall back, keeping the
# ``fastText`` name bound either way.
try:
    import fastText
except ImportError:
    import fasttext as fastText

# Train a supervised classifier on the formatted training file, using
# character n-grams of length 2 to 5 and 64-dimensional vectors.
classifier = fastText.train_supervised(input='data/reddit_fasttext_train.txt',
                                       lr=0.1,
                                       epoch=30,
                                       dim=64,
                                       minn=2,
                                       maxn=5
                                      )

In [43]:
# Report top-1 accuracy on the validation and test splits.
top1_report = (
    ('Validation Accuracy (Top 1):', y_val, X_val),
    ('Test Accuracy (Top 1):', y_test, X_test),
)
for caption, labels, titles in top1_report:
    print(caption, test_fasttext(labels, titles, classifier))

Validation Accuracy (Top 1): 0.324870255419
Test Accuracy (Top 1): 0.313116922764


In [44]:
# n=5 is passed below, so these are top-5 accuracies; label them as such.
print('Validation Accuracy (Top 5):', test_fasttext(y_val, X_val, classifier, 5))
print('Test Accuracy (Top 5):', test_fasttext(y_test, X_test, classifier, 5))

Validation Accuracy (Top 1): 0.611987381703
Test Accuracy (Top 1): 0.605372952071
