# Creation of FastText model using Keras

#### Imports and loading data

In [183]:
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras import callbacks
import keras.backend as K
from keras.models import load_model

def load_data():
    """Read the pre-split train/validation/test CSVs and return their columns.

    Every file is read from the relative ``data/`` directory and must contain
    a ``title`` column and a ``subreddit`` column.

    Returns:
        tuple: ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where each
        ``X_*`` is the ``title`` column and each ``y_*`` is the ``subreddit``
        column of the corresponding file.
    """
    csv_paths = (
        'data/cs_subs_train.csv',
        'data/cs_subs_val.csv',
        'data/cs_subs_test.csv',
    )
    # All three files are read before any column is accessed.
    train_df, val_df, test_df = [pd.read_csv(path) for path in csv_paths]

    return (
        train_df['title'], train_df['subreddit'],
        val_df['title'], val_df['subreddit'],
        test_df['title'], test_df['subreddit'],
    )

# Load the pre-split title / subreddit columns into notebook-level variables.
X_train, y_train, X_val, y_val, X_test, y_test = load_data()

The n-gram helpers below (`create_ngram_set`, `add_ngram`) are adapted from the Keras fastText example: https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py

In [74]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


def create_ngram_set(input_list, ngram_value=2):
    """
    Collect the distinct n-grams occurring in a list of integers.

    Each n-gram is a tuple of `ngram_value` consecutive items. The result is
    a set, so repeated n-grams collapse and the element order is arbitrary.
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    # One suffix of the input per offset 0..n-1; zipping the suffixes lines up
    # item i of each one into the n-gram that starts at position i.
    shifted_suffixes = []
    for offset in range(ngram_value):
        shifted_suffixes.append(input_list[offset:])
    return set(zip(*shifted_suffixes))

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    For every start position in a sequence, each n-gram of size
    2..`ngram_range` that fits inside the sequence is looked up in
    `token_indice`; the ids of the n-grams that are found are appended to a
    copy of the sequence (position-major order, shorter n-grams first).
    The input sequences are not modified.

    Args:
        sequences: list of lists of integer token ids.
        token_indice: dict mapping n-gram tuples to their integer id.
        ngram_range: largest n-gram size to look up (values below 2 append
            nothing).

    Returns:
        A new list of lists: each original sequence followed by the ids of
        its known n-grams.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 2018, 42]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        seq_len = len(input_list)
        # Visit every position that can start at least a bi-gram. Bounding the
        # positions by `ngram_range` instead would skip the shorter n-grams
        # near the end of the sequence (and every n-gram of a sequence shorter
        # than `ngram_range`).
        for i in range(seq_len - 1):
            for ngram_value in range(2, ngram_range + 1):
                if i + ngram_value > seq_len:
                    # Larger n-grams no longer fit at this position; a
                    # truncated slice must not be mistaken for a shorter one.
                    break
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

def create_indices(X, ngram_range, max_features):
    """
    Assign an integer id to every distinct n-gram found in `X`.

    N-grams of size 2..`ngram_range` are collected from each sequence and
    numbered consecutively starting at `max_features + 1`, i.e. above the
    range `max_features` describes.

    Args:
        X: iterable of integer-id sequences.
        ngram_range: largest n-gram size to index.
        max_features: current vocabulary size (before n-grams are added).

    Returns:
        tuple: `(token_indice, max_features)` where `token_indice` maps each
        n-gram tuple to its id and `max_features` is the highest assigned id
        plus one. When no n-gram exists at all (e.g. `ngram_range < 2` or
        every sequence has fewer than two items) the mapping is empty and
        `max_features` is returned unchanged instead of raising.
    """
    ngram_set = set()
    for input_list in X:
        for ngram_value in range(2, ngram_range + 1):
            ngram_set.update(create_ngram_set(input_list, ngram_value=ngram_value))

    if not ngram_set:
        # No ids were allocated, so the vocabulary size does not grow.
        return {}, max_features

    start_index = max_features + 1
    token_indice = {ngram: offset + start_index for offset, ngram in enumerate(ngram_set)}

    # Ids run from start_index to start_index + len - 1, so "highest id + 1"
    # is simply start_index + len; no reverse mapping is needed to find it.
    max_features = start_index + len(token_indice)

    return token_indice, max_features

def preprocess(X, y, train=False, ngram_range=2, max_len=30, max_features=1000, **kwargs):
    """Turn raw texts and labels into padded id sequences and binarized labels.

    With `train=True` a Tokenizer (limited to `max_features` words), a
    LabelEncoder and a LabelBinarizer are fitted on the inputs, and, when
    `ngram_range > 1`, an n-gram index is built from the tokenized texts.
    With `train=False` the already-fitted objects are taken from `kwargs`.

    Args:
        X: texts to tokenize.
        y: labels matching `X`.
        train: fit new processors (True) or reuse the ones in `kwargs` (False).
        ngram_range: largest n-gram size appended to each sequence; values
            of 1 or less skip the n-gram step.
        max_len: length the sequences are padded to.
        max_features: `num_words` for the Tokenizer; only used when
            `train=True`.
        **kwargs: when `train=False`, must hold 'tokenizer', 'label_encoder'
            and 'label_binarizer', plus 'token_indice' if `ngram_range > 1`.

    Returns:
        `(X, y)` when `train=False`; otherwise
        `(X, y, tokenizer, label_encoder, label_binarizer, token_indice,
        max_features)`, where `token_indice` is None if no n-gram index was
        built and `max_features` is the value returned by `create_indices`
        (or the input value when the n-gram step is skipped).
    """
    token_indice = None

    if not train:
        # Inference path: reuse the processors fitted on the training data.
        tokenizer = kwargs['tokenizer']
        label_encoder = kwargs['label_encoder']
        label_binarizer = kwargs['label_binarizer']

        encoded_labels = label_encoder.transform(y)
        y = label_binarizer.transform(encoded_labels)
    else:
        tokenizer = Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(X)

        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(y)

        label_binarizer = LabelBinarizer()
        y = label_binarizer.fit_transform(encoded_labels)

    id_sequences = tokenizer.texts_to_sequences(X)

    if ngram_range > 1:
        if train:
            # The n-gram vocabulary is derived from the training texts only.
            token_indice, max_features = create_indices(id_sequences, ngram_range, max_features)
        else:
            token_indice = kwargs['token_indice']

        id_sequences = add_ngram(id_sequences, token_indice, ngram_range)

    padded = sequence.pad_sequences(id_sequences, maxlen=max_len)

    if not train:
        return padded, y
    return padded, y, tokenizer, label_encoder, label_binarizer, token_indice, max_features

In [122]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def top_n_accuracy(y_true, probs, n=5):
    """Fraction of samples whose true class is among the `n` highest scores.

    Args:
        y_true: array with one row per sample; the position of the row's
            maximum is taken as the true class index.
        probs: per-sample score rows, aligned with `y_true`.
        n: how many top-scoring classes count as a hit.

    Returns:
        Number of hits divided by `y_true.shape[0]`.
    """
    true_classes = [np.argmax(label_row) for label_row in y_true]

    # Negating the scores makes the ascending argsort rank them best-first.
    best_classes = [np.argsort(-score_row)[:n] for score_row in probs]

    hits = [
        int(true_class in candidates)
        for true_class, candidates in zip(true_classes, best_classes)
    ]
    return np.sum(hits) / y_true.shape[0]

##### Splitting data into train (60%), val (20%), and test (20%) and preprocessing

In [145]:
# The train/val/test partitions come from load_data() (defined in the first
# cell). This cell used to call `train_test_split(X, y, ...)`, but neither
# `train_test_split` nor `X`/`y` is imported or defined anywhere in the
# notebook, so it failed on a fresh kernel. Reloading the raw splits here also
# keeps the cell safe to re-run, since the names are overwritten with
# preprocessed arrays further down.
# NOTE(review): assumes the CSVs hold the intended 60/20/20 partitions - confirm.
X_train, y_train, X_val, y_val, X_test, y_test = load_data()
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

ngram_range = 2       # append bi-grams to every sequence
max_features = 10000  # Tokenizer vocabulary limit; replaced below by the size including n-gram ids
max_len = 30          # padded sequence length

# Fit the tokenizer, label processors and n-gram index on the training split only.
X_train, y_train, tokenizer, label_encoder, label_binarizer, token_indice, max_features = preprocess(
    X_train, y_train, train=True, ngram_range=ngram_range, max_features=max_features, max_len=max_len)

# Fitted processors, passed to preprocess() as keyword arguments for the other splits.
processors = {
    'tokenizer': tokenizer,
    'label_binarizer': label_binarizer,
    'label_encoder': label_encoder,
    'token_indice': token_indice
}

X_val, y_val = preprocess(
    X_val, y_val, ngram_range=ngram_range, max_len=max_len, **processors)

X_test, y_test = preprocess(
    X_test, y_test, ngram_range=ngram_range, max_len=max_len, **processors)

print('max_features:', max_features)

(59364,)
(19788,)
(19789,)
(59364,)
(19788,)
(19789,)
max_features: 229413


In [154]:
batch_size = 100
embedding_dims = 100
epochs = 1000  # upper bound only; EarlyStopping below ends training sooner

# fastText-style classifier: embed every token / n-gram id, average the
# embeddings over the sequence, then classify with a single softmax layer.
model = Sequential()

model.add(Embedding(max_features, embedding_dims, input_length=X_train.shape[1]))
model.add(GlobalAveragePooling1D())
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Callbacks
# Stop once val_loss has not improved by at least min_delta for `patience` epochs.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', 
                                         min_delta=0.001,
                                         patience=5,
                                         mode='min')

# Keep only the weights with the best val_loss on disk.
# NOTE(review): presumably the `models/` directory must already exist - confirm.
get_best = callbacks.ModelCheckpoint(monitor='val_loss',
                                     filepath='models/keras_fasttext.hdf5',
                                     save_best_only=True)

# ReduceLROnPlateau sets new_lr = max(lr * factor, min_lr). With a factor this
# small the product is far below min_lr, so the first plateau clamps the rate
# to min_lr (assuming the optimizer starts above min_lr - TODO confirm).
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss',
                                        patience=1,
                                        factor=0.00001,
                                        min_lr=0.0001)

# validation_data is passed as a tuple, the form documented for fit().
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping, get_best, reduce_lr],
          validation_data=(X_val, y_val))

# Replace the last-epoch weights with the best checkpoint saved by ModelCheckpoint.
model = load_model('models/keras_fasttext.hdf5')

Train on 59364 samples, validate on 19789 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000


In [155]:
# The model ends in a softmax layer, so predict() already returns the
# per-class probabilities that predict_proba() used to give; predict() is
# the call that remains available in newer Keras releases.
probs = model.predict(X_val)

# Top-5 accuracy on the validation split (default n=5).
top_n_accuracy(y_val, probs)

0.77785638486027586