building fastText word embedding
https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings?scriptVersionId=2456848

## Loading and cleaning the data

We define a method normalize to clean and prepare a single string. We will use it later to prepare our string data. Also, we load the data as we're used to:

In [1]:
import re
import numpy as np
import pandas as pd
from fastText import load_model

In [2]:
# The number of word slots per example: text_to_vector keeps only the last
# `window_length` words of a comment and leaves unused slots as zeros.
# Experiment with this.
window_length = 200

In [3]:
# Symbols and digits that are spelled out as space-padded words.
_SYMBOL_WORDS = {
    '&': ' and ',
    '@': ' at ',
    '0': ' zero ',
    '1': ' one ',
    '2': ' two ',
    '3': ' three ',
    '4': ' four ',
    '5': ' five ',
    '6': ' six ',
    '7': ' seven ',
    '8': ' eight ',
    '9': ' nine ',
}

# (pattern, replacement) pairs for English contractions, applied in order.
# Suffix patterns require a preceding word character and a trailing word
# boundary so that quoted words such as 'dog' or 'so' are not mistaken for
# "'d" / "'s" contractions.
_CONTRACTIONS = (
    (r"\bwhat's\b", "what is "),
    (r"'scuse\b", " excuse "),
    (r"\bcan't\b", "cannot "),
    (r"(?<=\w)n't\b", " not "),
    (r"\bi'm\b", "i am "),
    (r"(?<=\w)'s\b", " "),
    (r"(?<=\w)'ve\b", " have "),
    (r"(?<=\w)'re\b", " are "),
    (r"(?<=\w)'d\b", " would "),
    (r"(?<=\w)'ll\b", " will "),
)


def normalize(s):
    """
    Given a text, cleans and normalizes it. Feel free to add your own stuff.

    Steps, in order: lowercase; replace IPv4-looking tokens with ``_ip_``;
    expand common contractions; isolate punctuation; drop a few special
    characters; spell out ``&``, ``@`` and digits; replace every remaining
    non-word character with a space; collapse whitespace.

    Args:
        s: The raw text to normalize.

    Returns:
        The normalized text: lowercase words separated by single spaces,
        with no leading or trailing whitespace.
    """
    s = s.lower()
    # Replace ips
    s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
    # Expand contractions. This has to happen BEFORE punctuation is isolated:
    # once apostrophes are padded with spaces ("what ' s") no contraction
    # pattern can match any more.
    for pattern, replacement in _CONTRACTIONS:
        s = re.sub(pattern, replacement, s)
    # Isolate punctuation
    s = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])', r' \1 ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Replace numbers and symbols with language
    for symbol, word in _SYMBOL_WORDS.items():
        s = s.replace(symbol, word)
    # Drop everything that is not a word character, then collapse whitespace.
    s = re.sub(r'\W', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    s = s.strip(' ')
    return s

In [4]:
print('\nLoading data')
# Read both CSV files and substitute a placeholder token for missing comments
# so that every row carries a string in 'comment_text'.
train = pd.read_csv('../input/train.csv').fillna({'comment_text': '_empty_'})
test = pd.read_csv('../input/test.csv').fillna({'comment_text': '_empty_'})


Loading data


Ok next, let's load the FastText model and define methods that convert text to a sequence of vectors. Note that I'm just considering the last n words of each text. You could play with this, too.

In [5]:
# Label columns. Their order fixes the column order of the target arrays
# built from the training data and of the predictions written to the
# submission file.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [6]:
print('\nLoading FT model')
# Load the fastText model stored in the binary file below.
ft_model = load_model('../input/fastText.model.bin')
# Size of one word vector; used as the per-word feature count of every input
# array built from this model.
n_features = ft_model.get_dimension()


Loading FT model


In [7]:
def text_to_vector(text):
    """
    Given a string, normalizes it, then splits it into words and finally converts
    it to a sequence of word vectors.

    Only the last `window_length` words are kept. Shorter texts fill the
    leading rows and leave the remaining rows as zeros.

    Args:
        text: The raw text to embed.

    Returns:
        A float32 array of shape (window_length, n_features).
    """
    words = normalize(text).split()
    window = words[-window_length:]

    # Allocate float32 up front: every caller copies the result into a float32
    # array, so a float64 buffer only doubles memory and adds a second cast.
    x = np.zeros((window_length, n_features), dtype='float32')

    for i, word in enumerate(window):
        x[i, :] = ft_model.get_word_vector(word)

    return x

In [8]:
def df_to_data(df):
    """
    Embed every comment of a dataframe into one dense input array for the NN.

    Args:
        df: A dataframe with a 'comment_text' column.

    Returns:
        A float32 array of shape (len(df), window_length, n_features), where
        row k holds the word vectors of the k-th comment.
    """
    data = np.zeros((len(df), window_length, n_features), dtype='float32')
    comments = df['comment_text'].values

    for row, text in enumerate(comments):
        data[row, :] = text_to_vector(text)

    return data

To convert an input dataframe to an input array, just call df_to_data. This will result in the shape (n_examples, window_length, n_features). Here, with window_length = 200, each row holds 200 word slots of n_features values each (e.g. 300 for a 300-dimensional fastText model).

In [9]:
# Split the dataset:
# Shuffle the training data once, then hold out the last 10% for validation.
shuffled_train = train.sample(frac=1)
split_index = round(len(train) * 0.9)
df_train, df_val = (
    shuffled_train.iloc[:split_index],
    shuffled_train.iloc[split_index:],
)

# The validation set is embedded once up front; the training set is embedded
# batch by batch by the generator instead.
x_val = df_to_data(df_val)
y_val = df_val[classes].values

In [19]:
# Embed the whole test set into one array of word vectors for prediction.
x_test = df_to_data(test)

KeyError: "['toxic' 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate'] not in index"

In [10]:
def data_generator(df, batch_size):
    """
    Given a raw dataframe, generates infinite batches of FastText vectors.

    The dataframe is reshuffled at the start of every pass. A batch that is
    only partially filled when a pass ends is completed with rows from the
    next pass, so every yielded batch has exactly `batch_size` rows.

    Args:
        df: A dataframe with a 'comment_text' column and one column per entry
            of `classes`.
        batch_size: The number of examples per yielded batch; must be positive.

    Yields:
        Tuples (batch_x, batch_y) of float32 arrays with shapes
        (batch_size, window_length, n_features) and (batch_size, len(classes)).
        Each batch uses freshly allocated arrays.

    Raises:
        ValueError: On the first `next()` call, if `batch_size` is not positive
            or `df` has no rows. Either case would otherwise loop forever
            without yielding anything.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got %r' % (batch_size,))
    if len(df) == 0:
        raise ValueError('cannot generate batches from an empty dataframe')

    batch_i = 0  # Counter inside the current batch vector
    batch_x = None  # The current batch's x data
    batch_y = None  # The current batch's y data

    while True:  # Loop forever
        df = df.sample(frac=1)  # Shuffle df each epoch

        # Iterate over the underlying arrays rather than df.iterrows(): same
        # values in the same order, without building a Series per row.
        comments = df['comment_text'].values
        labels = df[classes].values

        for comment, label in zip(comments, labels):
            if batch_x is None:
                batch_x = np.zeros((batch_size, window_length, n_features), dtype='float32')
                batch_y = np.zeros((batch_size, len(classes)), dtype='float32')

            batch_x[batch_i] = text_to_vector(comment)
            batch_y[batch_i] = label
            batch_i += 1

            if batch_i == batch_size:
                # Ready to yield the batch
                yield batch_x, batch_y
                batch_x = None
                batch_y = None
                batch_i = 0

In [11]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [12]:
def build_model():
    """
    Build and compile the classifier that consumes pre-computed word vectors.

    The network is a bidirectional LSTM over the (window_length, n_features)
    input, max-pooled over time, followed by a small dense head with six
    sigmoid outputs.

    Returns:
        The compiled Keras Sequential model.
    """
    recurrent = LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    network = Sequential([
        Bidirectional(recurrent, input_shape=(window_length, n_features)),
        GlobalMaxPool1D(),
        Dense(50, activation="relu"),
        Dropout(0.1),
        # One sigmoid unit per label; 6 has to match the number of entries
        # in `classes`.
        Dense(6, activation="sigmoid"),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [13]:
# Batching: the generator embeds `batch_size` comments at a time, and one
# epoch is defined as roughly one pass over the training split.
batch_size = 128
training_generator = data_generator(df_train, batch_size)
training_steps_per_epoch = round(len(df_train) / batch_size)

# The network that will be trained on the generated batches.
model = build_model()

And now you should be good to go! Train this as usual. You don't need an Embedding layer, because the inputs are already word vectors, but you need to pass input_shape=(window_length, n_features) to the first layer in your NN.

In [17]:
# Ready to start training:
# Train from the endless batch generator. steps_per_epoch tells Keras how many
# batches count as one epoch; the pre-computed validation arrays are passed
# as validation_data.
# NOTE(review): fit_generator is deprecated in newer Keras releases, where
# model.fit accepts generators directly — confirm against the installed version.
model.fit_generator(
    training_generator,
    steps_per_epoch=training_steps_per_epoch,
    validation_data=(x_val, y_val),
    verbose=1,
    epochs=2
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2a973d160>

In [22]:
# Predict one score per label for every test comment.
y_test = model.predict([x_test], batch_size=1024, verbose=1)
# Use the sample submission as a template and overwrite its label columns
# with the predictions. Columns are assigned in the order given by `classes`.
sample_submission = pd.read_csv('../input/sample_submission.csv')
sample_submission[classes] = y_test
# Write the result without the dataframe index column.
sample_submission.to_csv('submission_fastText.csv', index=False)



pickle the model