In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

# Loading in the datasets

In [2]:
# Read the training and test CSVs (UTF-8) from the local input/ directory
train_df, test_df = (
    pd.read_csv(csv_path, encoding = 'utf-8')
    for csv_path in ('input/train.csv', 'input/test.csv')
)
#sample_submission = pd.read_csv('../input/sample_submission.csv')

# Text Processing

In [3]:
import utils

# Pattern matching every character that is not an ASCII letter or a space
# (presumably utils.text_processing strips the matches -- confirm in utils)
regex = r"[^a-zA-Z ]"

# Run the same cleaning over the training comments and then the test comments
train_processed_comments, test_processed_comments = (
    utils.text_processing(frame.comment_text, regex = regex)
    for frame in (train_df, test_df)
)

In [5]:
# Every column of the training frame except the identifier and the raw text is a target label
target_cols = list(
    filter(lambda name: name not in ['id', 'comment_text'], train_df.columns)
)

In [6]:
# Build the character-to-index dictionaries and calculate the number of characters retained.
# The mappings are derived from the training comments only; the same char2idx is reused
# further down to encode the test comments and the augmented comments.
# NOTE(review): idx2char is presumably the inverse of char2idx -- confirm in utils.build_idx.
num_chars, char2idx, idx2char = utils.build_idx(train_processed_comments)

In [7]:
# Input size limits handed to utils.comments_to_idx and to the model builder:
# nb_sent = maximum number of sentences per comment, max_len = maximum sentence length
nb_sent, max_len = 4, 80

In [8]:
# Encode the processed comments as index matrices, training set first and test set second,
# using the same size limits and the same character index for both
train_X, test_X = map(
    lambda comments: utils.comments_to_idx(comments, nb_sent, max_len, char2idx),
    (train_processed_comments, test_processed_comments),
)

# Data Augmentation
Data augmentation is applied to the text data. Since the model processes text at the character level, three augmentation techniques are used to imitate the typos and misspellings observed in the dataset.

## Three augmentation techniques
1. Adding characters
2. Removing characters
3. Replacing characters in the string with random characters

The three augmentation techniques are applied randomly to each sentence with a specified probability.

In [9]:
# Specify the characters that can be used to augment the text data.
# The list is built straight from the string: the letters are already unique, and a
# round-trip through set() would order them differently on every interpreter start
# (str hashes are randomized per process), which would defeat any attempt to make the
# augmentation reproducible with a fixed random seed.
replace_set = list('abcdefghijklmnopqrstuvwxyz')
# Create the augmentation object from the allowed character list.
# NOTE(review): replace_set is presumably the pool that inserted/replacement characters
# are drawn from -- confirm in utils.augmentation.
aug = utils.augmentation(replace_set)

In [10]:
# Augment every sentence of every training comment. 0.05 is passed straight through to
# aug.data_augmentation (presumably the probability of an edit -- confirm in utils).
augmented_comment = train_processed_comments.apply(
    lambda sentences: list(
        map(lambda sent: aug.data_augmentation(sent, 0.05), sentences)
    )
)
# Encode the augmented comments with the same limits and character index as the originals
augment_X = utils.comments_to_idx(augmented_comment, nb_sent, max_len, char2idx)

In [11]:
# Append augmented data to training set (original rows first, augmented copies after them)
augment_train_X = np.append(train_X, augment_X, axis = 0)

# Get the target data as a NumPy array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values returns the same array and exists on every pandas version.
train_y = train_df[target_cols].values

# Double the target data to match the augmented dataset size: each augmented comment
# keeps the labels of the comment it was derived from, in the same row order.
augment_train_y = np.append(train_y, train_y, axis = 0)

# Calling the LSTM models

In [12]:
import models

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [19]:
# Instantiate the model builder with one output per target column;
# get_model() is called on this object below to create the actual networks.
charLSTM_models = models.charLSTM(num_outputs = len(target_cols))

In [20]:
# Channels are analogous to n-grams over the character embeddings. Each entry is a
# (size, 128) pair -- presumably (window width, number of filters); confirm in models.charLSTM.
# The leading 16 and the 0.5 are passed positionally to get_model
# (presumably embedding size and dropout rate -- TODO confirm).

# First character-LSTM model: three channels
channels = [(width, 128) for width in (1, 2, 5)]
charLSTM1 = charLSTM_models.get_model(
    16, num_chars, max_len, nb_sent, channels, 0.5, highway = False
)

# Second character-LSTM model: five channels (`channels` is left holding this list)
channels = [(width, 128) for width in (1, 2, 5, 7, 9)]
charLSTM2 = charLSTM_models.get_model(
    16, num_chars, max_len, nb_sent, channels, 0.5, highway = False
)

In [21]:
# Compile settings shared by both models
compile_params = dict(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

# Apply the identical settings to each model
charLSTM1.compile(**compile_params)
charLSTM2.compile(**compile_params)

In [23]:
# Train the two models with different number of channels
# Only train for two epochs because the dataset is fairly prone to overfitting
# NOTE(review): augment_train_X is the original rows followed by their augmented copies, and
# Keras documents validation_split as holding out the *last* fraction of the samples (taken
# before shuffling). The 10% validation slice therefore consists of augmented versions of
# comments whose un-augmented originals are in the training portion, so the reported
# validation metrics are probably optimistic. Consider splitting train/validation before
# augmenting -- confirm against the Keras version in use.
for i, model in enumerate([charLSTM1, charLSTM2]):
    print("Fitting model {}...".format(i + 1))
    # Both models are fitted on the same arrays with the same batch size and epoch count
    model.fit(augment_train_X, augment_train_y, batch_size = 32, epochs = 2, validation_split=0.1)

Fitting model 1...
Train on 287227 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
Fitting model 2...
Train on 287227 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2


In [24]:
# Saving the weights of the best model
# (charLSTM2, the five-channel model, is the one used for the test predictions below;
# the file is written to the current working directory)
charLSTM2.save_weights('charLSTM.h5')

In [25]:
#charLSTM2.load_weights('charLSTM.h5')

# Generating predictions for Kaggle submissions

In [26]:
import os

# Score the encoded test comments with the five-channel model
predict_test = charLSTM2.predict(test_X)

# One prediction column per target label, in the same order the labels were used for training
submission = pd.DataFrame(data = predict_test, columns = target_cols)

# Attach the comment ids from the test set
submission['id'] = test_df['id']

# Put the id column first, followed by the target columns
submission = submission[['id'] + target_cols]

# Make sure the output directory exists so the write cannot fail after the long training run
os.makedirs('output', exist_ok = True)
submission.to_csv('output/submission.csv', index = False)