Preprocess

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer 
from nltk.corpus import stopwords
from string import punctuation
import re
import math

In [2]:
# Load the two training sets used below: the complete one and the medical-only one.
train_original = pd.read_csv(filepath_or_buffer='train.csv')
train = pd.read_csv(filepath_or_buffer='train_medical.csv')

In [3]:
# remove english stopwords
def remove_stopwords(words, stop_words=None):
    """Lower-case every token and drop the ones that are stopwords.

    Each token is lower-cased *before* it is compared with the stopword
    collection, so capitalised stopwords (e.g. a sentence-initial "What" or
    "The") are filtered out too. Previously the raw token was compared and
    only the survivors were lower-cased, which let such words through.

    Args:
        words: iterable of string tokens.
        stop_words: optional iterable of lower-case stopwords to remove.
            When omitted, NLTK's English stopword list is used.

    Returns:
        A list with the lower-cased tokens that are not stopwords, in their
        original order.
    """
    if stop_words is None:
        # Read the NLTK list only once and keep it on the function object;
        # this function is applied to every row of the dataset.
        if not hasattr(remove_stopwords, '_english_stopwords'):
            remove_stopwords._english_stopwords = frozenset(stopwords.words('english'))
        stop_words = remove_stopwords._english_stopwords
    else:
        stop_words = frozenset(stop_words)
    lowered = (word.lower() for word in words)
    return [word for word in lowered if word not in stop_words]

# remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character
    nor whitespace deleted (underscores count as word characters and stay)."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

# preprocess and balance dataset
def Preprocess(train, random_state=None):
    """Balance the two target classes and tokenise the question text.

    The frame is split by ``target`` (1 vs. 0), the same number of rows is
    drawn at random from each class, and the two samples are stacked with a
    fresh 0..n-1 index. A ``question_text_token`` column is then added that
    holds, for every question, the punctuation-free text tokenised with
    ``word_tokenize`` and passed through ``remove_stopwords``.

    Args:
        train: DataFrame with at least the columns ``target`` and
            ``question_text``.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            balanced sample can be reproduced. ``None`` keeps the draw random.

    Returns:
        A new DataFrame containing the balanced rows plus the
        ``question_text_token`` column. ``train`` itself is not modified.
    """
    positives = train[train.target == 1]
    negatives = train[train.target == 0]
    # Use the smaller class as the sample size so sampling without replacement
    # cannot fail when there are fewer negatives than positives.
    sample_size = min(len(positives), len(negatives))
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    train_balanced = pd.concat(
        [
            positives.sample(sample_size, random_state=random_state),
            negatives.sample(sample_size, random_state=random_state),
        ]
    ).reset_index(drop=True)
    # Punctuation removal -> tokenisation -> stopword removal, one pass each.
    train_balanced['question_text_token'] = (
        train_balanced['question_text']
        .apply(remove_punctuation)
        .apply(word_tokenize)
        .apply(remove_stopwords)
    )
    return train_balanced

In [4]:
# Build the balanced, tokenised frames for the full and the medical-only data.
train_balanced_original = Preprocess(train=train_original)
train_balanced = Preprocess(train=train)

Google Word Vectors

In [5]:
# File name of the pretrained Google News word2vec vectors, resolved relative to the
# working directory. It is passed to KeyedVectors.load_word2vec_format(..., binary=True)
# when the model is loaded.
# NOTE(review): the file name suggests 300-dimensional vectors, which is the width the
# embedding matrices below assume -- confirm if a different vector file is substituted.
google_word2vec = 'GoogleNews-vectors-negative300.bin'

In [6]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
# Read the pretrained vectors from the binary word2vec file into memory.
word2vec_model = KeyedVectors.load_word2vec_format(
    fname=google_word2vec,
    binary=True,
)

In [7]:
def Vectorization(question_token):
    """Return the word2vec vectors of the tokens that the model knows.

    Tokens missing from ``word2vec_model`` are skipped; the remaining vectors
    are returned as a list in token order.
    """
    return [
        word2vec_model[token]
        for token in question_token
        if token in word2vec_model
    ]

In [8]:
def Cleaner(tokens):
    """Join the tokens that exist in ``word2vec_model`` into one
    space-separated string, dropping all other tokens."""
    return ' '.join(token for token in tokens if token in word2vec_model)

In [9]:
# Full dataset: list of word2vec vectors for every tokenised question.
train_balanced_original['question_text_vector'] = (
    train_balanced_original['question_text_token'].apply(Vectorization)
)

In [10]:
# Full dataset: space-joined string of the tokens that have a word2vec vector.
train_balanced_original['question_text_clean'] = (
    train_balanced_original['question_text_token'].apply(Cleaner)
)

In [11]:
# Medical subset: list of word2vec vectors for every tokenised question.
train_balanced['question_text_vector'] = (
    train_balanced['question_text_token'].apply(Vectorization)
)

In [12]:
# Medical subset: space-joined string of the tokens that have a word2vec vector.
train_balanced['question_text_clean'] = (
    train_balanced['question_text_token'].apply(Cleaner)
)

LSTM

In [13]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


Original Data

In [25]:
# Shuffle the balanced frame and cut it into a training part and a held-out part.
# The shuffle is seeded so that the partition -- and the train/test pickles written
# from it further down -- can be regenerated exactly on a fresh run.
train_balanced_original = (
    train_balanced_original
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Share of rows that goes to the training part; the rest becomes the test part.
fraction = 0.5
split_at = int(fraction * len(train_balanced_original))
train_balanced_original_train = train_balanced_original.iloc[:split_at]
train_balanced_original_test = train_balanced_original.iloc[split_at:]
# Labels aligned row-for-row with the training part.
Y_train = np.array(train_balanced_original_train['target'])

In [26]:
# Number of rows needed by the embedding table: one per distinct training token plus
# one for index 0, which the Keras Tokenizer never assigns to a word (its word indices
# start at 1). Without the extra row the least frequent word is cut off by `num_words`.
# str.split() with no argument is used so that a question whose cleaned text is empty
# does not contribute an empty-string "token" to the count.
unique_tokens = {
    token
    for question in train_balanced_original_train['question_text_clean']
    for token in question.split()
}
vocab_size = len(unique_tokens) + 1

In [27]:
### Create sequence
# Fit the tokenizer on the training questions only, then turn those questions into
# integer sequences of length 32.
vocabulary_size = vocab_size
train_questions = train_balanced_original_train['question_text_clean']
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(train_questions)
sequences = tokenizer.texts_to_sequences(train_questions)
data = pad_sequences(sequences, maxlen=32)

In [28]:
# Row i holds the pretrained word2vec vector of the word whose tokenizer index is i.
# Rows for which no pretrained vector exists (including row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    # Words ranked beyond the table size have no row; skip them individually instead
    # of stopping the loop, so the result does not depend on the dict's ordering.
    if index >= embedding_matrix.shape[0]:
        continue
    # Indexing the word2vec model with an unknown word raises KeyError rather than
    # returning None, so membership is checked first (as Vectorization/Cleaner do).
    if word in word2vec_model:
        embedding_matrix[index] = word2vec_model[word]

In [29]:
# Frozen word2vec embedding -> dropout -> 1-D convolution + max-pooling -> LSTM ->
# single sigmoid unit for the binary target.
model = Sequential([
    Embedding(vocab_size, 300, input_length=32, weights=[embedding_matrix], trainable=False),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(300),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
# Train for 3 epochs, holding out 40% of the training sequences for validation.
model.fit(x=data, y=Y_train, epochs=3, validation_split=0.4)

Train on 48486 samples, validate on 32324 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa3d46762b0>

In [37]:
# `sklearn.externals.joblib` is no longer shipped with recent scikit-learn releases;
# use the standalone package and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [38]:
# Persist the train/test partitions and the fitted model for the full dataset.
# NOTE(review): the Keras model is serialised through joblib (pickle) rather than
# Keras' own saving API -- confirm it loads back correctly on the Keras version in use.
joblib.dump(value=train_balanced_original_train, filename='lstm_original_train.pkl')
joblib.dump(value=train_balanced_original_test, filename='lstm_original_test.pkl')
joblib.dump(value=model, filename='LSTMBalanced_original.pkl')

['LSTMBalanced_original.pkl']

Filtered Data

In [50]:
# Shuffle the balanced medical frame and cut it into a training part and a held-out
# part. The shuffle is seeded so that the partition -- and the train/test pickles
# written from it further down -- can be regenerated exactly on a fresh run.
train_balanced = (
    train_balanced
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Share of rows that goes to the training part; the rest becomes the test part.
fraction = 0.5
split_at = int(fraction * len(train_balanced))
train_balanced_train = train_balanced.iloc[:split_at]
train_balanced_test = train_balanced.iloc[split_at:]
# Labels aligned row-for-row with the training part.
Y_train = np.array(train_balanced_train['target'])

In [51]:
# Number of rows needed by the embedding table: one per distinct training token plus
# one for index 0, which the Keras Tokenizer never assigns to a word (its word indices
# start at 1). Without the extra row the least frequent word is cut off by `num_words`.
# str.split() with no argument is used so that a question whose cleaned text is empty
# does not contribute an empty-string "token" to the count.
unique_tokens = {
    token
    for question in train_balanced_train['question_text_clean']
    for token in question.split()
}
vocab_size = len(unique_tokens) + 1

In [52]:
### Create sequence
# Fit the tokenizer on the medical training questions only, then turn those questions
# into integer sequences of length 32.
vocabulary_size = vocab_size
train_questions = train_balanced_train['question_text_clean']
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(train_questions)
sequences = tokenizer.texts_to_sequences(train_questions)
data = pad_sequences(sequences, maxlen=32)

In [53]:
# Row i holds the pretrained word2vec vector of the word whose tokenizer index is i.
# Rows for which no pretrained vector exists (including row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    # Words ranked beyond the table size have no row; skip them individually instead
    # of stopping the loop, so the result does not depend on the dict's ordering.
    if index >= embedding_matrix.shape[0]:
        continue
    # Indexing the word2vec model with an unknown word raises KeyError rather than
    # returning None, so membership is checked first (as Vectorization/Cleaner do).
    if word in word2vec_model:
        embedding_matrix[index] = word2vec_model[word]

In [54]:
# Frozen word2vec embedding -> dropout -> 1-D convolution + max-pooling -> LSTM ->
# single sigmoid unit for the binary target.
model = Sequential([
    Embedding(vocab_size, 300, input_length=32, weights=[embedding_matrix], trainable=False),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(300),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [55]:
# Train for 3 epochs, holding out 40% of the training sequences for validation.
model.fit(x=data, y=Y_train, epochs=3, validation_split=0.4)

Train on 2035 samples, validate on 1357 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa1f5b1df28>

In [61]:
# Persist the train/test partitions and the fitted model for the medical subset.
# NOTE(review): the Keras model is serialised through joblib (pickle) rather than
# Keras' own saving API -- confirm it loads back correctly on the Keras version in use.
joblib.dump(value=train_balanced_train, filename='lstm_train.pkl')
joblib.dump(value=train_balanced_test, filename='lstm_test.pkl')
joblib.dump(value=model, filename='LSTMBalanced.pkl')

['LSTMBalanced.pkl']