In [5]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score
import gensim
# Load the pre-trained GoogleNews word vectors (binary word2vec format).
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin',
    binary=True,
)




In [7]:
def load_data(filepath, encoding='utf-8'):
    """Read a tab-separated train/test file into a DataFrame.

    The first line of the file is a header and is never parsed as data. When
    its first whitespace-separated token is ``label`` the file is treated as a
    training file and the first column is returned as ``label``; otherwise the
    first column is returned as ``id``.

    Every following non-blank line is split on tabs into an integer, a
    ``comment`` and a ``parent_comment``. Columns after the third are ignored,
    and missing trailing text columns are filled with empty strings.

    Args:
        filepath: Path of the TSV file to read.
        encoding: Text encoding used to open the file.

    Returns:
        pandas.DataFrame with the columns ``label`` (or ``id``), ``comment``
        and ``parent_comment``, one row per non-blank data line.

    Raises:
        ValueError: If the first column of a data line is not an integer.
    """
    id_label = []
    comment = []
    parent_comment = []
    with open(filepath, encoding=encoding) as f:
        header_tokens = f.readline().split()
        # An empty file has no header tokens; treat it as an (empty) test file
        # instead of failing on header_tokens[0].
        is_train = bool(header_tokens) and header_tokens[0] == 'label'
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no row.
                continue
            row = line.split("\t")
            # strip() also removes trailing tabs, so a row whose last text
            # column(s) are empty comes back with fewer than three fields.
            row += [''] * (3 - len(row))
            id_label.append(int(row[0]))
            comment.append(row[1])
            parent_comment.append(row[2])
    first_column = 'label' if is_train else 'id'
    return pd.DataFrame(data={first_column: id_label, 'comment': comment, 'parent_comment': parent_comment})
    
# Load the training split and report how many rows were read.
dataframe = load_data('../data/train.tsv')
print(dataframe.shape[0])

53032


In [58]:
# Hyperparameters shared by the tokenizer, the embedding layer and model.fit.
MAX_SEQUENCE_LENGTH = 540  # length every tokenized comment is padded to (pad_sequences maxlen)
MAX_VOCAB_SIZE = 30000     # passed to the Tokenizer as num_words (vocabulary cap)
EMBEDDING_DIM = 300        # number of columns of the embedding matrix
VALIDATION_SPLIT = 0.2     # fraction of the training data model.fit holds out for validation
BATCH_SIZE = 128
EPOCHS = 10


# Training data: keep only the columns used for modelling and drop rows with
# missing values. The calls are chained instead of using dropna(inplace=True)
# on a column selection: the in-place form mutates a derived frame, which
# pandas may flag with SettingWithCopyWarning, and offers no benefit.
dataframe = load_data('../data/train.tsv')[["label", "comment"]].dropna()

# read test file
# Missing comments are replaced with empty strings rather than dropped, so the
# number of test rows is unchanged.
test_dataframe = load_data('../data/test.tsv')[["id", "comment"]].fillna("")

print('training set size:', len(dataframe))
print('test set size:', len(test_dataframe))




training set size: 53032
test set size: 17719


In [51]:
# Raw comment strings and their labels, taken from the training frame.
data = np.array(dataframe['comment'])
target_list = list(dataframe['label'])

print(len(data))
print(len(target_list))

target = pd.Series(target_list)

# Build the vocabulary from the training comments, then
# convert the sentences (strings) into integers
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

53032
53032


In [52]:
# Length statistics of the tokenized comments (lengths are computed once and sorted).
s = sorted(map(len, sequences))
print("max sequence length:", max(s))
print("min sequence length:", min(s))
print("median sequence length:", s[len(s) // 2])


# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))


# pad sequences so that we get a N x T matrix
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)





max sequence length: 540
min sequence length: 0
median sequence length: 9
Found 36383 unique tokens.
Shape of data tensor: (53032, 540)


In [55]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
# Row i receives the pre-trained vector of the word whose tokenizer index is i.
# Rows for words that are not in word2vec are left as zeros.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    # Only indices that fit into the matrix are filled. The membership test
    # `word in word2vec` works on both gensim 3.x and 4.x, whereas the
    # `word2vec.vocab` attribute was removed in gensim 4.
    if i < num_words and word in word2vec:
        embedding_matrix[i] = word2vec[word]
# load pre-trained word embeddings into an Embedding layer

Filling pre-trained embeddings...


In [59]:
# Embedding layer initialised from the pre-trained matrix.
# trainable=False keeps the embeddings fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)


print('Building model...')

Building model...


In [None]:
# 1D convnet: two Conv1D + MaxPooling1D stages, a third Conv1D followed by
# global max pooling, then a dense layer and a single sigmoid output unit.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=3)(x)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=3)(x)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(units=128, activation='relu')(x)
output = Dense(units=1, activation='sigmoid')(x)

model = Model(inputs=input_, outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print('Training model...')
# Fit on the padded sequences; model.fit holds out VALIDATION_SPLIT of the
# data for validation and returns the per-epoch history in `r`.
r = model.fit(
    x=data,
    y=target,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)


# plot some data
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

# accuracies
# Older Keras releases log the 'accuracy' metric under the key 'acc', newer
# ones under 'accuracy'; look up whichever key this history actually contains
# instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in r.history else 'accuracy'
plt.plot(r.history[acc_key], label='acc')
plt.plot(r.history['val_' + acc_key], label='val_acc')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()


Training model...
Train on 42425 samples, validate on 10607 samples
Epoch 1/10