In [162]:
from __future__ import division, print_function

import collections
import re

import nltk
import numpy as np
import pandas as pd
from keras.layers.core import Activation, Dense, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence

# Read the training frame (sentences + labels) and the test frame from disk.
train = pd.read_csv('~/Downloads/train.csv')
test = pd.read_csv('~/Downloads/test.csv')

# Report the (rows, columns) shape of each frame, one per line.
print(train.shape, test.shape, sep="\n")

(5668, 2)
(1418, 1)


In [163]:
# Pull out the columns the later cells work with.
x_train, y_train = train['sentence'], train['label']
x_test = test['sentence']

nltk.download('punkt')  # tokenizer data used by nltk.word_tokenize

# Scan every sentence (train followed by test) and collect:
#   maxlen     - token count of the longest sentence
#   word_freqs - number of occurrences of each lower-cased token
#   num_recs   - number of sentences scanned
word_freqs = collections.Counter()
maxlen = 0
num_recs = 0
for sentence in train['sentence'].tolist() + test['sentence'].tolist():
    # Anything that is not an ASCII letter becomes a space before tokenizing.
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    words = nltk.word_tokenize(sentence.lower())
    maxlen = max(maxlen, len(words))
    word_freqs.update(words)
    num_recs += 1

print("maxlen: %d, vocab size: %d" % (maxlen, len(word_freqs)))

[nltk_data] Downloading package punkt to /Users/mingxie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


maxlen: 38, vocab size: 2121


In [164]:
MAX_FEATURES = 2200
MAX_SENTENCE_LENGTH = 40

# Reserved ids: PAD = 0, UNK = 1.
# Every id is used as a row index into an embedding table with MAX_FEATURES
# rows, so it has to lie in [0, MAX_FEATURES); a negative UNK id does not.
vocab = {"PAD": 0, "UNK": 1}
reverse_vocab = {token_id: token for token, token_id in vocab.items()}

# Real words take the ids after the reserved ones, most frequent first, and
# the number of words kept is capped so the largest id is MAX_FEATURES - 1.
first_word_id = len(vocab)
ranked_words = word_freqs.most_common(MAX_FEATURES - first_word_id)
for word_id, (word, _) in enumerate(ranked_words, first_word_id):
    vocab[word] = word_id
    reverse_vocab[word_id] = word

In [165]:
# Encode every training sentence as a list of vocabulary ids
unk_id = vocab["UNK"]  # id used for tokens that are missing from vocab
n_train = x_train.shape[0]
X = np.empty((n_train, ), dtype=list)
for i in range(n_train):
    # Same cleaning as the vocabulary scan: letters only, lower-cased.
    cleaned = re.sub("[^a-zA-Z]", " ", x_train[i]).lower()
    X[i] = [vocab.get(token, unk_id) for token in nltk.word_tokenize(cleaned)]

# Bring every id list to exactly MAX_SENTENCE_LENGTH entries.
X_train = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)

In [166]:
# Binary sentence classifier: embedding -> LSTM -> one sigmoid unit.
model = Sequential()
model.add(Embedding(MAX_FEATURES, 128, input_length=MAX_SENTENCE_LENGTH))
# Keras 2 no longer supports `dropout=` on Embedding (it warns and discards
# the argument); a SpatialDropout1D layer placed right after the embedding is
# the replacement that warning recommends.
model.add(SpatialDropout1D(0.2))
# Keras 2 names for the former `dropout_W` / `dropout_U` arguments.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# validation_split=0.2 keeps 20% of the training rows out of the weight updates
# and reports loss/accuracy on them after each epoch.
history = model.fit(X_train, y_train, batch_size=32, epochs=5, shuffle=True, validation_split=0.2)

  
  This is separate from the ipykernel package so we can avoid doing imports until


Train on 4534 samples, validate on 1134 samples
Epoch 1/5


  32/4534 [..............................] - ETA: 5:11 - loss: 0.6981 - acc: 0.3125

  64/4534 [..............................] - ETA: 2:39 - loss: 0.6939 - acc: 0.4688

  96/4534 [..............................] - ETA: 1:48 - loss: 0.6885 - acc: 0.5729

 128/4534 [..............................] - ETA: 1:23 - loss: 0.6860 - acc: 0.5938

 160/4534 [>.............................] - ETA: 1:07 - loss: 0.6816 - acc: 0.6188

 192/4534 [>.............................] - ETA: 57s - loss: 0.6771 - acc: 0.6302 

 224/4534 [>.............................] - ETA: 50s - loss: 0.6734 - acc: 0.6339

 256/4534 [>.............................] - ETA: 45s - loss: 0.6700 - acc: 0.6367

 288/4534 [>.............................] - ETA: 40s - loss: 0.6736 - acc: 0.6215

 320/4534 [=>............................] - ETA: 37s - loss: 0.6711 - acc: 0.6250

 352/4534 [=>............................] - ETA: 34s - loss: 0.6702 - acc: 0.6193

 384/4534 [=>............................] - ETA: 31s - loss: 0.6630 - acc: 0.6250

 416/4534 [=>............................] - ETA: 29s - loss: 0.6641 - acc: 0.6202

 448/4534 [=>............................] - ETA: 28s - loss: 0.6650 - acc: 0.6161

 480/4534 [==>...........................] - ETA: 26s - loss: 0.6679 - acc: 0.6042

 512/4534 [==>...........................] - ETA: 25s - loss: 0.6670 - acc: 0.5996

 544/4534 [==>...........................] - ETA: 24s - loss: 0.6622 - acc: 0.6029

 576/4534 [==>...........................] - ETA: 23s - loss: 0.6609 - acc: 0.5990

 608/4534 [===>..........................] - ETA: 22s - loss: 0.6588 - acc: 0.6003

 640/4534 [===>..........................] - ETA: 21s - loss: 0.6577 - acc: 0.5953

 672/4534 [===>..........................] - ETA: 20s - loss: 0.6566 - acc: 0.5923

 704/4534 [===>..........................] - ETA: 19s - loss: 0.6543 - acc: 0.5952

 736/4534 [===>..........................] - ETA: 19s - loss: 0.6524 - acc: 0.5965

 768/4534 [====>.........................] - ETA: 18s - loss: 0.6518 - acc: 0.5951

 800/4534 [====>.........................] - ETA: 17s - loss: 0.6487 - acc: 0.6000

 832/4534 [====>.........................] - ETA: 17s - loss: 0.6462 - acc: 0.6058

 864/4534 [====>.........................] - ETA: 16s - loss: 0.6427 - acc: 0.6123

 896/4534 [====>.........................] - ETA: 16s - loss: 0.6387 - acc: 0.6194

 928/4534 [=====>........................] - ETA: 16s - loss: 0.6362 - acc: 0.6239

 960/4534 [=====>........................] - ETA: 15s - loss: 0.6326 - acc: 0.6302

 992/4534 [=====>........................] - ETA: 15s - loss: 0.6282 - acc: 0.6371

1024/4534 [=====>........................] - ETA: 15s - loss: 0.6266 - acc: 0.6377

1056/4534 [=====>........................] - ETA: 15s - loss: 0.6218 - acc: 0.6458



























































































































































































































Epoch 2/5
  32/4534 [..............................] - ETA: 14s - loss: 0.0268 - acc: 1.0000

  64/4534 [..............................] - ETA: 13s - loss: 0.0280 - acc: 1.0000

  96/4534 [..............................] - ETA: 12s - loss: 0.0239 - acc: 1.0000

 128/4534 [..............................] - ETA: 12s - loss: 0.0232 - acc: 1.0000

 160/4534 [>.............................] - ETA: 11s - loss: 0.0252 - acc: 1.0000

 192/4534 [>.............................] - ETA: 11s - loss: 0.0256 - acc: 1.0000

 224/4534 [>.............................] - ETA: 11s - loss: 0.0294 - acc: 0.9955

 256/4534 [>.............................] - ETA: 11s - loss: 0.0311 - acc: 0.9922

 288/4534 [>.............................] - ETA: 11s - loss: 0.0303 - acc: 0.9931

 320/4534 [=>............................] - ETA: 13s - loss: 0.0324 - acc: 0.9906

 352/4534 [=>............................] - ETA: 13s - loss: 0.0306 - acc: 0.9915

 384/4534 [=>............................] - ETA: 13s - loss: 0.0337 - acc: 0.9896

 416/4534 [=>............................] - ETA: 13s - loss: 0.0341 - acc: 0.9880

 448/4534 [=>............................] - ETA: 13s - loss: 0.0335 - acc: 0.9888

 480/4534 [==>...........................] - ETA: 12s - loss: 0.0321 - acc: 0.9896

 512/4534 [==>...........................] - ETA: 12s - loss: 0.0316 - acc: 0.9902

 544/4534 [==>...........................] - ETA: 13s - loss: 0.0304 - acc: 0.9908

 576/4534 [==>...........................] - ETA: 13s - loss: 0.0306 - acc: 0.9913

 608/4534 [===>..........................] - ETA: 12s - loss: 0.0313 - acc: 0.9901

 640/4534 [===>..........................] - ETA: 12s - loss: 0.0312 - acc: 0.9906

 672/4534 [===>..........................] - ETA: 12s - loss: 0.0302 - acc: 0.9911

 704/4534 [===>..........................] - ETA: 12s - loss: 0.0301 - acc: 0.9915

 736/4534 [===>..........................] - ETA: 12s - loss: 0.0294 - acc: 0.9918

 768/4534 [====>.........................] - ETA: 12s - loss: 0.0284 - acc: 0.9922

 800/4534 [====>.........................] - ETA: 12s - loss: 0.0277 - acc: 0.9925

 832/4534 [====>.........................] - ETA: 12s - loss: 0.0279 - acc: 0.9916

 864/4534 [====>.........................] - ETA: 11s - loss: 0.0281 - acc: 0.9907

 896/4534 [====>.........................] - ETA: 11s - loss: 0.0275 - acc: 0.9911

 928/4534 [=====>........................] - ETA: 12s - loss: 0.0272 - acc: 0.9914

 960/4534 [=====>........................] - ETA: 12s - loss: 0.0268 - acc: 0.9917

 992/4534 [=====>........................] - ETA: 12s - loss: 0.0262 - acc: 0.9919

1024/4534 [=====>........................] - ETA: 12s - loss: 0.0258 - acc: 0.9922

1056/4534 [=====>........................] - ETA: 12s - loss: 0.0251 - acc: 0.9924



























































































































































































































Epoch 3/5
  32/4534 [..............................] - ETA: 8s - loss: 0.0193 - acc: 1.0000

  64/4534 [..............................] - ETA: 8s - loss: 0.0141 - acc: 1.0000

  96/4534 [..............................] - ETA: 8s - loss: 0.0170 - acc: 1.0000

 128/4534 [..............................] - ETA: 8s - loss: 0.0204 - acc: 1.0000

 160/4534 [>.............................] - ETA: 8s - loss: 0.0174 - acc: 1.0000

 192/4534 [>.............................] - ETA: 8s - loss: 0.0170 - acc: 1.0000

 224/4534 [>.............................] - ETA: 8s - loss: 0.0159 - acc: 1.0000

 256/4534 [>.............................] - ETA: 8s - loss: 0.0148 - acc: 1.0000

 288/4534 [>.............................] - ETA: 8s - loss: 0.0148 - acc: 1.0000

 320/4534 [=>............................] - ETA: 8s - loss: 0.0168 - acc: 1.0000

 352/4534 [=>............................] - ETA: 8s - loss: 0.0186 - acc: 1.0000

 384/4534 [=>............................] - ETA: 8s - loss: 0.0180 - acc: 1.0000

 416/4534 [=>............................] - ETA: 8s - loss: 0.0171 - acc: 1.0000

 448/4534 [=>............................] - ETA: 7s - loss: 0.0165 - acc: 1.0000

 480/4534 [==>...........................] - ETA: 8s - loss: 0.0161 - acc: 1.0000

 512/4534 [==>...........................] - ETA: 8s - loss: 0.0155 - acc: 1.0000

 544/4534 [==>...........................] - ETA: 7s - loss: 0.0156 - acc: 1.0000

 576/4534 [==>...........................] - ETA: 7s - loss: 0.0150 - acc: 1.0000

 608/4534 [===>..........................] - ETA: 7s - loss: 0.0150 - acc: 1.0000

 640/4534 [===>..........................] - ETA: 7s - loss: 0.0145 - acc: 1.0000

 672/4534 [===>..........................] - ETA: 7s - loss: 0.0147 - acc: 1.0000

 704/4534 [===>..........................] - ETA: 7s - loss: 0.0155 - acc: 0.9986

 736/4534 [===>..........................] - ETA: 7s - loss: 0.0151 - acc: 0.9986

 768/4534 [====>.........................] - ETA: 7s - loss: 0.0147 - acc: 0.9987

 800/4534 [====>.........................] - ETA: 7s - loss: 0.0153 - acc: 0.9988

 832/4534 [====>.........................] - ETA: 7s - loss: 0.0158 - acc: 0.9988

 864/4534 [====>.........................] - ETA: 7s - loss: 0.0155 - acc: 0.9988

 896/4534 [====>.........................] - ETA: 7s - loss: 0.0156 - acc: 0.9989

 928/4534 [=====>........................] - ETA: 6s - loss: 0.0157 - acc: 0.9989

 960/4534 [=====>........................] - ETA: 6s - loss: 0.0156 - acc: 0.9990

 992/4534 [=====>........................] - ETA: 6s - loss: 0.0157 - acc: 0.9990

1024/4534 [=====>........................] - ETA: 6s - loss: 0.0154 - acc: 0.9990

1056/4534 [=====>........................] - ETA: 6s - loss: 0.0151 - acc: 0.9991



























































































































































































































Epoch 4/5
  32/4534 [..............................] - ETA: 9s - loss: 8.0918e-04 - acc: 1.0000

  64/4534 [..............................] - ETA: 9s - loss: 0.0055 - acc: 1.0000    

  96/4534 [..............................] - ETA: 8s - loss: 0.0040 - acc: 1.0000

 128/4534 [..............................] - ETA: 8s - loss: 0.0034 - acc: 1.0000

 160/4534 [>.............................] - ETA: 8s - loss: 0.0037 - acc: 1.0000

 192/4534 [>.............................] - ETA: 8s - loss: 0.0034 - acc: 1.0000

 224/4534 [>.............................] - ETA: 8s - loss: 0.0031 - acc: 1.0000

 256/4534 [>.............................] - ETA: 7s - loss: 0.0029 - acc: 1.0000

 288/4534 [>.............................] - ETA: 7s - loss: 0.0030 - acc: 1.0000

 320/4534 [=>............................] - ETA: 7s - loss: 0.0031 - acc: 1.0000

 352/4534 [=>............................] - ETA: 7s - loss: 0.0030 - acc: 1.0000

 384/4534 [=>............................] - ETA: 7s - loss: 0.0028 - acc: 1.0000

 416/4534 [=>............................] - ETA: 7s - loss: 0.0028 - acc: 1.0000

 448/4534 [=>............................] - ETA: 7s - loss: 0.0027 - acc: 1.0000

 480/4534 [==>...........................] - ETA: 7s - loss: 0.0027 - acc: 1.0000

 512/4534 [==>...........................] - ETA: 7s - loss: 0.0026 - acc: 1.0000

 544/4534 [==>...........................] - ETA: 7s - loss: 0.0025 - acc: 1.0000

 576/4534 [==>...........................] - ETA: 7s - loss: 0.0025 - acc: 1.0000

 608/4534 [===>..........................] - ETA: 7s - loss: 0.0024 - acc: 1.0000

 640/4534 [===>..........................] - ETA: 7s - loss: 0.0024 - acc: 1.0000

 672/4534 [===>..........................] - ETA: 7s - loss: 0.0023 - acc: 1.0000

 704/4534 [===>..........................] - ETA: 7s - loss: 0.0023 - acc: 1.0000

 736/4534 [===>..........................] - ETA: 7s - loss: 0.0024 - acc: 1.0000

 768/4534 [====>.........................] - ETA: 7s - loss: 0.0024 - acc: 1.0000

 800/4534 [====>.........................] - ETA: 7s - loss: 0.0023 - acc: 1.0000

 832/4534 [====>.........................] - ETA: 7s - loss: 0.0023 - acc: 1.0000

 864/4534 [====>.........................] - ETA: 7s - loss: 0.0023 - acc: 1.0000

 896/4534 [====>.........................] - ETA: 7s - loss: 0.0023 - acc: 1.0000

 928/4534 [=====>........................] - ETA: 7s - loss: 0.0023 - acc: 1.0000

 960/4534 [=====>........................] - ETA: 7s - loss: 0.0022 - acc: 1.0000

 992/4534 [=====>........................] - ETA: 7s - loss: 0.0022 - acc: 1.0000

1024/4534 [=====>........................] - ETA: 7s - loss: 0.0022 - acc: 1.0000

1056/4534 [=====>........................] - ETA: 7s - loss: 0.0021 - acc: 1.0000



























































































































































































































Epoch 5/5
  32/4534 [..............................] - ETA: 8s - loss: 9.0373e-04 - acc: 1.0000

  64/4534 [..............................] - ETA: 8s - loss: 8.0452e-04 - acc: 1.0000

  96/4534 [..............................] - ETA: 10s - loss: 7.8016e-04 - acc: 1.0000

 128/4534 [..............................] - ETA: 10s - loss: 6.9799e-04 - acc: 1.0000

 160/4534 [>.............................] - ETA: 10s - loss: 7.2195e-04 - acc: 1.0000

 192/4534 [>.............................] - ETA: 10s - loss: 7.8105e-04 - acc: 1.0000

 224/4534 [>.............................] - ETA: 9s - loss: 7.8416e-04 - acc: 1.0000 

 256/4534 [>.............................] - ETA: 9s - loss: 7.7708e-04 - acc: 1.0000

 288/4534 [>.............................] - ETA: 9s - loss: 0.0010 - acc: 1.0000    

 320/4534 [=>............................] - ETA: 9s - loss: 0.0013 - acc: 1.0000

 352/4534 [=>............................] - ETA: 8s - loss: 0.0012 - acc: 1.0000

 384/4534 [=>............................] - ETA: 8s - loss: 0.0012 - acc: 1.0000

 416/4534 [=>............................] - ETA: 8s - loss: 0.0013 - acc: 1.0000

 448/4534 [=>............................] - ETA: 8s - loss: 0.0013 - acc: 1.0000

 480/4534 [==>...........................] - ETA: 8s - loss: 0.0012 - acc: 1.0000

 512/4534 [==>...........................] - ETA: 8s - loss: 0.0012 - acc: 1.0000

 544/4534 [==>...........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 576/4534 [==>...........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 608/4534 [===>..........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 640/4534 [===>..........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 672/4534 [===>..........................] - ETA: 7s - loss: 0.0013 - acc: 1.0000

 704/4534 [===>..........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 736/4534 [===>..........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 768/4534 [====>.........................] - ETA: 7s - loss: 0.0013 - acc: 1.0000

 800/4534 [====>.........................] - ETA: 7s - loss: 0.0013 - acc: 1.0000

 832/4534 [====>.........................] - ETA: 7s - loss: 0.0013 - acc: 1.0000

 864/4534 [====>.........................] - ETA: 7s - loss: 0.0013 - acc: 1.0000

 896/4534 [====>.........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 928/4534 [=====>........................] - ETA: 7s - loss: 0.0012 - acc: 1.0000

 960/4534 [=====>........................] - ETA: 6s - loss: 0.0013 - acc: 1.0000

 992/4534 [=====>........................] - ETA: 6s - loss: 0.0013 - acc: 1.0000

1024/4534 [=====>........................] - ETA: 6s - loss: 0.0012 - acc: 1.0000

1056/4534 [=====>........................] - ETA: 6s - loss: 0.0012 - acc: 1.0000



























































































































































































































In [167]:
# Sanity check: display the shape of the test sentences about to be encoded.
x_test.shape

(1418,)

In [168]:
# Encode every test sentence as a list of vocabulary ids
unk_id = vocab["UNK"]  # id used for tokens that are missing from vocab
X_test = np.empty((x_test.shape[0], ), dtype=list)
for i in range(len(test['sentence'])):
    # str() guards against non-string cells; cleaning matches the training data.
    cleaned = re.sub("[^a-zA-Z]", " ", str(test['sentence'][i])).lower()
    X_test[i] = [vocab.get(token, unk_id) for token in nltk.word_tokenize(cleaned)]

# NOTE: x_test is rebound here from the raw sentence column to the padded id matrix.
x_test = sequence.pad_sequences(X_test, maxlen=MAX_SENTENCE_LENGTH)

In [170]:
# Run the trained model on the padded test sequences; the result is indexed
# as pred[i, 0] (one sigmoid output per test sentence) when labels are derived.
pred = model.predict(x_test)

In [176]:
# Start from the sample submission and overwrite its 'label' column.
submission = pd.read_csv('~/Downloads/sample_submission.csv')

# Labels are written row for row, so the template needs exactly one row per
# prediction; fail loudly instead of producing a partially filled file.
if len(submission) != pred.shape[0]:
    raise ValueError("sample submission has %d rows but there are %d predictions"
                     % (len(submission), pred.shape[0]))

# Threshold all sigmoid outputs at once: below 0.5 -> 0, otherwise 1.
submission['label'] = np.where(pred[:, 0] < 0.5, 0, 1)

submission.to_csv('~/Downloads/submission4.csv', index=False)  # do not write the index column

