In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser



In [2]:
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
stop = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gohw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gohw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Flatten
from keras.initializers import Constant
from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
df = pd.read_csv('train.csv')

In [5]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]

In [6]:
df.loc[df['id'].isin(ids_with_target_error),'target'] = 0

# Count Vectorizer

In [7]:
countvec = feature_extraction.text.CountVectorizer()
tfidfvec = feature_extraction.text.TfidfVectorizer()

In [8]:
model = linear_model.RidgeClassifier()

In [9]:
vector_count = countvec.fit_transform(df['text']).todense()
vector_tfidf = tfidfvec.fit_transform(df['text']).todense()

In [11]:
scores = model_selection.cross_val_score(model, vector_count, df['target'], cv=5, scoring='f1')
np.mean(scores)

0.5715569538004827

In [12]:
scores = model_selection.cross_val_score(model, vector_tfidf, df['target'], cv=5, scoring='f1')
np.mean(scores)

0.6325075863982154

# Glove

In [7]:
def create_corpus(df):
    corpus = []
    for row in tqdm(df['text']):
        word = [w.lower() for w in word_tokenize(row) if ((w.isalpha()) and (w not in stop)) ]
        corpus.append(word)
    return corpus

In [8]:
corpus = create_corpus(df)

100%|████████████████████████████████████| 7613/7613 [00:01<00:00, 4785.94it/s]


In [56]:
embedding_dict = {}
with open('glove.6B/glove.6B.300d.txt','r', encoding='utf8') as f:
    for line in f:
        val = line.split()
        word = val[0]
        vec = np.array(val[1:],'float32')
        embedding_dict[word] = vec
f.close()

In [57]:
tok = Tokenizer()
tok.fit_on_texts(corpus)
seq = tok.texts_to_sequences(corpus)

In [58]:
pad = pad_sequences(seq, maxlen=30, truncating='post', padding='post')

In [59]:
word_index = tok.word_index
len(word_index)

15015

In [60]:
emb_mat = np.zeros((len(word_index)+1,300))
for word,i in word_index.items():
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        emb_mat[i] = emb_vec

In [61]:
emb_mat.shape

(15016, 300)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(pad,df['target'],test_size=0.2, shuffle=True)

In [63]:
model = Sequential()
embedding = Embedding(emb_mat.shape[0], emb_mat.shape[1], embeddings_initializer=Constant(emb_mat), input_length=30, trainable=False)
model.add(embedding)
#model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
#model.add(Flatten())
#model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
opt = Adam(lr=1e-5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [64]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 300)           4504800   
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 4,598,305
Trainable params: 93,505
Non-trainable params: 4,504,800
_________________________________________________________________


In [65]:
history = model.fit(X_train, y_train, batch_size=4, epochs=10, validation_data=(X_test, y_test), verbose=1)

Train on 6090 samples, validate on 1523 samples
Epoch 1/10








Epoch 2/10








Epoch 3/10








Epoch 4/10








Epoch 5/10


1404/6090 [=====>........................] - ETA: 1:21 - loss: 0.7116 - acc: 0.500 - ETA: 1:26 - loss: 0.5995 - acc: 0.750 - ETA: 1:33 - loss: 0.5308 - acc: 0.833 - ETA: 1:35 - loss: 0.5421 - acc: 0.750 - ETA: 1:45 - loss: 0.4972 - acc: 0.800 - ETA: 1:44 - loss: 0.5003 - acc: 0.791 - ETA: 1:44 - loss: 0.4811 - acc: 0.821 - ETA: 1:42 - loss: 0.4568 - acc: 0.843 - ETA: 1:31 - loss: 0.4792 - acc: 0.825 - ETA: 1:23 - loss: 0.4451 - acc: 0.854 - ETA: 1:19 - loss: 0.4472 - acc: 0.839 - ETA: 1:19 - loss: 0.4525 - acc: 0.833 - ETA: 1:15 - loss: 0.4753 - acc: 0.823 - ETA: 1:11 - loss: 0.5151 - acc: 0.802 - ETA: 1:10 - loss: 0.5042 - acc: 0.809 - ETA: 1:08 - loss: 0.4873 - acc: 0.815 - ETA: 1:06 - loss: 0.4754 - acc: 0.830 - ETA: 1:04 - loss: 0.4702 - acc: 0.833 - ETA: 1:02 - loss: 0.4641 - acc: 0.833 - ETA: 1:01 - loss: 0.4664 - acc: 0.828 - ETA: 1:00 - loss: 0.4715 - acc: 0.816 - ETA: 59s - loss: 0.4821 - acc: 0.805 - ETA: 58s - loss: 0.4942 - acc: 0.80 - ETA: 59s - loss: 0.4918 - acc: 0.80 - 







Epoch 6/10










Epoch 7/10


1372/6090 [=====>........................] - ETA: 47s - loss: 0.2024 - acc: 1.00 - ETA: 47s - loss: 0.3960 - acc: 0.91 - ETA: 47s - loss: 0.5211 - acc: 0.80 - ETA: 47s - loss: 0.5461 - acc: 0.78 - ETA: 47s - loss: 0.5125 - acc: 0.80 - ETA: 47s - loss: 0.5089 - acc: 0.81 - ETA: 52s - loss: 0.5156 - acc: 0.80 - ETA: 1:00 - loss: 0.5596 - acc: 0.767 - ETA: 1:02 - loss: 0.5394 - acc: 0.783 - ETA: 1:09 - loss: 0.5461 - acc: 0.781 - ETA: 1:06 - loss: 0.5772 - acc: 0.763 - ETA: 1:04 - loss: 0.5635 - acc: 0.762 - ETA: 1:02 - loss: 0.5734 - acc: 0.761 - ETA: 1:02 - loss: 0.5686 - acc: 0.760 - ETA: 1:00 - loss: 0.5803 - acc: 0.759 - ETA: 59s - loss: 0.5765 - acc: 0.767 - ETA: 58s - loss: 0.5756 - acc: 0.77 - ETA: 57s - loss: 0.5628 - acc: 0.78 - ETA: 57s - loss: 0.5388 - acc: 0.79 - ETA: 56s - loss: 0.5526 - acc: 0.79 - ETA: 55s - loss: 0.5420 - acc: 0.79 - ETA: 55s - loss: 0.5336 - acc: 0.80 - ETA: 54s - loss: 0.5226 - acc: 0.80 - ETA: 54s - loss: 0.5145 - acc: 0.81 - ETA: 53s - loss: 0.5139 - 







Epoch 8/10








Epoch 9/10








Epoch 10/10


1328/6090 [=====>........................] - ETA: 47s - loss: 0.1813 - acc: 1.00 - ETA: 55s - loss: 0.3658 - acc: 0.91 - ETA: 56s - loss: 0.3138 - acc: 0.90 - ETA: 57s - loss: 0.5104 - acc: 0.75 - ETA: 58s - loss: 0.4872 - acc: 0.77 - ETA: 58s - loss: 0.4795 - acc: 0.77 - ETA: 57s - loss: 0.4565 - acc: 0.80 - ETA: 59s - loss: 0.4609 - acc: 0.80 - ETA: 58s - loss: 0.4513 - acc: 0.81 - ETA: 58s - loss: 0.4501 - acc: 0.81 - ETA: 57s - loss: 0.4573 - acc: 0.80 - ETA: 57s - loss: 0.4483 - acc: 0.80 - ETA: 56s - loss: 0.4537 - acc: 0.79 - ETA: 57s - loss: 0.4461 - acc: 0.79 - ETA: 57s - loss: 0.4628 - acc: 0.79 - ETA: 58s - loss: 0.4562 - acc: 0.80 - ETA: 58s - loss: 0.4617 - acc: 0.79 - ETA: 59s - loss: 0.4653 - acc: 0.79 - ETA: 59s - loss: 0.4561 - acc: 0.80 - ETA: 1:00 - loss: 0.4591 - acc: 0.801 - ETA: 1:00 - loss: 0.4590 - acc: 0.805 - ETA: 1:00 - loss: 0.4783 - acc: 0.796 - ETA: 1:00 - loss: 0.5105 - acc: 0.775 - ETA: 1:00 - loss: 0.5109 - acc: 0.774 - ETA: 1:00 - loss: 0.5100 - acc: 0









In [66]:
pred=model.predict(X_test)

In [67]:
f1_score(y_test.as_matrix(),np.round(pred).astype(int))

  """Entry point for launching an IPython kernel.


0.7441860465116279

# Word2Vec embedding

In [68]:
model = Word2Vec(corpus, iter=100, min_count=1, size=300, window=10)

In [69]:
len(model.wv.index2word)

15015

In [70]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(15015, 300)

In [71]:
tok = Tokenizer()
tok.fit_on_texts(corpus)
seq = tok.texts_to_sequences(corpus)
pad = pad_sequences(seq, maxlen=30, truncating='post', padding='post')

In [72]:
word_index = tok.word_index
len(word_index)

15015

In [73]:
len(model.wv.syn0[2])

  """Entry point for launching an IPython kernel.


300

In [74]:
emb_mat_new = np.zeros((len(word_index)+1,300))
for word,i in word_index.items():
    index = model.wv.index2word.index(word)
    emb_vec_new = model.wv.syn0[index]
    if emb_vec_new is not None:
        emb_mat_new[i] = emb_vec_new

  after removing the cwd from sys.path.


In [75]:
print(emb_mat_new.shape)
emb_mat_new

(15016, 300)


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.33808666e-01, -1.28938818e+00,  5.79067171e-01, ...,
         4.45988141e-02,  1.43443614e-01, -2.23751500e-01],
       [-8.14709008e-01,  1.03409655e-01, -1.18410933e+00, ...,
         1.26772749e+00,  1.39811024e-01,  8.63084853e-01],
       ...,
       [-1.44951671e-01,  3.16515118e-02, -1.54727206e-01, ...,
         3.08250207e-02,  1.79997817e-01, -1.25375405e-01],
       [-6.60575926e-04,  1.19019702e-01, -1.54966917e-02, ...,
         5.82552701e-02,  8.06757137e-02,  4.43036892e-02],
       [ 1.00453962e-02,  9.98999402e-02, -1.16862990e-02, ...,
         4.10295613e-02,  6.75621107e-02,  5.01458235e-02]])

In [76]:
model = Sequential()
embedding = Embedding(emb_mat_new.shape[0], emb_mat_new.shape[1], embeddings_initializer=Constant(emb_mat_new), input_length=30, trainable=False)
model.add(embedding)
#model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
#model.add(Flatten())
#model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
opt = Adam(lr=1e-5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [77]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 30, 300)           4504800   
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 4,598,305
Trainable params: 93,505
Non-trainable params: 4,504,800
_________________________________________________________________


In [78]:
history = model.fit(X_train, y_train, batch_size=4, epochs=10, validation_data=(X_test, y_test))

Train on 6090 samples, validate on 1523 samples
Epoch 1/10


1352/6090 [=====>........................] - ETA: 4:03:13 - loss: 0.6922 - acc: 0.50 - ETA: 2:03:20 - loss: 0.6934 - acc: 0.37 - ETA: 1:02:04 - loss: 0.6930 - acc: 0.37 - ETA: 41:39 - loss: 0.6928 - acc: 0.5000 - ETA: 35:57 - loss: 0.6930 - acc: 0.50 - ETA: 28:09 - loss: 0.6934 - acc: 0.47 - ETA: 23:11 - loss: 0.6936 - acc: 0.45 - ETA: 19:45 - loss: 0.6933 - acc: 0.50 - ETA: 17:14 - loss: 0.6932 - acc: 0.50 - ETA: 15:18 - loss: 0.6928 - acc: 0.51 - ETA: 13:48 - loss: 0.6930 - acc: 0.53 - ETA: 12:33 - loss: 0.6930 - acc: 0.54 - ETA: 11:32 - loss: 0.6930 - acc: 0.54 - ETA: 10:40 - loss: 0.6931 - acc: 0.54 - ETA: 9:57 - loss: 0.6930 - acc: 0.5370 - ETA: 9:19 - loss: 0.6930 - acc: 0.525 - ETA: 8:47 - loss: 0.6929 - acc: 0.516 - ETA: 8:18 - loss: 0.6930 - acc: 0.515 - ETA: 7:52 - loss: 0.6930 - acc: 0.507 - ETA: 7:29 - loss: 0.6930 - acc: 0.500 - ETA: 7:09 - loss: 0.6930 - acc: 0.493 - ETA: 7:00 - loss: 0.6930 - acc: 0.500 - ETA: 6:42 - loss: 0.6930 - acc: 0.511 - ETA: 6:26 - loss: 0.6930 -







Epoch 2/10


1368/6090 [=====>........................] - ETA: 1:27 - loss: 0.6758 - acc: 0.750 - ETA: 1:27 - loss: 0.6784 - acc: 0.750 - ETA: 1:09 - loss: 0.6786 - acc: 0.812 - ETA: 1:14 - loss: 0.6814 - acc: 0.750 - ETA: 1:18 - loss: 0.6831 - acc: 0.708 - ETA: 1:12 - loss: 0.6830 - acc: 0.718 - ETA: 1:09 - loss: 0.6865 - acc: 0.625 - ETA: 1:07 - loss: 0.6856 - acc: 0.625 - ETA: 1:09 - loss: 0.6861 - acc: 0.615 - ETA: 1:08 - loss: 0.6879 - acc: 0.583 - ETA: 1:06 - loss: 0.6858 - acc: 0.617 - ETA: 1:05 - loss: 0.6853 - acc: 0.631 - ETA: 1:05 - loss: 0.6872 - acc: 0.595 - ETA: 1:04 - loss: 0.6858 - acc: 0.619 - ETA: 1:04 - loss: 0.6865 - acc: 0.610 - ETA: 1:04 - loss: 0.6865 - acc: 0.611 - ETA: 1:04 - loss: 0.6858 - acc: 0.620 - ETA: 1:04 - loss: 0.6861 - acc: 0.612 - ETA: 1:03 - loss: 0.6862 - acc: 0.613 - ETA: 1:03 - loss: 0.6861 - acc: 0.621 - ETA: 1:04 - loss: 0.6863 - acc: 0.618 - ETA: 1:04 - loss: 0.6857 - acc: 0.625 - ETA: 1:03 - loss: 0.6854 - acc: 0.631 - ETA: 1:03 - loss: 0.6864 - acc: 0.6







Epoch 3/10


1244/6090 [=====>........................] - ETA: 1:27 - loss: 0.6232 - acc: 0.750 - ETA: 1:59 - loss: 0.5589 - acc: 0.875 - ETA: 1:59 - loss: 0.6171 - acc: 0.750 - ETA: 1:51 - loss: 0.6149 - acc: 0.687 - ETA: 1:32 - loss: 0.5850 - acc: 0.750 - ETA: 1:23 - loss: 0.5819 - acc: 0.718 - ETA: 1:21 - loss: 0.5752 - acc: 0.750 - ETA: 1:17 - loss: 0.5612 - acc: 0.770 - ETA: 1:16 - loss: 0.5690 - acc: 0.750 - ETA: 1:13 - loss: 0.5831 - acc: 0.750 - ETA: 1:13 - loss: 0.5879 - acc: 0.736 - ETA: 1:11 - loss: 0.5761 - acc: 0.762 - ETA: 1:12 - loss: 0.5779 - acc: 0.761 - ETA: 1:11 - loss: 0.5876 - acc: 0.750 - ETA: 1:10 - loss: 0.5927 - acc: 0.740 - ETA: 1:09 - loss: 0.5872 - acc: 0.750 - ETA: 1:10 - loss: 0.5844 - acc: 0.758 - ETA: 1:09 - loss: 0.5821 - acc: 0.766 - ETA: 1:10 - loss: 0.5812 - acc: 0.766 - ETA: 1:09 - loss: 0.5839 - acc: 0.765 - ETA: 1:09 - loss: 0.5822 - acc: 0.764 - ETA: 1:10 - loss: 0.5796 - acc: 0.764 - ETA: 1:10 - loss: 0.5870 - acc: 0.750 - ETA: 1:11 - loss: 0.5814 - acc: 0.7









Epoch 4/10


1152/6090 [====>.........................] - ETA: 51s - loss: 0.4091 - acc: 1.00 - ETA: 56s - loss: 0.4414 - acc: 0.91 - ETA: 1:06 - loss: 0.4835 - acc: 0.875 - ETA: 1:03 - loss: 0.4902 - acc: 0.833 - ETA: 1:08 - loss: 0.4675 - acc: 0.857 - ETA: 1:11 - loss: 0.4630 - acc: 0.843 - ETA: 1:08 - loss: 0.5018 - acc: 0.800 - ETA: 1:10 - loss: 0.5125 - acc: 0.795 - ETA: 1:10 - loss: 0.5620 - acc: 0.750 - ETA: 1:08 - loss: 0.5857 - acc: 0.716 - ETA: 1:09 - loss: 0.5604 - acc: 0.720 - ETA: 1:07 - loss: 0.5839 - acc: 0.710 - ETA: 1:09 - loss: 0.5810 - acc: 0.712 - ETA: 1:08 - loss: 0.5536 - acc: 0.738 - ETA: 1:09 - loss: 0.5500 - acc: 0.739 - ETA: 1:12 - loss: 0.5432 - acc: 0.739 - ETA: 1:15 - loss: 0.5383 - acc: 0.740 - ETA: 1:16 - loss: 0.5392 - acc: 0.740 - ETA: 1:14 - loss: 0.5336 - acc: 0.741 - ETA: 1:13 - loss: 0.5391 - acc: 0.741 - ETA: 1:14 - loss: 0.5369 - acc: 0.741 - ETA: 1:14 - loss: 0.5263 - acc: 0.750 - ETA: 1:15 - loss: 0.5208 - acc: 0.757 - ETA: 1:17 - loss: 0.5234 - acc: 0.757 -







Epoch 5/10


1200/6090 [====>.........................] - ETA: 1:11 - loss: 0.6503 - acc: 0.750 - ETA: 1:23 - loss: 0.6108 - acc: 0.625 - ETA: 1:11 - loss: 0.5678 - acc: 0.687 - ETA: 1:20 - loss: 0.5841 - acc: 0.650 - ETA: 1:14 - loss: 0.5922 - acc: 0.642 - ETA: 1:13 - loss: 0.5892 - acc: 0.638 - ETA: 1:10 - loss: 0.6471 - acc: 0.590 - ETA: 1:08 - loss: 0.5884 - acc: 0.653 - ETA: 1:10 - loss: 0.5729 - acc: 0.678 - ETA: 1:09 - loss: 0.5583 - acc: 0.703 - ETA: 1:07 - loss: 0.5211 - acc: 0.736 - ETA: 1:09 - loss: 0.5223 - acc: 0.723 - ETA: 1:08 - loss: 0.5560 - acc: 0.702 - ETA: 1:07 - loss: 0.5385 - acc: 0.717 - ETA: 1:08 - loss: 0.5431 - acc: 0.718 - ETA: 1:09 - loss: 0.5388 - acc: 0.730 - ETA: 1:08 - loss: 0.5343 - acc: 0.731 - ETA: 1:09 - loss: 0.5231 - acc: 0.741 - ETA: 1:08 - loss: 0.5027 - acc: 0.758 - ETA: 1:07 - loss: 0.4990 - acc: 0.765 - ETA: 1:08 - loss: 0.4907 - acc: 0.772 - ETA: 1:08 - loss: 0.4924 - acc: 0.771 - ETA: 1:07 - loss: 0.5044 - acc: 0.756 - ETA: 1:08 - loss: 0.5014 - acc: 0.7







Epoch 6/10


1000/6090 [===>..........................] - ETA: 1:11 - loss: 0.2783 - acc: 1.000 - ETA: 1:23 - loss: 0.5703 - acc: 0.750 - ETA: 1:11 - loss: 0.4652 - acc: 0.812 - ETA: 1:15 - loss: 0.4112 - acc: 0.850 - ETA: 1:18 - loss: 0.4161 - acc: 0.875 - ETA: 1:13 - loss: 0.4160 - acc: 0.843 - ETA: 1:16 - loss: 0.4385 - acc: 0.833 - ETA: 1:17 - loss: 0.4203 - acc: 0.850 - ETA: 1:19 - loss: 0.4273 - acc: 0.840 - ETA: 1:17 - loss: 0.4465 - acc: 0.826 - ETA: 1:15 - loss: 0.4570 - acc: 0.800 - ETA: 1:16 - loss: 0.4692 - acc: 0.781 - ETA: 1:14 - loss: 0.4812 - acc: 0.777 - ETA: 1:15 - loss: 0.4862 - acc: 0.776 - ETA: 1:16 - loss: 0.5086 - acc: 0.762 - ETA: 1:14 - loss: 0.4885 - acc: 0.784 - ETA: 1:15 - loss: 0.4893 - acc: 0.782 - ETA: 1:14 - loss: 0.5031 - acc: 0.770 - ETA: 1:13 - loss: 0.5126 - acc: 0.759 - ETA: 1:12 - loss: 0.5109 - acc: 0.758 - ETA: 1:12 - loss: 0.5078 - acc: 0.758 - ETA: 1:12 - loss: 0.5127 - acc: 0.757 - ETA: 1:12 - loss: 0.5203 - acc: 0.750 - ETA: 1:12 - loss: 0.5326 - acc: 0.7









Epoch 7/10


 860/6090 [===>..........................] - ETA: 1:53 - loss: 0.2292 - acc: 1.000 - ETA: 1:40 - loss: 0.4349 - acc: 0.750 - ETA: 1:22 - loss: 0.4804 - acc: 0.750 - ETA: 1:24 - loss: 0.4318 - acc: 0.800 - ETA: 1:24 - loss: 0.4033 - acc: 0.833 - ETA: 1:29 - loss: 0.4726 - acc: 0.785 - ETA: 1:28 - loss: 0.4911 - acc: 0.781 - ETA: 1:28 - loss: 0.5206 - acc: 0.777 - ETA: 1:30 - loss: 0.5149 - acc: 0.775 - ETA: 1:31 - loss: 0.4887 - acc: 0.795 - ETA: 1:28 - loss: 0.5067 - acc: 0.788 - ETA: 1:27 - loss: 0.5185 - acc: 0.785 - ETA: 1:29 - loss: 0.5541 - acc: 0.750 - ETA: 1:29 - loss: 0.5517 - acc: 0.750 - ETA: 1:30 - loss: 0.5348 - acc: 0.764 - ETA: 1:29 - loss: 0.5592 - acc: 0.736 - ETA: 1:26 - loss: 0.5456 - acc: 0.750 - ETA: 1:27 - loss: 0.5379 - acc: 0.750 - ETA: 1:28 - loss: 0.5425 - acc: 0.750 - ETA: 1:26 - loss: 0.5386 - acc: 0.750 - ETA: 1:26 - loss: 0.5268 - acc: 0.759 - ETA: 1:26 - loss: 0.5367 - acc: 0.750 - ETA: 1:26 - loss: 0.5373 - acc: 0.750 - ETA: 1:26 - loss: 0.5301 - acc: 0.7











Epoch 8/10


 860/6090 [===>..........................] - ETA: 1:11 - loss: 0.3497 - acc: 0.750 - ETA: 1:23 - loss: 0.5126 - acc: 0.750 - ETA: 1:26 - loss: 0.5098 - acc: 0.750 - ETA: 1:28 - loss: 0.5431 - acc: 0.750 - ETA: 1:29 - loss: 0.6222 - acc: 0.700 - ETA: 1:30 - loss: 0.5625 - acc: 0.750 - ETA: 1:31 - loss: 0.5833 - acc: 0.714 - ETA: 1:31 - loss: 0.5341 - acc: 0.750 - ETA: 1:31 - loss: 0.5483 - acc: 0.750 - ETA: 1:32 - loss: 0.5185 - acc: 0.775 - ETA: 1:32 - loss: 0.4995 - acc: 0.795 - ETA: 1:32 - loss: 0.4976 - acc: 0.791 - ETA: 1:32 - loss: 0.4742 - acc: 0.807 - ETA: 1:32 - loss: 0.4831 - acc: 0.803 - ETA: 1:32 - loss: 0.4751 - acc: 0.816 - ETA: 1:34 - loss: 0.5158 - acc: 0.781 - ETA: 1:34 - loss: 0.5233 - acc: 0.779 - ETA: 1:34 - loss: 0.5604 - acc: 0.750 - ETA: 1:34 - loss: 0.5511 - acc: 0.750 - ETA: 1:33 - loss: 0.5347 - acc: 0.762 - ETA: 1:33 - loss: 0.5340 - acc: 0.761 - ETA: 1:33 - loss: 0.5230 - acc: 0.772 - ETA: 1:33 - loss: 0.5126 - acc: 0.782 - ETA: 1:34 - loss: 0.5182 - acc: 0.7













Epoch 9/10


 832/6090 [===>..........................] - ETA: 1:23 - loss: 0.8030 - acc: 0.750 - ETA: 1:07 - loss: 0.6291 - acc: 0.750 - ETA: 1:14 - loss: 0.5234 - acc: 0.812 - ETA: 1:18 - loss: 0.4908 - acc: 0.800 - ETA: 1:20 - loss: 0.4353 - acc: 0.833 - ETA: 1:23 - loss: 0.3969 - acc: 0.857 - ETA: 1:27 - loss: 0.4672 - acc: 0.812 - ETA: 1:28 - loss: 0.4400 - acc: 0.833 - ETA: 1:28 - loss: 0.4572 - acc: 0.825 - ETA: 1:29 - loss: 0.4559 - acc: 0.818 - ETA: 1:29 - loss: 0.4654 - acc: 0.812 - ETA: 1:30 - loss: 0.4442 - acc: 0.826 - ETA: 1:31 - loss: 0.4517 - acc: 0.821 - ETA: 1:32 - loss: 0.4360 - acc: 0.833 - ETA: 1:32 - loss: 0.4189 - acc: 0.843 - ETA: 1:32 - loss: 0.4209 - acc: 0.838 - ETA: 1:29 - loss: 0.4005 - acc: 0.855 - ETA: 1:30 - loss: 0.4135 - acc: 0.850 - ETA: 1:30 - loss: 0.4106 - acc: 0.845 - ETA: 1:30 - loss: 0.4003 - acc: 0.852 - ETA: 1:30 - loss: 0.4077 - acc: 0.847 - ETA: 1:30 - loss: 0.4018 - acc: 0.854 - ETA: 1:31 - loss: 0.3980 - acc: 0.860 - ETA: 1:29 - loss: 0.4142 - acc: 0.8













Epoch 10/10


 816/6090 [===>..........................] - ETA: 1:32 - loss: 0.9877 - acc: 0.500 - ETA: 1:35 - loss: 0.8342 - acc: 0.625 - ETA: 1:38 - loss: 0.7130 - acc: 0.666 - ETA: 1:43 - loss: 0.6928 - acc: 0.687 - ETA: 1:46 - loss: 0.6155 - acc: 0.750 - ETA: 1:47 - loss: 0.6043 - acc: 0.750 - ETA: 1:48 - loss: 0.6294 - acc: 0.714 - ETA: 1:49 - loss: 0.6837 - acc: 0.687 - ETA: 1:49 - loss: 0.6866 - acc: 0.666 - ETA: 1:49 - loss: 0.6721 - acc: 0.675 - ETA: 1:51 - loss: 0.6336 - acc: 0.704 - ETA: 1:52 - loss: 0.6180 - acc: 0.708 - ETA: 1:51 - loss: 0.5915 - acc: 0.730 - ETA: 1:51 - loss: 0.5652 - acc: 0.750 - ETA: 1:51 - loss: 0.5627 - acc: 0.750 - ETA: 1:51 - loss: 0.5521 - acc: 0.750 - ETA: 1:51 - loss: 0.5401 - acc: 0.750 - ETA: 1:51 - loss: 0.5271 - acc: 0.763 - ETA: 1:50 - loss: 0.5132 - acc: 0.776 - ETA: 1:50 - loss: 0.5171 - acc: 0.775 - ETA: 1:50 - loss: 0.5194 - acc: 0.773 - ETA: 1:50 - loss: 0.5085 - acc: 0.784 - ETA: 1:50 - loss: 0.4972 - acc: 0.793 - ETA: 1:50 - loss: 0.4865 - acc: 0.8















In [79]:
pred=model.predict(X_test)

In [80]:
f1_score(y_test.as_matrix(),np.round(pred).astype(int))

  """Entry point for launching an IPython kernel.


0.715092816787732

# Bigram Word2Vec

In [123]:
bigram = Phrases(corpus, min_count=1, threshold=2)

In [124]:
model = Word2Vec(Phraser(bigram)[corpus], iter=100, min_count=1, size=300, window=5)

In [125]:
len(model.wv.index2word)

18588

In [126]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(18588, 300)

In [140]:
seq = [[model.wv.index2word.index(word)+1 for word in line] for line in Phraser(bigram)[corpus]]

In [144]:
pad = pad_sequences(seq, maxlen=30, truncating='post', padding='post')

In [145]:
pad.shape

(7613, 30)

In [146]:
emb_mat_bi = np.zeros((len(model.wv.index2word)+1,300))
emb_mat_bi[1:] = model.wv.syn0

  


In [147]:
print(emb_mat_bi.shape)
emb_mat_bi

(18589, 300)


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.41050050e-01,  1.02071381e+00, -1.72832698e-01, ...,
        -8.06990087e-01,  4.93431777e-01, -8.95636141e-01],
       [-1.79948181e-01,  7.80899107e-01, -1.29675508e+00, ...,
         2.13270998e+00, -1.14702332e+00,  5.83924115e-01],
       ...,
       [-1.03534691e-01,  5.11198901e-02,  1.20399781e-02, ...,
         1.34615935e-02,  3.03649921e-02, -9.97772589e-02],
       [ 4.40160185e-02,  1.33806720e-01, -1.36804566e-01, ...,
        -1.58327408e-02,  1.04513116e-01, -2.89176678e-04],
       [ 3.59053500e-02,  9.10307914e-02, -9.10026133e-02, ...,
        -2.49990653e-02,  7.95121416e-02, -2.54430022e-04]])

In [148]:
model = Sequential()
embedding = Embedding(emb_mat_bi.shape[0], emb_mat_bi.shape[1], embeddings_initializer=Constant(emb_mat_bi), input_length=30, trainable=False)
model.add(embedding)
#model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
#model.add(Flatten())
#model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
opt = Adam(lr=1e-5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [149]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 30, 300)           5576700   
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
Total params: 5,670,205
Trainable params: 93,505
Non-trainable params: 5,576,700
_________________________________________________________________


In [150]:
X_train, X_test, y_train, y_test = train_test_split(pad,df['target'],test_size=0.2, shuffle=True)

In [151]:
history = model.fit(X_train, y_train, batch_size=4, epochs=5, validation_data=(X_test, y_test))

Train on 6090 samples, validate on 1523 samples
Epoch 1/5


 808/6090 [==>...........................] - ETA: 5:52:36 - loss: 0.6932 - acc: 0.50 - ETA: 2:58:22 - loss: 0.6924 - acc: 0.50 - ETA: 1:59:49 - loss: 0.6921 - acc: 0.58 - ETA: 1:30:41 - loss: 0.6929 - acc: 0.56 - ETA: 1:13:02 - loss: 0.6926 - acc: 0.55 - ETA: 1:01:24 - loss: 0.6924 - acc: 0.54 - ETA: 53:04 - loss: 0.6926 - acc: 0.5000 - ETA: 46:48 - loss: 0.6926 - acc: 0.50 - ETA: 41:56 - loss: 0.6927 - acc: 0.47 - ETA: 38:00 - loss: 0.6927 - acc: 0.47 - ETA: 34:49 - loss: 0.6927 - acc: 0.45 - ETA: 32:08 - loss: 0.6927 - acc: 0.45 - ETA: 29:54 - loss: 0.6928 - acc: 0.42 - ETA: 27:56 - loss: 0.6928 - acc: 0.41 - ETA: 26:17 - loss: 0.6927 - acc: 0.41 - ETA: 24:50 - loss: 0.6928 - acc: 0.42 - ETA: 23:32 - loss: 0.6928 - acc: 0.42 - ETA: 22:22 - loss: 0.6931 - acc: 0.40 - ETA: 21:22 - loss: 0.6929 - acc: 0.40 - ETA: 20:37 - loss: 0.6930 - acc: 0.38 - ETA: 19:46 - loss: 0.6930 - acc: 0.40 - ETA: 19:00 - loss: 0.6930 - acc: 0.38 - ETA: 18:18 - loss: 0.6931 - acc: 0.38 - ETA: 17:39 - loss: 0.













Epoch 2/5
 232/6090 [>.............................] - ETA: 2:32 - loss: 0.6982 - acc: 0.250 - ETA: 2:58 - loss: 0.6792 - acc: 0.500 - ETA: 3:12 - loss: 0.6811 - acc: 0.583 - ETA: 3:15 - loss: 0.6859 - acc: 0.500 - ETA: 3:16 - loss: 0.6870 - acc: 0.500 - ETA: 3:15 - loss: 0.6874 - acc: 0.500 - ETA: 3:15 - loss: 0.6889 - acc: 0.500 - ETA: 3:19 - loss: 0.6876 - acc: 0.531 - ETA: 3:22 - loss: 0.6880 - acc: 0.527 - ETA: 3:37 - loss: 0.6888 - acc: 0.525 - ETA: 3:39 - loss: 0.6884 - acc: 0.545 - ETA: 3:38 - loss: 0.6882 - acc: 0.541 - ETA: 3:35 - loss: 0.6888 - acc: 0.538 - ETA: 3:34 - loss: 0.6895 - acc: 0.535 - ETA: 3:32 - loss: 0.6900 - acc: 0.516 - ETA: 3:31 - loss: 0.6902 - acc: 0.515 - ETA: 3:31 - loss: 0.6902 - acc: 0.514 - ETA: 3:35 - loss: 0.6913 - acc: 0.500 - ETA: 3:35 - loss: 0.6917 - acc: 0.486 - ETA: 3:38 - loss: 0.6917 - acc: 0.487 - ETA: 3:38 - loss: 0.6914 - acc: 0.488 - ETA: 3:37 - loss: 0.6917 - acc: 0.488 - ETA: 3:36 - loss: 0.6918 - acc: 0.489 - ETA: 3:39 - loss: 0.6923 

  % delta_t_median)


 248/6090 [>.............................] - ETA: 4:06 - loss: 0.6876 - acc: 0.567 - ETA: 4:09 - loss: 0.6881 - acc: 0.558 - ETA: 4:10 - loss: 0.6884 - acc: 0.553 - ETA: 4:12 - loss: 0.6886 - acc: 0.5484

  % delta_t_median)


 260/6090 [>.............................] - ETA: 4:14 - loss: 0.6885 - acc: 0.551 - ETA: 4:15 - loss: 0.6883 - acc: 0.554 - ETA: 4:15 - loss: 0.6886 - acc: 0.5500

  % delta_t_median)


1076/6090 [====>.........................] - ETA: 4:15 - loss: 0.6890 - acc: 0.545 - ETA: 4:14 - loss: 0.6890 - acc: 0.544 - ETA: 4:13 - loss: 0.6888 - acc: 0.547 - ETA: 4:12 - loss: 0.6889 - acc: 0.547 - ETA: 4:11 - loss: 0.6889 - acc: 0.546 - ETA: 4:11 - loss: 0.6890 - acc: 0.545 - ETA: 4:10 - loss: 0.6889 - acc: 0.548 - ETA: 4:10 - loss: 0.6889 - acc: 0.547 - ETA: 4:09 - loss: 0.6886 - acc: 0.550 - ETA: 4:08 - loss: 0.6887 - acc: 0.550 - ETA: 4:08 - loss: 0.6883 - acc: 0.555 - ETA: 4:07 - loss: 0.6880 - acc: 0.558 - ETA: 4:08 - loss: 0.6879 - acc: 0.560 - ETA: 4:07 - loss: 0.6875 - acc: 0.563 - ETA: 4:06 - loss: 0.6870 - acc: 0.568 - ETA: 4:06 - loss: 0.6871 - acc: 0.567 - ETA: 4:05 - loss: 0.6874 - acc: 0.564 - ETA: 4:05 - loss: 0.6875 - acc: 0.563 - ETA: 4:04 - loss: 0.6873 - acc: 0.565 - ETA: 4:03 - loss: 0.6868 - acc: 0.570 - ETA: 4:03 - loss: 0.6868 - acc: 0.572 - ETA: 4:02 - loss: 0.6867 - acc: 0.571 - ETA: 4:02 - loss: 0.6865 - acc: 0.573 - ETA: 4:02 - loss: 0.6868 - acc: 0.5













Epoch 3/5


 816/6090 [===>..........................] - ETA: 2:17 - loss: 0.6356 - acc: 0.750 - ETA: 2:38 - loss: 0.5864 - acc: 0.875 - ETA: 2:43 - loss: 0.6203 - acc: 0.833 - ETA: 2:53 - loss: 0.6308 - acc: 0.812 - ETA: 2:54 - loss: 0.6357 - acc: 0.750 - ETA: 3:00 - loss: 0.6353 - acc: 0.750 - ETA: 2:58 - loss: 0.6287 - acc: 0.750 - ETA: 2:57 - loss: 0.6161 - acc: 0.750 - ETA: 2:57 - loss: 0.6154 - acc: 0.750 - ETA: 2:57 - loss: 0.6237 - acc: 0.725 - ETA: 2:57 - loss: 0.6263 - acc: 0.704 - ETA: 2:55 - loss: 0.6201 - acc: 0.708 - ETA: 2:56 - loss: 0.6269 - acc: 0.692 - ETA: 2:58 - loss: 0.6288 - acc: 0.696 - ETA: 2:58 - loss: 0.6300 - acc: 0.683 - ETA: 3:03 - loss: 0.6383 - acc: 0.671 - ETA: 3:03 - loss: 0.6425 - acc: 0.661 - ETA: 3:01 - loss: 0.6507 - acc: 0.666 - ETA: 3:03 - loss: 0.6548 - acc: 0.657 - ETA: 3:03 - loss: 0.6532 - acc: 0.662 - ETA: 3:03 - loss: 0.6610 - acc: 0.642 - ETA: 3:04 - loss: 0.6621 - acc: 0.647 - ETA: 3:03 - loss: 0.6538 - acc: 0.652 - ETA: 3:02 - loss: 0.6576 - acc: 0.6





  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)








Epoch 4/5


 816/6090 [===>..........................] - ETA: 2:24 - loss: 0.8951 - acc: 0.500 - ETA: 2:37 - loss: 0.7240 - acc: 0.500 - ETA: 2:44 - loss: 0.6169 - acc: 0.666 - ETA: 2:43 - loss: 0.7091 - acc: 0.562 - ETA: 2:51 - loss: 0.7405 - acc: 0.550 - ETA: 2:53 - loss: 0.7407 - acc: 0.541 - ETA: 2:56 - loss: 0.7030 - acc: 0.571 - ETA: 2:57 - loss: 0.6630 - acc: 0.625 - ETA: 2:57 - loss: 0.6607 - acc: 0.638 - ETA: 2:57 - loss: 0.6662 - acc: 0.650 - ETA: 2:58 - loss: 0.6421 - acc: 0.681 - ETA: 2:58 - loss: 0.6309 - acc: 0.708 - ETA: 2:57 - loss: 0.6295 - acc: 0.692 - ETA: 2:58 - loss: 0.6200 - acc: 0.696 - ETA: 2:58 - loss: 0.6222 - acc: 0.683 - ETA: 2:58 - loss: 0.6195 - acc: 0.671 - ETA: 2:58 - loss: 0.6113 - acc: 0.676 - ETA: 2:58 - loss: 0.6229 - acc: 0.680 - ETA: 2:58 - loss: 0.6167 - acc: 0.684 - ETA: 2:58 - loss: 0.6177 - acc: 0.675 - ETA: 2:58 - loss: 0.6066 - acc: 0.690 - ETA: 2:59 - loss: 0.6021 - acc: 0.704 - ETA: 2:59 - loss: 0.6001 - acc: 0.706 - ETA: 3:00 - loss: 0.5892 - acc: 0.7







  % delta_t_median)




  % delta_t_median)








Epoch 5/5


 816/6090 [===>..........................] - ETA: 2:58 - loss: 1.0656 - acc: 0.250 - ETA: 2:58 - loss: 0.7147 - acc: 0.625 - ETA: 3:06 - loss: 0.6317 - acc: 0.750 - ETA: 3:04 - loss: 0.5617 - acc: 0.812 - ETA: 3:07 - loss: 0.5397 - acc: 0.850 - ETA: 3:10 - loss: 0.5498 - acc: 0.833 - ETA: 3:11 - loss: 0.5319 - acc: 0.857 - ETA: 3:14 - loss: 0.5340 - acc: 0.843 - ETA: 3:16 - loss: 0.5218 - acc: 0.861 - ETA: 3:19 - loss: 0.5027 - acc: 0.875 - ETA: 3:19 - loss: 0.5256 - acc: 0.818 - ETA: 3:19 - loss: 0.5315 - acc: 0.812 - ETA: 3:19 - loss: 0.5201 - acc: 0.826 - ETA: 3:21 - loss: 0.5139 - acc: 0.839 - ETA: 3:23 - loss: 0.5444 - acc: 0.783 - ETA: 3:22 - loss: 0.5433 - acc: 0.781 - ETA: 3:22 - loss: 0.5499 - acc: 0.764 - ETA: 3:22 - loss: 0.5595 - acc: 0.722 - ETA: 3:21 - loss: 0.5824 - acc: 0.697 - ETA: 3:22 - loss: 0.5731 - acc: 0.700 - ETA: 3:21 - loss: 0.5792 - acc: 0.690 - ETA: 3:21 - loss: 0.5855 - acc: 0.681 - ETA: 3:22 - loss: 0.5826 - acc: 0.684 - ETA: 3:20 - loss: 0.5730 - acc: 0.6









  % delta_t_median)




  % delta_t_median)








In [152]:
pred=model.predict(X_test)

In [153]:
f1_score(y_test.as_matrix(),np.round(pred).astype(int))

  """Entry point for launching an IPython kernel.


0.673544583640383