In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser
from spellchecker import SpellChecker
import re
import string



In [2]:
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
stop = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gohw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gohw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Flatten
from keras.initializers import Constant
from keras.optimizers import Adam

Using TensorFlow backend.


# Data Cleaning

In [4]:
df = pd.read_csv('train.csv')

In [5]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]

In [6]:
df.loc[df['id'].isin(ids_with_target_error),'target'] = 0

In [7]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

In [8]:
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'',text)

In [9]:
def remove_emoji(text):
    emoji = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji.sub(r'', text)

In [10]:
def remove_punct(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

In [11]:
df['text'] = df['text'].apply(lambda x:remove_url(x)).apply(lambda x:remove_html(x)).apply(lambda x:remove_emoji(x)).apply(lambda x:remove_punct(x))

# Load Cleaned Data

In [4]:
df = pd.read_csv('train_corrected.csv')

# Count Vectorizer

In [5]:
countvec = feature_extraction.text.CountVectorizer()
tfidfvec = feature_extraction.text.TfidfVectorizer()

In [6]:
model = linear_model.RidgeClassifier()

In [7]:
vector_count = countvec.fit_transform(df['text']).todense()
vector_tfidf = tfidfvec.fit_transform(df['text']).todense()

In [8]:
scores = model_selection.cross_val_score(model, vector_count, df['target'], cv=5, scoring='f1')
np.mean(scores)

0.569462701690526

In [9]:
scores = model_selection.cross_val_score(model, vector_tfidf, df['target'], cv=5, scoring='f1')
np.mean(scores)

0.6182848015494506

# Glove - Wikipedia or Twitter

In [10]:
def create_corpus(df):
    corpus = []
    for row in tqdm(df['text']):
        word = [w.lower() for w in word_tokenize(row) if ((w.isalpha()) and (w not in stop)) ]
        corpus.append(word)
    return corpus

In [11]:
corpus = create_corpus(df)

100%|████████████████████████████████████| 7613/7613 [00:01<00:00, 6864.74it/s]


In [25]:
tok = Tokenizer()
tok.fit_on_texts(corpus)
seq = tok.texts_to_sequences(corpus)

In [26]:
pad = pad_sequences(seq, maxlen=30, truncating='post', padding='post')

In [18]:
X_train, X_test, y_train, y_test = train_test_split(pad,df['target'],test_size=0.2, shuffle=True)

In [24]:
embedding_dict = {}
with open('glove.twitter.27B/glove.twitter.27B.100d.txt','r', encoding='utf8') as f:
    for line in f:
        val = line.split()
        word = val[0]
        vec = np.array(val[1:],'float32')
        embedding_dict[word] = vec
f.close()

In [27]:
word_index = tok.word_index
len(word_index)

15686

In [29]:
emb_dim = 100
emb_mat = np.zeros((len(word_index)+1,emb_dim))
for word,i in word_index.items():
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        emb_mat[i] = emb_vec

In [30]:
emb_mat.shape

(15687, 100)

In [31]:
model = Sequential()
embedding = Embedding(emb_mat.shape[0], emb_mat.shape[1], embeddings_initializer=Constant(emb_mat), input_length=30, trainable=False)
model.add(embedding)
#model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
#model.add(Flatten())
#model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
opt = Adam(lr=1e-5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [32]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 100)           1568700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,611,005
Trainable params: 42,305
Non-trainable params: 1,568,700
_________________________________________________________________


In [33]:
history = model.fit(X_train, y_train, batch_size=4, epochs=10, validation_data=(X_test, y_test), verbose=0)

In [34]:
pred=model.predict(X_test)

In [23]:
# Wikipedia
f1_score(y_test.as_matrix(),np.round(pred).astype(int))

  """Entry point for launching an IPython kernel.


0.7254063301967493

In [35]:
# Twitter
f1_score(y_test.as_matrix(),np.round(pred).astype(int))

  


0.7292358803986713

# Word2Vec embedding

In [36]:
emb_dim = 100
model = Word2Vec(corpus, iter=50, min_count=1, size=emb_dim, window=5)

In [37]:
len(model.wv.index2word)

15686

In [38]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(15686, 100)

In [39]:
tok = Tokenizer()
tok.fit_on_texts(corpus)
seq = tok.texts_to_sequences(corpus)
pad = pad_sequences(seq, maxlen=30, truncating='post', padding='post')

In [40]:
X_train, X_test, y_train, y_test = train_test_split(pad,df['target'],test_size=0.2, shuffle=True)

In [41]:
word_index = tok.word_index
len(word_index)

15686

In [42]:
len(model.wv.syn0[2])

  """Entry point for launching an IPython kernel.


100

In [43]:
emb_mat_new = np.zeros((len(word_index)+1,emb_dim))
for word,i in word_index.items():
    index = model.wv.index2word.index(word)
    emb_vec_new = model.wv.syn0[index]
    if emb_vec_new is not None:
        emb_mat_new[i] = emb_vec_new

  after removing the cwd from sys.path.


In [44]:
print(emb_mat_new.shape)
emb_mat_new

(15687, 100)


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.25090939, -0.65710855, -1.569049  , ..., -0.48348492,
         1.219504  , -0.96216369],
       [ 0.39262351,  0.04824859, -1.72793174, ..., -0.38319874,
         0.26742628, -0.06977333],
       ...,
       [-0.14329815,  0.01530781, -0.04845123, ..., -0.06255065,
         0.02216502, -0.14169963],
       [-0.0608668 ,  0.00680893, -0.06536276, ..., -0.04777988,
         0.03836352,  0.00586081],
       [-0.02184738,  0.01286345, -0.03119265, ..., -0.03432117,
         0.03497138,  0.00528882]])

In [45]:
model = Sequential()
embedding = Embedding(emb_mat_new.shape[0], emb_mat_new.shape[1], embeddings_initializer=Constant(emb_mat_new), input_length=30, trainable=False)
model.add(embedding)
#model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
#model.add(Flatten())
#model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
opt = Adam(lr=1e-5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [46]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 100)           1568700   
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 1,611,005
Trainable params: 42,305
Non-trainable params: 1,568,700
_________________________________________________________________


In [47]:
history = model.fit(X_train, y_train, batch_size=4, epochs=10, validation_data=(X_test, y_test), verbose=0)

In [48]:
pred=model.predict(X_test)

In [49]:
f1_score(y_test.as_matrix(),np.round(pred).astype(int))

  """Entry point for launching an IPython kernel.


0.7082334132693846

# Bigram Word2Vec

In [50]:
bigram = Phrases(corpus, min_count=1, threshold=2)

In [56]:
emb_dim = 100
model = Word2Vec(Phraser(bigram)[corpus], iter=100, min_count=1, size=emb_dim, window=5)

In [57]:
len(model.wv.index2word)

19114

In [58]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(19114, 100)

In [59]:
seq = [[model.wv.index2word.index(word)+1 for word in line] for line in Phraser(bigram)[corpus]]

In [61]:
pad = pad_sequences(seq, maxlen=30, truncating='post', padding='post')

In [62]:
pad.shape

(7613, 30)

In [67]:
X_train, X_test, y_train, y_test = train_test_split(pad,df['target'],test_size=0.2, shuffle=True)

In [63]:
emb_mat_bi = np.zeros((len(model.wv.index2word)+1,emb_dim))
emb_mat_bi[1:] = model.wv.syn0

  


In [64]:
print(emb_mat_bi.shape)
emb_mat_bi

(19115, 100)


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.7201978 , -5.01257944, -4.29149199, ...,  0.81665832,
         6.7889576 ,  2.38013935],
       [-4.74477196, -0.46224374, -3.88889694, ..., -0.68521464,
         1.54192841, -4.04894018],
       ...,
       [-0.10639054, -0.05220846, -0.05545874, ..., -0.03213445,
         0.0635984 ,  0.04479982],
       [-0.25424987,  0.10132236, -0.10805469, ..., -0.05415159,
        -0.10835221,  0.02593345],
       [-0.18333161,  0.05533584, -0.05194137, ..., -0.04876018,
        -0.06487427,  0.03322721]])

In [65]:
model = Sequential()
embedding = Embedding(emb_mat_bi.shape[0], emb_mat_bi.shape[1], embeddings_initializer=Constant(emb_mat_bi), input_length=30, trainable=False)
model.add(embedding)
#model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
#model.add(Flatten())
#model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
opt = Adam(lr=1e-5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [66]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 30, 100)           1911500   
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 1,953,805
Trainable params: 42,305
Non-trainable params: 1,911,500
_________________________________________________________________


In [68]:
history = model.fit(X_train, y_train, batch_size=4, epochs=5, validation_data=(X_test, y_test), verbose=0)

In [69]:
pred=model.predict(X_test)

In [70]:
f1_score(y_test.as_matrix(),np.round(pred).astype(int))

  """Entry point for launching an IPython kernel.


0.6256077795786061