In [0]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
# Download the competition's train and test CSV files with wget
# (saved under their remote file names).
!wget https://datahack-prod.s3.amazonaws.com/train_file/train_2kmZucJ.csv
!wget https://datahack-prod.s3.amazonaws.com/test_file/test_oJQbWVk.csv

--2019-10-19 21:37:12--  https://datahack-prod.s3.amazonaws.com/train_file/train_2kmZucJ.csv
Resolving datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)... 52.219.62.56
Connecting to datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)|52.219.62.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1100229 (1.0M) [application/vnd.ms-excel]
Saving to: ‘train_2kmZucJ.csv’


2019-10-19 21:37:13 (2.01 MB/s) - ‘train_2kmZucJ.csv’ saved [1100229/1100229]

--2019-10-19 21:37:15--  https://datahack-prod.s3.amazonaws.com/test_file/test_oJQbWVk.csv
Resolving datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)... 52.219.62.56
Connecting to datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)|52.219.62.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 263010 (257K) [application/vnd.ms-excel]
Saving to: ‘test_oJQbWVk.csv’


2019-10-19 21:37:15 (824 KB/s) - ‘test_oJQbWVk.csv’ saved [263010/263010]



In [0]:
# Load the CSV files fetched by the wget cell. wget saves into the current
# working directory, so relative paths are used instead of the Colab-only
# absolute '/content/...' location.
train = pd.read_csv('train_2kmZucJ.csv')
test = pd.read_csv('test_oJQbWVk.csv')

In [0]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression (string or compiled pattern) describing
            the spans to delete.

    Returns:
        A new string with all non-overlapping matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex (findall + sub per match) misreads metacharacters contained in
    # the matched text (e.g. '(' or '+') and breaks when the pattern has
    # capture groups, because findall then returns groups instead of matches.
    return re.sub(pattern, '', input_txt)

In [0]:
# Stack train and test into one frame so preprocessing runs only once and the
# tokenizer later sees the words of both sets.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
combi = pd.concat([train, test], ignore_index=True, sort=False)

In [8]:
# Report (rows, columns) for the test frame first, then for the train frame.
for frame in (test, train):
    print(frame.shape)

(1953, 2)
(7920, 3)


In [9]:
# Shape of the combined train + test frame.
print(combi.shape)

(9873, 3)


In [10]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [0]:
# Remove Twitter handles (@user). The pattern is a raw string so that "\w" is
# not parsed as an (invalid) string escape sequence.
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")
# Replace everything except letters and '#' (numbers, punctuation, special
# characters) with a space. regex=True is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the text unchanged.
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [0]:
# Drop short words: keep only whitespace-separated tokens longer than 3 characters.
def _drop_short_words(text):
    """Return ``text`` rebuilt from its words that have more than 3 characters."""
    return ' '.join(word for word in text.split() if len(word) > 3)

combi['tidy_tweet'] = combi['tidy_tweet'].apply(_drop_short_words)

In [13]:
# Preview the combined frame, including the new tidy_tweet column.
combi.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...,#fingerprint #Pregnancy Test https MfQV #andro...
1,2,0.0,Finally a transparant silicon case ^^ Thanks t...,Finally transparant silicon case Thanks uncle ...
2,3,0.0,We love this! Would you go? #talk #makememorie...,love this Would #talk #makememories #unplug #r...
3,4,0.0,I'm wired I know I'm George I was made that wa...,wired know George made that #iphone #cute #dav...
4,5,1.0,What amazing service! Apple won't even talk to...,What amazing service Apple even talk about que...


In [14]:
# Split every cleaned tweet on whitespace into a list of tokens.
tokenized_tweet = combi['tidy_tweet'].map(str.split)
tokenized_tweet.head()

0    [#fingerprint, #Pregnancy, Test, https, MfQV, ...
1    [Finally, transparant, silicon, case, Thanks, ...
2    [love, this, Would, #talk, #makememories, #unp...
3    [wired, know, George, made, that, #iphone, #cu...
4    [What, amazing, service, Apple, even, talk, ab...
Name: tidy_tweet, dtype: object

In [15]:
# Stem words, i.e. strip suffixes / ending letters (e.g. "amazing" -> "amaz").
# Only the class that is actually used is imported; a wildcard import hides
# where names come from and pollutes the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Apply the stemmer to every token of every tweet.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])
tokenized_tweet.head()

0    [#fingerprint, #pregnanc, test, http, mfqv, #a...
1    [final, transpar, silicon, case, thank, uncl, ...
2    [love, thi, would, #talk, #makememori, #unplug...
3    [wire, know, georg, made, that, #iphon, #cute,...
4    [what, amaz, servic, appl, even, talk, about, ...
Name: tidy_tweet, dtype: object

In [0]:
# Join each list of stemmed tokens back into one space-separated string.
# A single vectorized apply replaces the per-row `tokenized_tweet[i] = ...`
# loop, which was slow and only correct when the index is exactly 0..n-1.
tokenized_tweet = tokenized_tweet.apply(' '.join)

combi['tidy_tweet'] = tokenized_tweet

In [178]:
from keras.preprocessing.text import Tokenizer

# The maximum number of words to be used (most frequent); also the input
# dimension of the Embedding layer built later.
MAX_NB_WORDS = 1000
# Max number of words in each tweet (sequences are padded/truncated to this length).
MAX_SEQUENCE_LENGTH = 15
# Size of the embedding vector learned for each word.
EMBEDDING_DIM = 50
# Characters in `filters` are stripped from the text before tokenizing. The
# backslash is written as "\\" because "\]" is an invalid escape sequence in a
# normal string literal; the resulting filter string is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
# Build the vocabulary from the cleaned tweets of both train and test.
tokenizer.fit_on_texts(combi['tidy_tweet'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 20449 unique tokens.


In [0]:
from keras.preprocessing.sequence import pad_sequences

# Encode every cleaned tweet as a list of word indices, then pad/truncate each
# list to MAX_SEQUENCE_LENGTH so the result is a rectangular array.
tweet_sequences = tokenizer.texts_to_sequences(combi['tidy_tweet'].values)
combi_X = pad_sequences(tweet_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
# Split the preprocessed frame back into its train and test parts. The split
# point is derived from the training frame instead of the hard-coded row count
# 7920, so the cell keeps working if the input data changes. Re-running is
# safe: the new `train` has the same length as the old one.
n_train = len(train)
train = combi.iloc[:n_train]
test = combi.iloc[n_train:]

In [0]:
# Split the padded sequences the same way as the frames: the first
# len(train) rows belong to the training set, the rest to the test set.
# Using len(train) avoids repeating the hard-coded row count 7920.
n_train = len(train)
X_train = combi_X[:n_train]
# Training labels as a NumPy array.
Y = train['label'].values
X_test = combi_X[n_train:]

In [182]:
# Number of padded sequences (train + test rows).
print(len(combi_X))

9873


In [183]:
# Shape of the padded test sequences.
print('Shape of data tensor:', X_test.shape)

Shape of data tensor: (1953, 15)


In [184]:
# Shape of the padded training sequences.
print('Shape of data tensor:', X_train.shape)

Shape of data tensor: (7920, 15)


In [185]:
# Show the first five encoded training tweets.
print(X_train[0:5])

[[  0   0   0   0   0   1  21  34  30  26 109  81 104  96   2]
 [  0   0   0   0   0   0   0  57  24  39 439   9 237   1   4]
 [  0   0   0   0   0  15  11  76 477 378   2  92 331 543   1]
 [  0   0   0   0   0   0   0 119 211  20   2  26  87   1  10]
 [  0   0  59  58 253   3 115 477  80 798  17 157 212 192 275]]


In [186]:
from keras.models import Sequential
from keras.models import load_model
from keras.layers import LSTM, Dense
from keras.layers import Embedding
from keras.layers import SpatialDropout1D
from keras.layers import Bidirectional, Dropout
import tensorflow as tf
from keras.callbacks import EarlyStopping

# Architecture: embedding -> spatial dropout -> two stacked bidirectional
# LSTMs -> dense hidden layer with dropout -> single sigmoid output unit.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]),
    SpatialDropout1D(0.4),
    # First recurrent layer returns the full sequence so it can feed the second one.
    Bidirectional(LSTM(100, dropout=0.4, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(100, dropout=0.4, recurrent_dropout=0.2)),
    Dense(50, activation='sigmoid'),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

epochs = 15
batch_size = 64

# Stop training when val_loss has not improved by at least 0.0001 for 3 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

# 10% of the training data is held out for validation.
history = model.fit(
    X_train,
    Y,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 15, 50)            50000     
_________________________________________________________________
spatial_dropout1d_17 (Spatia (None, 15, 50)            0         
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 15, 200)           120800    
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 200)               240800    
_________________________________________________________________
dense_17 (Dense)             (None, 50)                10050     
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)               

In [0]:
# Threshold the model's sigmoid outputs at 0.5 to obtain hard 0/1 labels.
# The builtin `int` replaces `np.int`, a deprecated alias for the same type
# (deprecated in NumPy 1.20, removed in NumPy 1.24).
predictions = (model.predict(X_test) > 0.5).astype(int)

In [195]:
# Inspect the first five predicted labels.
predictions[0:5]

array([[1],
       [1],
       [1],
       [1],
       [1]])

In [0]:
# Start the submission table from the ids of the test rows.
submissions_df = test['id'].to_frame()

In [197]:
# Sanity check: the submission frame and the predictions should have the same number of rows.
print( submissions_df.shape)
print(len(predictions))

(1953, 1)
1953


In [0]:
# Attach the predicted labels as the 'label' column (one prediction per test id).
submissions_df['label'] = predictions

In [199]:
# Preview the first 100 rows of the submission table.
submissions_df[0:100]

Unnamed: 0,id,label
7920,7921,1
7921,7922,1
7922,7923,1
7923,7924,1
7924,7925,1
7925,7926,0
7926,7927,1
7927,7928,0
7928,7929,1
7929,7930,0


In [0]:
# Write the id/label submission table to disk without the DataFrame index.
# NOTE(review): the file name reads like a logistic-regression / bag-of-words
# submission, while these predictions come from the LSTM model above —
# consider renaming; confirm with the author.
submissions_df.to_csv('sub_lreg_bow.csv', index=False) # writing data to a CSV file