# Neural net with pretrained word embeddings

This notebook follows this tutorial:
https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/

Notes : 
- We used Pretrained Word Embeddings from https://nlp.stanford.edu/projects/glove/ (glove.6b.zip), not customed ones.
- Do not run the entire notebook at once. Run all the cells below til the first model cell and then run the cells of the model you want to use

In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D,MaxPooling1D,GRU
from keras.layers import Conv1D
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

from numpy import asarray
from numpy import zeros

import matplotlib.pyplot as plt

from utils import create_csv_submission

## Loading the dataset

In [11]:
#Run this cell to work with the full dataset


df_full = pd.read_csv("./data/tweets_full.csv")
df_tweets_full = df_full[['tweet','label']]
#shuffle the dataset to mix the labels
df_tweets_full = df_tweets_full.sample(frac=1, random_state=1).reset_index(drop=True)

In [None]:
#Run this cell to work with the small dataset

df_tweets_small = pd.read_pickle("./data/tweets.pkl")
del df_tweets_small['tweet_len']
df_tweets_small = df_tweets_small.sample(frac=1, random_state=1).reset_index(drop=True)


## Convert tweets to vectors

In [12]:
#to work with the full dataset
X_train, X_test, y_train, y_test = train_test_split(df_tweets_full['tweet'], df_tweets_full['label'], test_size=0.05, random_state=42)

#to work with the small dataset
#X_train, X_test, y_train, y_test = train_test_split(df_tweets_small['tweet'], df_tweets_small['label'], test_size=0.20, random_state=42)


#create word dictionary
#it will keep only the top num_words words
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)#convert each word to a integer based on the tokenizer
X_test = tokenizer.texts_to_sequences(X_test)
X_train

[[1, 62, 16, 8, 194, 16, 43, 151, 117, 40, 171, 16, 4, 8, 16, 4, 158],
 [13, 39, 61, 50, 7, 30, 2, 3, 108, 25, 2, 3, 160, 198, 2],
 [58, 6, 17, 6, 10, 5, 98, 10, 2, 19, 5, 6, 5, 10, 5, 11, 10, 2],
 [1, 31, 40, 9, 148, 18],
 [59, 189, 128, 54, 182, 54, 2, 178, 156, 54, 23, 54, 10, 27, 2, 12],
 [1, 3, 14, 4, 21, 10, 48, 20, 9],
 [1, 1, 21, 10, 35, 4],
 [1, 20, 6, 20, 2, 16],
 [4, 39, 24, 147, 115, 28, 5, 1, 193, 24, 12],
 [1, 180, 7, 47, 3, 80, 8],
 [1, 92, 5, 3, 118, 8, 73, 31, 152, 11, 7, 6, 169, 184],
 [14, 57, 6, 127],
 [1, 75, 10, 106, 49, 5, 62, 16, 5],
 [3, 84, 2, 3, 136, 125, 5, 32, 3, 47, 64, 10, 43, 16],
 [1, 50, 36, 15, 159, 19, 176, 166, 4],
 [1, 9, 162, 5, 11, 3, 35, 36, 15, 44, 6, 57, 14, 11, 8, 18],
 [5, 12],
 [20, 6, 20, 9, 62, 40, 2, 12],
 [1, 5, 41, 77, 31, 4, 3, 13, 2, 93, 13],
 [66, 2, 2, 132, 8],
 [185, 15, 21, 21, 45, 175, 113, 20, 6],
 [14, 99, 192, 9],
 [3, 84, 20, 38, 3, 84, 14, 2],
 [4, 23, 166, 2, 50, 37, 1],
 [70, 5, 56, 2, 131, 19, 9],
 [1, 58, 75, 10, 3, 36,

In [5]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen) #makes sure all tweets have 100 words (padding)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
X_train

vocab_size: 360744


array([[11,  0,  0, ...,  0,  0,  0],
       [17, 19, 12, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0],
       ...,
       [20,  2, 20, ...,  0,  0,  0],
       [ 1,  5, 16, ...,  0,  0,  0],
       [ 1, 85, 15, ...,  0,  0,  0]])

In [6]:
embeddings_dictionary = dict()
glove_file = open('w2v_full_w200_min4.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [7]:
embedding_matrix = zeros((vocab_size, 200))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## First model : simple neural network

In [None]:
#first model : simple neural network
model = Sequential()
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False) #trainable set to False bc we use the downloaded dict
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

In [None]:
history = model.fit(X_train, y_train, batch_size=32, epochs=6, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=2)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

## 2nd model : convolutional neural network

In [None]:
#model 2 : convolutional neural network
model_2 = Sequential()

embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model_2.add(embedding_layer)

model_2.add(Conv1D(32, 5, activation='relu'))
model_2.add(GlobalMaxPooling1D())
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model_2.summary())

In [None]:
history = model_2.fit(X_train, y_train, batch_size=32, epochs=6, verbose=1, validation_split=0.2)

score_2 = model_2.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score_2[0])
print("Test Accuracy:", score_2[1])

## 3rd model : recurrent neural network

In [None]:
#3rd model : recurrent neural net
model_3 = Sequential()
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model_3.add(embedding_layer)
model_3.add(LSTM(128))

model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model_3.summary())

In [None]:
history = model_3.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

score_3 = model_3.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_3[0])
print("Test Accuracy:", score_3[1])

## Fourth model : aucune idée du nom du modèle, pris de Sami

In [8]:
#4th model
model_4 = Sequential()
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False) #trainable set to False bc we use the downloaded dict
model_4.add(embedding_layer)
model_4.add(Conv1D(64, kernel_size = 3, padding='same', activation='relu'))
model_4.add(MaxPooling1D(pool_size = 2))
model_4.add(Dropout(0.25))
model_4.add(GRU(128, return_sequences=True))
model_4.add(Dropout(0.3))
model_4.add(Flatten())
model_4.add(Dense(128, activation='relu'))
model_4.add(Dropout(0.5))
model_4.add(Dense(1,activation='sigmoid'))
model_4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model_4.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          72148800  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           38464     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 64)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 64)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 50, 128)           74112     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6400)             

In [9]:
history = model_4.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

score_4 = model_4.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_4[0])
print("Test Accuracy:", score_4[1])

Train on 1453108 samples, validate on 363277 samples
Epoch 1/6
  30976/1453108 [..............................] - ETA: 57:45 - loss: 0.5614 - acc: 0.6906

KeyboardInterrupt: 

## Computing predictions

It computes the predictions (on the preprocessed dataset with lemmatization from the preprocessing notebook) of the last model that was ran from the 3 models above.

In [None]:
to_predict = pd.read_pickle("/content/drive/My Drive/ml/preprocessed_test.pkl")
to_predict.index += 1

to_predict = to_predict['tweet']
to_predict = to_predict.astype(str)

to_predict= tokenizer.texts_to_sequences(to_predict)

to_predict = pad_sequences(to_predict, padding='post', maxlen=maxlen)

result_test = model_4.predict(to_predict)

#it returns values between [0,1] (since sigmoid is used) 
result_test[result_test < 0.5] = -1 #replace values < 0.5 to -1
result_test[result_test >= 0.5] = 1


In [None]:
#create_csv_submission(result_test,"xxx.csv")