In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Neural net with pretrained word embeddings

This notebook follows this tutorial:
https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/

Notes : 
- We used Pretrained Word Embeddings from https://nlp.stanford.edu/projects/glove/ (glove.6b.zip), not customed ones.
- Do not run the entire notebook at once. Run all the cells below til the first model cell and then run the cells of the model you want to use

In [0]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D,MaxPooling1D,GRU
from keras.layers import Conv1D
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

from numpy import asarray
from numpy import zeros

import matplotlib.pyplot as plt

from keras.callbacks import ModelCheckpoint

#from utils import create_csv_submission

## Loading the dataset

In [0]:
#Run this cell to work with the full dataset


df_full = pd.read_csv("/content/drive/My Drive/EPFL/Machine Learning/MA1/tweets_full.csv")
df_tweets_full = df_full[['tweet','label']]
#shuffle the dataset to mix the labels
df_tweets_full = df_tweets_full.sample(frac=1, random_state=1).reset_index(drop=True)

In [0]:
#Run this cell to work with the small dataset

# df_tweets_small = pd.read_pickle("./data/tweets.pkl")
# del df_tweets_small['tweet_len']
#shuffle the dataset to mix the labels
# df_tweets_small = df_tweets_small.sample(frac=1, random_state=1).reset_index(drop=True)


## Convert tweets to vectors

In [6]:
#to work with the full dataset
X_train, X_test, y_train, y_test = train_test_split(df_tweets_full['tweet'], df_tweets_full['label'], test_size=0.05, random_state=42)

#to work with the small dataset
#X_train, X_test, y_train, y_test = train_test_split(df_tweets_small['tweet'], df_tweets_small['label'], test_size=0.20, random_state=42)


#create word dictionary
#it will keep only the top num_words words
tokenizer = Tokenizer(num_words=120000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)#convert each word to a integer based on the tokenizer
X_test = tokenizer.texts_to_sequences(X_test)
X_train

[[527,
  2316,
  2316,
  2303,
  259,
  1163,
  193,
  1534,
  527,
  2316,
  2316,
  193,
  295,
  27,
  10746,
  5,
  83290,
  2,
  10],
 [1, 79, 173, 156, 41, 16, 378, 328, 898, 31, 329, 21, 90, 4, 185],
 [12,
  22,
  1072,
  11,
  4846,
  56,
  8,
  242,
  192,
  71,
  95,
  8,
  368,
  74,
  117,
  11,
  81,
  31,
  14,
  205,
  329,
  72,
  35,
  870,
  8,
  101,
  2],
 [1,
  397,
  40,
  261,
  265,
  7,
  16,
  40,
  2145,
  4,
  19708,
  20737,
  4648,
  67,
  64,
  16,
  7273,
  34,
  40,
  11,
  31,
  14,
  1812,
  16,
  161],
 [1, 3, 62, 12, 41, 23, 33, 331, 6, 9676, 30, 12, 134, 248, 164, 4],
 [73,
  13,
  546,
  20,
  77,
  2,
  86,
  7,
  52,
  7,
  6,
  855,
  41,
  3,
  460,
  7,
  2,
  5252,
  32,
  400,
  334],
 [6,
  70612,
  22,
  5851,
  290,
  9,
  399,
  7,
  18525,
  5,
  32827,
  5,
  11,
  5051,
  11309,
  1260,
  2889,
  290,
  884,
  70613,
  2,
  10],
 [2033,
  8446,
  1221,
  504,
  22,
  100,
  5090,
  1233,
  839,
  372,
  1439,
  921,
  11,
  59,
  7,


In [7]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)

maxlen = 140

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen) #makes sure all tweets have 140 words (padding)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
X_train

vocab_size: 404437


array([[  527,  2316,  2316, ...,     0,     0,     0],
       [    1,    79,   173, ...,     0,     0,     0],
       [   12,    22,  1072, ...,     0,     0,     0],
       ...,
       [   15,    20, 17400, ...,     0,     0,     0],
       [    1,  7920,     5, ...,     0,     0,     0],
       [    1,  1654,    88, ...,     0,     0,     0]], dtype=int32)

In [0]:
embeddings_dictionary = dict()
glove_file = open('/content/drive/My Drive/EPFL/Machine Learning/MA1/w2v_full_w200_min4.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [0]:
embedding_matrix = zeros((vocab_size, 200))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## First model : simple neural network

In [0]:
#first model : simple neural network
model = Sequential()
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False) #trainable set to False bc we use the downloaded dict
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

In [0]:
history = model.fit(X_train, y_train, batch_size=32, epochs=6, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=2)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

## 2nd model : convolutional neural network

In [0]:
#model 2 : convolutional neural network
model_2 = Sequential()

embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model_2.add(embedding_layer)

model_2.add(Conv1D(32, 5, activation='relu'))
model_2.add(GlobalMaxPooling1D())
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model_2.summary())

In [0]:
history = model_2.fit(X_train, y_train, batch_size=32, epochs=6, verbose=1, validation_split=0.2)

score_2 = model_2.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score_2[0])
print("Test Accuracy:", score_2[1])

## 3rd model : recurrent neural network

In [0]:
#3rd model : recurrent neural net
model_3 = Sequential()
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model_3.add(embedding_layer)
model_3.add(LSTM(128))

model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model_3.summary())

In [0]:
history = model_3.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

score_3 = model_3.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_3[0])
print("Test Accuracy:", score_3[1])

## Fourth model : aucune idée du nom du modèle, pris de Sami

In [10]:
# I just changed 100'000 -> 120'000 and batch size to 1000 and validation split to 0.05 and 100 to 140

#4th model
model_4 = Sequential()
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False) #trainable set to False bc we use the downloaded dict
model_4.add(embedding_layer)
model_4.add(Conv1D(64, kernel_size = 3, padding='same', activation='relu'))
model_4.add(MaxPooling1D(pool_size = 2))
model_4.add(Dropout(0.25))
model_4.add(GRU(128, return_sequences=True))
model_4.add(Dropout(0.3))
model_4.add(Flatten())
model_4.add(Dense(128, activation='relu'))
model_4.add(Dropout(0.5))
model_4.add(Dense(1,activation='sigmoid'))
model_4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model_4.summary())












Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 200)          80887400  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 140, 64)           38464     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 70, 64)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 70, 64)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 70, 128)           74112     
______________________________

In [15]:
history = model_4.fit(X_train, y_train, batch_size=1000, callbacks=[
                         ModelCheckpoint(
                             filepath='/content/drive/My Drive/EPFL/Machine Learning/MA1/gru_best_weights.hdf5',
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             mode='max')], epochs=50, verbose=2, validation_split=0.05)

score_4 = model_4.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_4[0])
print("Test Accuracy:", score_4[1])

Train on 2049109 samples, validate on 107848 samples
Epoch 1/50
 - 206s - loss: 0.3687 - acc: 0.8323 - val_loss: 0.3530 - val_acc: 0.8402

Epoch 00001: val_acc improved from -inf to 0.84017, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/gru_best_weights.hdf5
Epoch 2/50
 - 207s - loss: 0.3584 - acc: 0.8380 - val_loss: 0.3456 - val_acc: 0.8438

Epoch 00002: val_acc improved from 0.84017 to 0.84384, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/gru_best_weights.hdf5
Epoch 3/50
 - 208s - loss: 0.3531 - acc: 0.8409 - val_loss: 0.3435 - val_acc: 0.8461

Epoch 00003: val_acc improved from 0.84384 to 0.84614, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/gru_best_weights.hdf5
Epoch 4/50
 - 208s - loss: 0.3498 - acc: 0.8428 - val_loss: 0.3445 - val_acc: 0.8468

Epoch 00004: val_acc improved from 0.84614 to 0.84679, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/gru_best_weights.hdf5
Epoch 5/50
 - 206s - loss: 0.3471 

## Fifth model : CNN de abdulfatir
https://github.com/abdulfatir/twitter-sentiment-analysis/blob/master/code/cnn.py

In [18]:
#5th model : convolutional neural network
filters = 600
kernel_size = 3

model_5 = Sequential()
model_5.add(Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen))
model_5.add(Dropout(0.4))
model_5.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model_5.add(Conv1D(300, kernel_size, padding='valid', activation='relu', strides=1))
model_5.add(Conv1D(150, kernel_size, padding='valid', activation='relu', strides=1))
model_5.add(Conv1D(75, kernel_size, padding='valid', activation='relu', strides=1))
model_5.add(Flatten())
model_5.add(Dense(600))
model_5.add(Dropout(0.5))
model_5.add(Activation('relu'))
model_5.add(Dense(1))
model_5.add(Activation('sigmoid'))
model_5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


print(model_5.summary())



Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 140, 200)          80887400  
_________________________________________________________________
dropout_7 (Dropout)          (None, 140, 200)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 138, 600)          360600    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 136, 300)          540300    
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 134, 150)          135150    
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 132, 75)           33825     
_________________________________________________________________
flatten_3 (Flatten)          (None, 9900)             

In [22]:
history = model_5.fit(X_train, y_train, batch_size=1000, callbacks=[
                         ModelCheckpoint(
                             filepath='/content/drive/My Drive/EPFL/Machine Learning/MA1/CNN_best_weights.hdf5',
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             mode='max')], epochs=6, verbose=1, validation_split=0.05)

score_5 = model_5.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_5[0])
print("Test Accuracy:", score_5[1])

Train on 2049109 samples, validate on 107848 samples
Epoch 1/6

Epoch 00001: val_acc improved from -inf to 0.84653, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/CNN_best_weights.hdf5
Epoch 2/6

Epoch 00002: val_acc improved from 0.84653 to 0.85378, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/CNN_best_weights.hdf5
Epoch 3/6

Epoch 00003: val_acc improved from 0.85378 to 0.85884, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/CNN_best_weights.hdf5
Epoch 4/6

Epoch 00004: val_acc did not improve from 0.85884
Epoch 5/6

Epoch 00005: val_acc improved from 0.85884 to 0.86055, saving model to /content/drive/My Drive/EPFL/Machine Learning/MA1/CNN_best_weights.hdf5
Epoch 6/6

Epoch 00006: val_acc did not improve from 0.86055
Test Score: 0.3261682080628588
Test Accuracy: 0.8572825368861484


In [0]:
model_5.save('/content/drive/My Drive/EPFL/Machine Learning/MA1/cnn_0_862.h5')

## Computing predictions

It computes the predictions (on the preprocessed dataset with lemmatization from the preprocessing notebook) of the last model that was ran from the 3 models above.

In [0]:
to_predict = pd.read_pickle("/content/drive/My Drive/EPFL/Machine Learning/MA1/preprocessed_test.pkl")
to_predict.index += 1

to_predict = to_predict['tweet']
to_predict = to_predict.astype(str)

to_predict= tokenizer.texts_to_sequences(to_predict)

to_predict = pad_sequences(to_predict, padding='post', maxlen=maxlen)

result_test = model_5.predict(to_predict)

#it returns values between [0,1] (since sigmoid is used) 
result_test[result_test < 0.5] = -1 #replace values < 0.5 to -1
result_test[result_test >= 0.5] = 1


In [24]:
result_test

array([[-1.],
       [-1.],
       [ 1.],
       ...,
       [-1.],
       [ 1.],
       [-1.]], dtype=float32)

In [0]:
import csv

def create_csv_submission(y_pred, path):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               path (string name of .csv output file to be created)
    """
    ids=[i for i in range(1,len(y_pred)+1)]
    with open(path, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})


In [0]:
create_csv_submission(result_test,"/content/drive/My Drive/EPFL/Machine Learning/MA1/cnn_submission_6_epochs.csv")