# Sentiment Analysis using DL techniques
In this notebook we try to classify IMDB movie reviews as happy or not using DL techniques. In more detail,
we train our own word2vec model on the reviews and use it to transform words into word embeddings. Then we train and evaluate an RNN
model. In the end, we use it to predict the sentiment of actual IMDB reviews, achieving **accuracy up to 0.8713**. 


In [None]:
import pandas as pd
import numpy as np
import time


# Local (non-Colab) locations of the input data and of the generated artefacts.
data_dir = "files/data"

train_path = data_dir + "/train.csv"
test_path = data_dir + "/test_without_labels.csv"
embeddings_path = data_dir + "/embeddings.txt"
predicitions_path = data_dir + "/predictions_dl.csv"

# Dimensionality of the word vectors learned by word2vec.
embedding_dim = 100


## Colab Configurations

In [None]:
import nltk

# NLTK resources used by the text-cleaning step; required on every runtime,
# not only on Colab.
nltk.download('stopwords')
nltk.download('punkt')

# Detect Colab instead of assuming it, so that running this cell on a local
# kernel neither raises ImportError nor clobbers the local paths set earlier.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Mount Google Drive first, then point the data paths at it.
    drive.mount('/content/drive')

    train_path = '/content/drive/My Drive/Colab Notebooks/BDA/semantic_analysis/train.csv'
    test_path = '/content/drive/My Drive/Colab Notebooks/BDA/semantic_analysis/test_without_labels.csv'
    predicitions_path = '/content/drive/My Drive/Colab Notebooks/BDA/semantic_analysis/predictions_dl.csv'


## Loading Training Set

In [None]:
# Read the labelled training set and the unlabelled test set.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Review text is the model input; the training labels are the target.
X, y = train['Content'], train['Label']
X_test = test['Content']

## Cleaning Text
In this method we tokenize the reviews and strip punctuation and non-alphabetic tokens from them,
but we don't remove stopwords, as they are sometimes considered useful in sentiment analysis.

In [None]:
import string
from nltk.tokenize import word_tokenize

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)


def clean_text(reviews):
    """Tokenize reviews into lists of lowercase, purely alphabetic words.

    Each review is split with ``word_tokenize``; every token is lowercased,
    stripped of punctuation, and kept only if what remains is alphabetic.
    Stopwords are intentionally kept.

    Args:
        reviews: iterable of review strings (e.g. a pandas Series or a list).

    Returns:
        A list with one entry per review, each entry being the list of
        cleaned tokens of that review, in their original order.
    """
    clean_reviews = []

    for line in reviews:
        tokens = []
        for token in word_tokenize(line):
            stripped = token.lower().translate(punctuation_table)
            # Test the stripped form, not the raw token: otherwise the
            # punctuation removal is a no-op and tokens such as "movie," or
            # "well-known" are silently discarded instead of cleaned.
            if stripped.isalpha():
                tokens.append(stripped)
        clean_reviews.append(tokens)

    return clean_reviews

## Preprocess and Embeddings Configurations

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import  pad_sequences

# Tokenize and clean the raw training reviews into lists of words.
review_lines = clean_text(X)

# Learn the word -> integer id vocabulary from the cleaned training reviews
# and encode every review as a sequence of ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_lines)
review_seq = tokenizer.texts_to_sequences(review_lines)

# Pad to the longest *cleaned* sequence. Measuring the raw, whitespace-split
# text instead does not match what is actually fed to pad_sequences: it can
# over-pad every review or truncate the longest ones.
max_length = max(len(seq) for seq in review_seq)
review_pad = pad_sequences(review_seq, maxlen=max_length)

word_index = tokenizer.word_index
# +1 because the ids in word_index start at 1; row 0 is left for padding.
vocab_size = len(word_index) + 1
print("Unique Tokens " + str(vocab_size))

## Creating our own Word2Vec model based on our data.

In [None]:
import gensim

# Hyper-parameters of the word2vec model trained on the cleaned reviews.
word2vec_params = dict(size=embedding_dim, window=5, workers=12, min_count=2)
gensim_model = gensim.models.Word2Vec(review_lines, **word2vec_params)

# Words that were kept in the word2vec vocabulary.
words = list(gensim_model.wv.vocab)
print(f"Vocabulary size: {len(words)}")


## Constructing Embedding Matrix
The embedding matrix is used by the embedding layer to map each input word id to its word vector.

In [None]:
# One row per tokenizer id; rows stay all-zero for words that word2vec
# did not keep in its vocabulary.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    # Valid row indices are 0 .. vocab_size - 1, so the guard must be strict
    # (`<`); `<=` would admit an out-of-bounds row.
    if i < vocab_size and word in gensim_model.wv:
        embeddings_matrix[i] = gensim_model.wv[word]


## RNN
Our model architecture consists of:
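- An Embedding layer (not trainable), initialised with our word2vec vectors
- A GRU layer with 16 units, with dropout and recurrent dropout rates set to 0.2
- A hidden dense layer with 16 units
- A single-unit sigmoid output layer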

In [None]:
from keras.models import Sequential
from keras.layers import GRU, LSTM, Dense
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

def create_model(summary=False):
    """Build and compile a fresh GRU sentiment classifier.

    The network is: a frozen Embedding layer initialised from
    ``embeddings_matrix``, a 16-unit GRU with dropout and recurrent dropout
    of 0.2, a 16-unit ReLU dense layer and a single sigmoid output unit.
    It is compiled with binary cross-entropy, the Adam optimizer and the
    accuracy metric.

    Args:
        summary: when truthy, print the Keras model summary.

    Returns:
        The compiled, untrained model.
    """
    embedding_layer = Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=Constant(embeddings_matrix),
        input_length=max_length,
        trainable=False,
    )

    model = Sequential()
    model.add(embedding_layer)
    model.add(GRU(units=16, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    if summary:
        model.summary()
    return model


## Model Evaluation
Perform evaluation using stratified k-fold cross-validation. In each iteration the model is constructed
from scratch.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

starting_tm = time.time()
precision = 0
recall = 0
f1 = 0
accuracy = 0

k = 5
epochs = 5
# Probability above which a review is labelled 1.
threshold = 0.45
skf = StratifiedKFold(n_splits=k)
for train_index, val_index in skf.split(review_pad, y):

    # Use fold-local names: rebinding `X_test` here would overwrite the real
    # test reviews loaded earlier and break the prediction cells further down.
    X_train, X_val = review_pad[train_index], review_pad[val_index]
    # The split yields positions, so index the label Series positionally.
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # A new model per fold so no weights leak between folds.
    model = create_model()
    model.fit(X_train, y_train, epochs=epochs, batch_size=256)
    predictions = [1 if p[0] > threshold else 0 for p in model.predict(X_val)]

    # Macro averaging: with 'micro' on a single-label binary task precision,
    # recall and F1 are all identical to accuracy and carry no information.
    precision += metrics.precision_score(y_val, predictions, average='macro')
    recall += metrics.recall_score(y_val, predictions, average='macro')
    f1 += metrics.f1_score(y_val, predictions, average='macro')
    accuracy += metrics.accuracy_score(y_val, predictions)

    print()

# compute the average of each value over the k folds
precision_score = precision/k
recall_score = recall/k
f1_score = f1/k
accuracy_score = accuracy/k

print("Precision: " + str(precision_score)
      + "\nRecall: " + str(recall_score)
      + "\nF1-Measure: " + str(f1_score) 
      + "\nAccuracy: " + str(accuracy_score)
      + "\nExecution time: " + str(time.time() - starting_tm))



## Prediction
Pre-process the test set loaded earlier, train the model on the whole training set and predict the test labels. In the end, store the results as CSV.

In [None]:
# Read the reviews straight from the `test` frame: the cross-validation cell
# rebinds `X_test` to a padded integer fold, which clean_text cannot process.
test_review_lines = clean_text(test['Content'])

# Encode with the tokenizer fitted on the training reviews and pad to the
# same length the model was built for.
test_review_seq = tokenizer.texts_to_sequences(test_review_lines)
test_review_pad = pad_sequences(test_review_seq, maxlen=max_length)

In [None]:
# Train the final model on the whole training set, printing its summary first.
model = create_model(summary=True)
model.fit(review_pad, y, epochs=10, batch_size=256)

# Each prediction row holds one sigmoid probability; label it 1 above 0.45.
probabilities = model.predict(test_review_pad)
predictions = [int(row[0] > 0.45) for row in probabilities]

In [None]:

# Pair every test review id with its predicted label and write the result as CSV.
submission_columns = {'Id': list(test['Id']), 'Predicted': predictions}
predictions_df = pd.DataFrame(data=submission_columns)
predictions_df.to_csv(predicitions_path, index=False)
