# CSE 484 Natural Language Processing Course
## A deep learning approach to sentiment analysis combining Word2vec with LSTM

Yusuf Bavaş - 151044018

## Get Data From Google Drive

In [0]:
# Install the third-party packages this notebook imports below.
# NOTE(review): no versions are pinned here, so a re-run may pull newer releases.
!pip install word2vec_keras
!pip install mlflow

# Mount Google Drive (Colab only) so the corpus files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Copy the negative/positive corpus files (RAW and Processed variants) from Drive
# into the current working directory, where prepare_data() opens them by relative name.
!cp "/content/drive/My Drive/Colab Notebooks/Negative_RAW.txt" "Negative_RAW.txt"
!cp "/content/drive/My Drive/Colab Notebooks/Negative_Processed.txt" "Negative_Processed.txt"
!cp "/content/drive/My Drive/Colab Notebooks/Positive_RAW.txt" "Positive_RAW.txt"
!cp "/content/drive/My Drive/Colab Notebooks/Positive_Processed.txt" "Positive_Processed.txt"

## Imports

In [0]:
from sklearn.datasets import fetch_20newsgroups
from word2vec_keras import Word2VecKeras
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn

## Sentiment Analysis

In [0]:
class SentimentAnalysisSystem(object):
    """Train, evaluate and track a Word2VecKeras sentiment classifier.

    The workflow is: ``prepare_data`` (load + split) -> ``train_model`` ->
    ``evaluate``; ``mlFlow`` runs all three inside one MLflow run and logs the
    hyperparameters and accuracy.
    """

    def __init__(self):
        """Create the underlying Word2VecKeras model (untrained)."""
        self.model = Word2VecKeras()

    def split_data(self):
        """Shuffle ``self.x``/``self.y`` and split them 75% train / 25% test.

        Uses a fixed ``random_state`` so repeated calls give the same split.
        Sets ``x_train``, ``x_test``, ``y_train`` and ``y_test``.
        """
        (self.x_train, self.x_test,
         self.y_train, self.y_test) = train_test_split(
            self.x, self.y, test_size=0.25, random_state=42)

    def numpy_to_list(self):
        """Convert the four train/test numpy arrays to plain Python lists."""
        self.x_train = self.x_train.tolist()
        self.y_train = self.y_train.tolist()
        self.x_test = self.x_test.tolist()
        self.y_test = self.y_test.tolist()

    def prepare_data(self, feature):
        """Load the labelled corpus for ``feature`` and split it.

        Reads ``Negative_<feature>.txt`` and ``Positive_<feature>.txt`` from the
        current working directory (UTF-8); every line is one sample and keeps
        its trailing newline, exactly as ``readlines()`` returns it.

        Args:
            feature: File-name suffix selecting the corpus variant
                (the notebook uses ``'RAW'`` and ``'Processed'``).

        Sets ``self.x`` / ``self.y`` (numpy arrays, negatives first) and the
        train/test lists produced by ``split_data`` + ``numpy_to_list``.
        """
        # Context managers guarantee the handles are closed even if reading fails.
        with open('Negative_' + feature + '.txt', 'r', encoding="utf-8") as negative:
            negative_lines = negative.readlines()
        with open('Positive_' + feature + '.txt', 'r', encoding="utf-8") as positive:
            positive_lines = positive.readlines()

        self.x = np.array(negative_lines + positive_lines)
        self.y = np.array(['Negative'] * len(negative_lines)
                          + ['Positive'] * len(positive_lines))

        # Split once: the split is deterministic (fixed random_state), so the
        # second split/convert pass in the earlier version was redundant work.
        self.split_data()
        self.numpy_to_list()

    def train_model(self):
        """Set the hyperparameters on ``self`` and train the model on the train split.

        The hyperparameters are stored as attributes so ``mlFlow`` can log them.
        """
        self.w2v_size = 300
        self.w2v_min_count = 1  # author's note: 5
        self.w2v_epochs = 100
        self.k_epochs = 5  # author's note: 32
        self.k_lstm_neurons = 512
        self.k_max_sequence_len = 1000

        self.model.train(self.x_train, self.y_train,
            w2v_size=self.w2v_size,
            w2v_min_count=self.w2v_min_count,
            w2v_epochs=self.w2v_epochs,
            k_epochs=self.k_epochs,
            k_lstm_neurons=self.k_lstm_neurons,
            k_max_sequence_len=self.k_max_sequence_len,
            k_hidden_layer_neurons=[])

    def evaluate(self):
        """Evaluate the model on the test split and print the confusion matrix.

        Stores the ``"ACCURACY"``, ``"CLASSIFICATION_REPORT"`` (as a DataFrame)
        and ``"CONFUSION_MATRIX"`` entries of the result on ``self``.

        Returns:
            The result mapping returned by ``self.model.evaluate``.
        """
        self.result = self.model.evaluate(self.x_test, self.y_test)
        self.accuracy = self.result["ACCURACY"]
        self.clf_report_df = pd.DataFrame(self.result["CLASSIFICATION_REPORT"])
        self.cnf_matrix = self.result["CONFUSION_MATRIX"]
        print('Confusion Matrix: ', self.cnf_matrix)
        return self.result

    def predict(self):
        """Print the label, text and model prediction of one random test sample."""
        index = random.randrange(len(self.x_test))
        print("LABEL:", self.y_test[index])
        print("TEXT :", self.x_test[index])
        print("PREDICTION:", self.model.predict(self.x_test[index]))

    def mlFlow(self, feature='RAW'):
        """Run prepare -> train -> evaluate inside a single MLflow run.

        Args:
            feature: Corpus variant passed to ``prepare_data``.

        Logs the corpus variant, all training hyperparameters (including
        ``k_epochs``), the accuracy metric and the model artifact.
        """
        np.random.seed(40)
        with mlflow.start_run():
            self.prepare_data(feature=feature)
            self.train_model()
            self.evaluate()
            mlflow.log_param("feature", feature)
            mlflow.log_param("w2v_size", self.w2v_size)
            mlflow.log_param("w2v_min_count", self.w2v_min_count)
            mlflow.log_param("w2v_epochs", self.w2v_epochs)
            # k_epochs was the only hyperparameter not recorded with the run.
            mlflow.log_param("k_epochs", self.k_epochs)
            mlflow.log_param("k_lstm_neurons", self.k_lstm_neurons)
            mlflow.log_param("k_max_sequence_len", self.k_max_sequence_len)
            mlflow.log_metric("accuracy", self.accuracy)
            # NOTE(review): the sklearn flavor is used for a Word2VecKeras
            # object — confirm this is the intended MLflow flavor.
            mlflow.sklearn.log_model(self.model, "Word2Vec-Keras")
        

In [72]:
# Train and evaluate on the 'RAW' corpus files inside one MLflow run.
analysis_raw = SentimentAnalysisSystem()
analysis_raw.mlFlow(feature='RAW')

2020-01-18 13:35:06,174 : INFO : Build & train Word2Vec model
2020-01-18 13:35:06,176 : INFO : collecting all words and their counts
2020-01-18 13:35:06,178 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-18 13:35:06,212 : INFO : collected 24165 word types from a corpus of 158678 raw words and 7991 sentences
2020-01-18 13:35:06,213 : INFO : Loading a fresh vocabulary
2020-01-18 13:35:06,248 : INFO : effective_min_count=1 retains 24165 unique words (100% of original 24165, drops 0)
2020-01-18 13:35:06,249 : INFO : effective_min_count=1 leaves 158678 word corpus (100% of original 158678, drops 0)
2020-01-18 13:35:06,309 : INFO : deleting the raw counts dictionary of 24165 items
2020-01-18 13:35:06,310 : INFO : sample=0.001 downsamples 38 most-common words
2020-01-18 13:35:06,312 : INFO : downsampling leaves estimated 132933 word corpus (83.8% of prior 158678)
2020-01-18 13:35:06,353 : INFO : estimated required memory for 24165 words and 300 dimensions: 

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1000, 300)         7249800   
_________________________________________________________________
lstm_4 (LSTM)                (None, 512)               1665024   
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 1026      
Total params: 8,915,850
Trainable params: 1,666,050
Non-trainable params: 7,249,800
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2020-01-18 13:44:48,576 : INFO : Done


Confusion Matrix:  [[1225  118]
 [ 179 1142]]


In [73]:
# Same pipeline on the 'Processed' corpus files, using a separate model instance.
analysis_processed = SentimentAnalysisSystem()
analysis_processed.mlFlow(feature='Processed')

2020-01-18 13:45:49,191 : INFO : Build & train Word2Vec model
2020-01-18 13:45:49,193 : INFO : collecting all words and their counts
2020-01-18 13:45:49,194 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-18 13:45:49,232 : INFO : collected 23685 word types from a corpus of 158500 raw words and 7991 sentences
2020-01-18 13:45:49,233 : INFO : Loading a fresh vocabulary
2020-01-18 13:45:49,274 : INFO : effective_min_count=1 retains 23685 unique words (100% of original 23685, drops 0)
2020-01-18 13:45:49,275 : INFO : effective_min_count=1 leaves 158500 word corpus (100% of original 158500, drops 0)
2020-01-18 13:45:49,361 : INFO : deleting the raw counts dictionary of 23685 items
2020-01-18 13:45:49,363 : INFO : sample=0.001 downsamples 38 most-common words
2020-01-18 13:45:49,365 : INFO : downsampling leaves estimated 132163 word corpus (83.4% of prior 158500)
2020-01-18 13:45:49,420 : INFO : estimated required memory for 23685 words and 300 dimensions: 

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1000, 300)         7105800   
_________________________________________________________________
lstm_5 (LSTM)                (None, 512)               1665024   
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 1026      
Total params: 8,771,850
Trainable params: 1,666,050
Non-trainable params: 7,105,800
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2020-01-18 13:55:41,962 : INFO : Done


Confusion Matrix:  [[1209  134]
 [ 167 1154]]


## Prediction
Takes a random sentence from the test split of the data and predicts its sentiment.

In [77]:
# Print label, text and prediction for one random test sentence (model trained on RAW files).
analysis_raw.predict()

LABEL: Negative
TEXT :   artik bizler bu tarz filmlerden vazgeçmeliyiz çünkü hep ayni konu üzerinde sürekli düsük bütçede ask mesk dram konulari üzerinde duruyorlar komedi ise vasat lütfen artik bizde iyi filmlere imza atalim ne çikar.... 

PREDICTION: {'label': 'Negative', 'confidence': 0.798618733882904, 'elapsed_time': 0.773949384689331}


In [76]:
# Print label, text and prediction for one random test sentence (model trained on Processed files).
analysis_processed.predict()

LABEL: Negative
TEXT : butun animasyon filmlerinde oldugu gibi bu filmde de ana karakterin sevimli bir yaratik olmasi gerekmiyor bence herkes yorumlarda karakterin korkuncluguna cirkinligine bir seyler yazmis ne sacma ne sacmaa

PREDICTION: {'label': 'Positive', 'confidence': 0.6041070222854614, 'elapsed_time': 0.6753647327423096}
