<h3>Import Libraries and Data

In [9]:
# Mount Drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Import libraries

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [11]:
# Read Data

path = '/content/drive/MyDrive/Sentiment analysis/Natural-Language-Processing-main/Sentimental analysis Project/Data/Amazon_Customer_Reviews.csv'
data = pd.read_csv(path)
data.drop('Unnamed: 0',axis=1,inplace=True)
data.head()


Unnamed: 0,Ratings,Reviews
0,Good,"I use Amazon.com often, and 99% of the time, ..."
1,Good,With Amazon you can leisurely shop in the com...
2,Good,I am very happy with all my purchases since I...
3,Good,"I absolutely love Amazon. Their selections, pr..."
4,Good,We have shopped Amazon for years and always fi...


<h3>Pre-Processing

In [12]:
#Data cleaning and preprocessing
import re
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()

# Function for preprocessing
def preprocessing(data):    
    
    reviews =  []
    for i in range(0, len(data)):
        # Replacing values in rows
        review = data['Reviews'][i]
        review = review.replace('$','dollars')
        review = review.lower()
        reviews.append(review)
    
    return reviews
        
reviews_1 = preprocessing(data)

In [None]:
# Create Dataframe

df = pd.DataFrame(reviews_1)

In [14]:
# Rename Column

df.columns=['reviews']

In [15]:
df.head()

Unnamed: 0,reviews
0,"i use amazon.com often, and 99% of the time, ..."
1,with amazon you can leisurely shop in the com...
2,i am very happy with all my purchases since i...
3,"i absolutely love amazon. their selections, pr..."
4,we have shopped amazon for years and always fi...


<h3> Word to vec Representation

In [16]:
# Import pretrained fasttext word to vec representation

path_vec = '/content/drive/MyDrive/Sentiment analysis/Natural-Language-Processing-main/Sentimental analysis Project/wiki-news-300d-1M.vec'

In [17]:
# load the pre-trained word-embedding vectors

embeddings_index = {}
for i, line in enumerate(open(path_vec)):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [18]:
embeddings_index

Output hidden; open in https://colab.research.google.com to view.

In [19]:
#dummyfying output variable
y=pd.get_dummies(data['Ratings'],drop_first=True)

<h3> Train Test Split

In [None]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['reviews'], y)

<h3> Tokenizing Train Data


In [21]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(train_x)
word_index = token.word_index

In [22]:
# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.3116    ,  0.0856    , -0.0069    , ...,  0.0877    ,
         0.1019    ,  0.0097    ],
       [ 0.0897    ,  0.016     , -0.0571    , ...,  0.1559    ,
        -0.0254    , -0.0259    ],
       ...,
       [ 0.1016    ,  0.2156    ,  0.26460001, ...,  0.109     ,
        -0.0412    ,  0.1864    ],
       [ 0.0319    ,  0.0129    ,  0.0671    , ...,  0.17030001,
         0.0359    ,  0.0689    ],
       [-0.0641    , -0.18279999, -0.1127    , ...,  0.0668    ,
         0.0088    , -0.0329    ]])

<h3 >Tokenizing Test Data

In [24]:
# create a tokenizer 
token2 = text.Tokenizer()
token2.fit_on_texts(valid_x)
word_index2 = token2.word_index

In [25]:
# create token-embedding mapping
embedding_matrix2 = numpy.zeros((len(word_index2) + 1, 300))
for word, i in word_index2.items():
    embedding_vector2 = embeddings_index.get(word)
    if embedding_vector2 is not None:
        embedding_matrix2[i] = embedding_vector2

In [26]:
embedding_matrix2

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.3116    ,  0.0856    , -0.0069    , ...,  0.0877    ,
         0.1019    ,  0.0097    ],
       [ 0.0897    ,  0.016     , -0.0571    , ...,  0.1559    ,
        -0.0254    , -0.0259    ],
       ...,
       [ 0.15970001, -0.1119    , -0.0863    , ...,  0.1899    ,
         0.0028    ,  0.2807    ],
       [-0.1044    , -0.022     , -0.146     , ...,  0.0607    ,
         0.0588    , -0.0217    ],
       [-0.13680001, -0.1796    ,  0.18350001, ...,  0.17900001,
        -0.1206    , -0.0409    ]])

<h3> Padding sequences

In [29]:
from tensorflow.keras.utils import pad_sequences

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = pad_sequences(token.texts_to_sequences(train_x), maxlen=165)
valid_seq_x = pad_sequences(token2.texts_to_sequences(valid_x), maxlen=165)

In [31]:
train_seq_x.shape

(4965, 165)

In [32]:
valid_seq_x.shape

(1656, 165)

<h3> Evaluation Metrics

In [45]:
# Calculation of F1, Precision, recall

from keras import backend as K
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


<h3>LSTM

In [46]:
## Creating model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

model=Sequential()
model.add(Embedding(len(word_index) + 1, 300, weights=[embedding_matrix],input_length=165 ,trainable=False))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
print(model.summary())



Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 165, 300)          4711500   
                                                                 
 dropout_4 (Dropout)         (None, 165, 300)          0         
                                                                 
 lstm_4 (LSTM)               (None, 100)               160400    
                                                                 
 dense_4 (Dense)             (None, 1)                 101       
                                                                 
Total params: 4,872,001
Trainable params: 160,501
Non-trainable params: 4,711,500
_________________________________________________________________
None


In [47]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# fit the model
history = model.fit(train_seq_x,train_y,validation_data=(valid_seq_x,valid_y),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:

# evaluate the model
loss, accuracy, f1_score, precision, recall = model.evaluate(valid_seq_x,valid_y, verbose=0)

In [49]:
f1_score

0.49693214893341064

<h3>Bi-Directional</h3>

In [50]:
from tensorflow.keras.layers import Bidirectional
## Creating model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense

model=Sequential()
model.add(Embedding(len(word_index) + 1, 300, weights=[embedding_matrix],input_length=165 ,trainable=False))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1,activation='sigmoid'))
print(model.summary())



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 165, 300)          4711500   
                                                                 
 dropout_5 (Dropout)         (None, 165, 300)          0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 200)              320800    
 nal)                                                            
                                                                 
 dense_5 (Dense)             (None, 1)                 201       
                                                                 
Total params: 5,032,501
Trainable params: 321,001
Non-trainable params: 4,711,500
_________________________________________________________________
None


In [51]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# fit the model
history = model.fit(train_seq_x,train_y,validation_data=(valid_seq_x,valid_y),epochs=10,batch_size=64)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [52]:

# evaluate the model
loss, accuracy, f1_score, precision, recall = model.evaluate(valid_seq_x,valid_y, verbose=0)

In [54]:
f1_score

0.31637561321258545