In [None]:
pip install keras



In [None]:
import pandas as pd
import os
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize,word_tokenize
import numpy as np
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
import re

import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,confusion_matrix
from keras.preprocessing.text import Tokenizer

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout


from keras.layers import GRU, Bidirectional

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping


from keras.layers import SimpleRNN


In [None]:
train_pos = os.listdir('train_pos')
train_neg= os.listdir('train_neg')

In [None]:

def read(folder_name,f_list):

    data = []
    label = []
    count=0
    for file in f_list:
        if count<1000:
            with open(folder_name + '/' + file, 'r',encoding="utf-8") as f:
                data.append(f.read())
                label.append(1 if folder_name == 'train_pos' else 0)
            count+=1
        else:
             break
    return data,label


In [None]:
train_pos=read('train_pos',train_pos)
train_neg=read('train_neg',train_neg)

In [None]:

train_pos_df = pd.DataFrame({'data':train_pos[0],'label':train_pos[1]})
train_neg_df = pd.DataFrame({'data':train_neg[0],'label':train_neg[1]})

In [None]:
train_df=pd.concat([train_pos_df,train_neg_df],ignore_index=True)

In [None]:

train_df = train_df.sample(frac=1).reset_index(drop=True)


In [None]:

def preprocess_text(text):
    text = text.str.replace(r'[\(\[].*?[\)\]]', '', regex=True)# Rem
    text = text.str.replace('[{}]'.format(re.escape(string.punctuation)), '')  # Remove punctuation


    clean_text = []
    ignore = set(stopwords.words('english'))  # Remove stopwords from text

    for i in text:
        words = nltk.word_tokenize(i)
        words = [word for word in words if word not in ignore and len(word) > 1]
        res_text = " ".join(words)
        clean_text.append(res_text)

    return clean_text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)





In [None]:
processed_df = pd.DataFrame(columns=['old_text', 'review'])
processed_df['old_text'] = train_df['data']
processed_df['text'] = preprocess_text(train_df['data'])
processed_df['text_lemmatized'] = processed_df['text'].apply(lemmatize_text)
processed_df['sentiment'] = train_df['label']

processed_df.head()

Unnamed: 0,old_text,review,text,text_lemmatized,sentiment
0,Forest of the Damned starts out as five young ...,,forest damned starts five young friends brothe...,forest damned start five young friend brother ...,-1
1,I thought I should qualify my position after r...,,thought qualify position reading reviews movie...,thought qualify position reading review movie ...,1
2,I don't agree with one of the reviewers who co...,,n't agree one reviewers compared film american...,n't agree one reviewer compared film american ...,1
3,Los Angeles TV news reporter Jennifer (the bea...,,los angeles tv news reporter jennifer two assi...,los angeles tv news reporter jennifer two assi...,1
4,"After a chance encounter on the train, a young...",,chance encounter train young couple spends sin...,chance encounter train young couple spends sin...,1


In [None]:
analysis_df = pd.DataFrame(columns=['Review', 'Sentiment'])
analysis_df['Review'] = processed_df['text_lemmatized'].apply(str)
analysis_df['Sentiment'] = processed_df['sentiment']

analysis_df.head()

Unnamed: 0,Review,Sentiment
0,forest damned start five young friend brother ...,-1
1,thought qualify position reading review movie ...,1
2,n't agree one reviewer compared film american ...,1
3,los angeles tv news reporter jennifer two assi...,1
4,chance encounter train young couple spends sin...,1


In [None]:
analysis_df.to_csv('analysis_df.csv', index=False)

In [None]:
analysis_df=pd.read_csv("analysis_df.csv")

### RNN

## Default Embedding(RNN)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(analysis_df['Review'])
sequences = tokenizer.texts_to_sequences(analysis_df['Review'])

max_sequence_length = max(len(x) for x in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')

In [None]:
# Splitting the dataset

X_train_emb, X_test_emb, y_train_emb, y_test_emb = train_test_split(padded_sequences, analysis_df['Sentiment'], test_size=0.25, random_state=42)

print("Training set shape:",X_train_emb.shape)
print("Testing set shape:", X_test_emb.shape)

Training set shape: (1500, 947)
Testing set shape: (500, 947)


In [None]:
X_train_emb=np.array(X_train_emb)
y_train_emb=np.array(y_train_emb)

In [None]:

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))


In [None]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 128)         2560384   
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 16)                2320      
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2562721 (9.78 MB)
Trainable params: 2562721 (9.78 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_emb, y_train_emb, epochs=5, batch_size=128, validation_split=0.25)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c43ea7731f0>

In [None]:
y_pred_emb = np.round(model.predict(X_test_emb))
print(f"Accuracy Score: {accuracy_score(y_test_emb,y_pred_emb) * 100}%")
print(f"Recall {recall_score(y_test_emb,y_pred_emb) * 100}%")
print(f"Precision Score: {precision_score(y_test_emb,y_pred_emb) * 100}%")
print(f"F1 Score: {f1_score(y_test_emb,y_pred_emb) * 100}%")

Accuracy Score: 81.0%
Recall 79.08745247148289%
Precision Score: 83.87096774193549%
F1 Score: 81.40900195694715%


## BOW with embedding (RNN)

In [None]:


from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()


X = vectorizer.fit_transform(analysis_df['Review'])

X = X.toarray()

X_train, X_test, y_train, y_test = train_test_split(X, analysis_df['Sentiment'], test_size=0.25, random_state=42)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


In [None]:
# prompt: Create RNN model for above dataset and use embedding layer

# Create a RNN model
model = Sequential()
model.add(Embedding(input_dim=len(vectorizer.vocabulary_), output_dim=128))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         2514304   
                                                                 
 simple_rnn (SimpleRNN)      (None, 16)                2320      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 2516641 (9.60 MB)
Trainable params: 2516641 (9.60 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
def load_and_preprocess_data(feature, label):
    return feature, label

# Create a TensorFlow dataset with batch size 1
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = dataset.map(load_and_preprocess_data)
dataset = dataset.batch(128)
# Train the model
model.fit(dataset, epochs=5, batch_size=128)
# Evaluate the model



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x797c80fd1e10>

In [None]:
y_pred = np.round(model.predict(X_test))
print(f"Accuracy Score: {accuracy_score(y_test,y_pred) * 100}%")
print(f"Recall {recall_score(y_test,y_pred) * 100}%")
print(f"Precision Score: {precision_score(y_test,y_pred) * 100}%")
print(f"F1 Score: {f1_score(y_test,y_pred) * 100}%")

Accuracy Score: 49.6%
Recall 19.771863117870723%
Precision Score: 55.91397849462365%
F1 Score: 29.213483146067414%


## RNN wordvec

In [None]:
!pip install gensim



In [None]:
import gensim
from gensim.models import Word2Vec

In [None]:
from gensim.utils import simple_preprocess




documents = analysis_df['Review'].apply(simple_preprocess)

embedding_dim = 100
word2vec_model = Word2Vec(documents, vector_size=embedding_dim, window=5, min_count=1, workers=4)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(analysis_df['Review'])
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(analysis_df['Review'])

max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

X_train_wv, X_test_wv, y_train_wv, y_test_wv = train_test_split(padded_sequences, analysis_df['Sentiment'], test_size=0.25, random_state=42)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [None]:


model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False))
model_rnn.add(SimpleRNN(units=64, activation='tanh'))
model_rnn.add(Dense(1, activation='sigmoid'))
model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_rnn.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 100)         2000300   
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 64)                10560     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2010925 (7.67 MB)
Trainable params: 10625 (41.50 KB)
Non-trainable params: 2000300 (7.63 MB)
_________________________________________________________________


In [None]:
model_rnn.fit(X_train_wv, y_train_wv, epochs=5, batch_size=128)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c43eb7d5c90>

In [None]:
y_pred_wv = np.round(model_rnn.predict(X_test_wv))
print(f"Accuracy Score: {accuracy_score(y_test_wv,y_pred_wv) * 100}%")
print(f"Recall {recall_score(y_test_wv,y_pred_wv) * 100}%")
print(f"Precision Score: {precision_score(y_test_wv,y_pred_wv) * 100}%")
print(f"F1 Score: {f1_score(y_test_wv,y_pred_wv) * 100}%")

Accuracy Score: 47.4%
Recall 0.0%
Precision Score: 0.0%
F1 Score: 0.0%


  _warn_prf(average, modifier, msg_start, len(result))


##FastTEXT (RNN)

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m61.4/68.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227137 sha256=3c2e2a8787e6a10ca603256c0e2bee95bb3c22911424124e8a4dead72be16b31
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fa

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip



In [None]:
import fasttext

In [None]:
analysis_df_list = analysis_df['Review'].to_list()
analysis_df_list = [i.split(' ') for i in analysis_df_list]

In [None]:
vocab = set()
for i in analysis_df_list:
    for j in i:
        if j not in vocab:
            vocab.add(j)

In [None]:
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        if tokens[0].lower().strip() in vocab:
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data

In [None]:
embedding = load_vectors('/content/wiki-news-300d-1M.vec')

In [None]:
embedding_dim = 300
tokenizer = Tokenizer()
tokenizer.fit_on_texts(analysis_df['Review'])
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(analysis_df['Review'])

max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

X_train_ft, X_test_ft, y_train_ft, y_test_ft = train_test_split(padded_sequences, analysis_df['Sentiment'], test_size=0.25, random_state=42)


In [None]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in embedding:
        embedding_matrix[i] = embedding[word]

In [None]:
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False))
model_rnn.add(SimpleRNN(units=64, activation='tanh'))
model_rnn.add(Dense(1, activation='sigmoid'))
model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_rnn.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 300)         6000900   
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 64)                23360     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 6024325 (22.98 MB)
Trainable params: 23425 (91.50 KB)
Non-trainable params: 6000900 (22.89 MB)
_________________________________________________________________


In [None]:

model_rnn.fit(X_train_ft, y_train_ft, epochs=5, batch_size=128, validation_split=0.25)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c43eb63bd60>

In [None]:
y_pred_ft = np.round(model_rnn.predict(X_test_ft))
print(f"Accuracy Score: {accuracy_score(y_test_ft,y_pred_ft) * 100}%")
print(f"Recall {recall_score(y_test_ft,y_pred_ft) * 100}%")
print(f"Precision Score: {precision_score(y_test_ft,y_pred_ft) * 100}%")
print(f"F1 Score: {f1_score(y_test_ft,y_pred_ft) * 100}%")

Accuracy Score: 47.4%
Recall 32.69961977186312%
Precision Score: 50.0%
F1 Score: 39.54022988505747%


### LSTM

## LSTM with defaut embedding

In [None]:

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, None, 128)         2560384   
                                                                 
 lstm_3 (LSTM)               (None, 16)                9280      
                                                                 
 dense_10 (Dense)            (None, 1)                 17        
                                                                 
Total params: 2569681 (9.80 MB)
Trainable params: 2569681 (9.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_emb, y_train_emb, epochs=5, batch_size=128, validation_split=0.25)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c43c9a38730>

In [None]:
y_pred_emb = np.round(model.predict(X_test_emb))
print(f"Accuracy Score: {accuracy_score(y_test_emb,y_pred_emb) * 100}%")
print(f"Recall {recall_score(y_test_emb,y_pred_emb) * 100}%")
print(f"Precision Score: {precision_score(y_test_emb,y_pred_emb) * 100}%")
print(f"F1 Score: {f1_score(y_test_emb,y_pred_emb) * 100}%")

Accuracy Score: 80.80000000000001%
Recall 72.24334600760456%
Precision Score: 89.2018779342723%
F1 Score: 79.83193277310924%


## LSTM with word2vec

In [None]:


model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False))
model.add(LSTM(units=64, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, None, 300)         6000900   
                                                                 
 lstm_2 (LSTM)               (None, 64)                93440     
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 6094405 (23.25 MB)
Trainable params: 93505 (365.25 KB)
Non-trainable params: 6000900 (22.89 MB)
_________________________________________________________________


In [None]:
model.fit(X_train_wv, y_train_wv, epochs=5, batch_size=128)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c43d200cee0>

In [None]:
y_pred_wv = np.round(model.predict(X_test_wv))
print(f"Accuracy Score: {accuracy_score(y_test_wv,y_pred_wv) * 100}%")
print(f"Recall {recall_score(y_test_wv,y_pred_wv) * 100}%")
print(f"Precision Score: {precision_score(y_test_wv,y_pred_wv) * 100}%")
print(f"F1 Score: {f1_score(y_test_wv,y_pred_wv) * 100}%")

Accuracy Score: 47.4%
Recall 0.0%
Precision Score: 0.0%
F1 Score: 0.0%


  _warn_prf(average, modifier, msg_start, len(result))


## LSTM WITH Fasttext

In [None]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False))
model.add(SimpleRNN(units=64, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 300)         6000900   
                                                                 
 simple_rnn_6 (SimpleRNN)    (None, 64)                23360     
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 6024325 (22.98 MB)
Trainable params: 23425 (91.50 KB)
Non-trainable params: 6000900 (22.89 MB)
_________________________________________________________________


In [None]:

model.fit(X_train_ft, y_train_ft, epochs=5, batch_size=128, validation_split=0.25)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c43d1b22d40>

In [None]:
y_pred_ft = np.round(model_rnn.predict(X_test_ft))
print(f"Accuracy Score: {accuracy_score(y_test_ft,y_pred_ft) * 100}%")
print(f"Recall {recall_score(y_test_ft,y_pred_ft) * 100}%")
print(f"Precision Score: {precision_score(y_test_ft,y_pred_ft) * 100}%")
print(f"F1 Score: {f1_score(y_test_ft,y_pred_ft) * 100}%")

Accuracy Score: 47.4%
Recall 0.0%
Precision Score: 0.0%
F1 Score: 0.0%


  _warn_prf(average, modifier, msg_start, len(result))


### GRU

### GRU WITH default embedding

In [None]:

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128))
model.add(GRU(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 128)         2560384   
                                                                 
 gru (GRU)                   (None, 16)                7008      
                                                                 
 dense_12 (Dense)            (None, 1)                 17        
                                                                 
Total params: 2567409 (9.79 MB)
Trainable params: 2567409 (9.79 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_emb, y_train_emb, epochs=5, batch_size=128)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c43c9a39270>

In [None]:
y_pred_emb = np.round(model.predict(X_test_emb))
print(f"Accuracy Score: {accuracy_score(y_test_emb,y_pred_emb) * 100}%")
print(f"Recall {recall_score(y_test_emb,y_pred_emb) * 100}%")
print(f"Precision Score: {precision_score(y_test_emb,y_pred_emb) * 100}%")
print(f"F1 Score: {f1_score(y_test_emb,y_pred_emb) * 100}%")

Accuracy Score: 67.60000000000001%
Recall 65.39923954372624%
Precision Score: 70.78189300411523%
F1 Score: 67.98418972332016%
