In [None]:
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras import layers
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

**Attempt 1: The shortened version of the dataset**

In [None]:
# Load the shortened dataset and keep the raw review texts for tokenization.
short_dataset_path = '/short_dataset.csv'
df = pd.read_csv(short_dataset_path)
reviews = df['Review']

In [83]:
# Tokenization settings: `max_words` caps the tokenizer vocabulary (it is also the
# Embedding input size used by the model) and `max_len` is the fixed sequence length.
max_words = 5000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(reviews)

# Convert each review to a list of token ids, then pad/truncate to `max_len`.
# The keyword is `maxlen`, but the value must be the `max_len` variable defined above;
# passing the bare name `maxlen` raises NameError on a fresh kernel.
X = tokenizer.texts_to_sequences(reviews)
X = pad_sequences(X, maxlen=max_len)

# Number of distinct tokens seen during fitting, plus one
# (presumably for the reserved padding index 0 -- TODO confirm).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
def sent_changer(sentiment):
    """Map a sentiment label to a binary target.

    Returns 1 when `sentiment` equals the string 'positive' and 0 for any
    other value.
    """
    return 1 if sentiment == 'positive' else 0
# Binary targets: 1 for 'positive', 0 for every other label (via sent_changer).
sentiments = df['Sentiment'].apply(sent_changer)

# Fit the encoder so `Le.classes_` stays available to later cells. Its transformed
# output is deliberately not assigned to `y`: the targets come from `sentiments`.
Le = LabelEncoder()
Le.fit(df['Sentiment'])

# One-hot encode into two columns, matching the 2-unit output layer of the model.
y = to_categorical(sentiments, 2)
y.shape

In [84]:
# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [70]:
# Sequence classifier: Embedding -> LSTM -> Dense.
model = Sequential()
# Token ids (vocabulary capped at `max_words`) are embedded as 100-dimensional vectors.
model.add(Embedding(max_words, 100))
model.add(LSTM(15, dropout=0.3))
# NOTE(review): two sigmoid units with one-hot targets score each class independently;
# softmax + categorical_crossentropy is the conventional pairing -- confirm this is intended.
model.add(Dense(2, activation='sigmoid'))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), 'accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 100)         500000    
_________________________________________________________________
lstm_11 (LSTM)               (None, 15)                6960      
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 32        
Total params: 506,992
Trainable params: 506,992
Non-trainable params: 0
_________________________________________________________________
None


In [95]:
# Training hyper-parameters for the short-dataset run.
batch_size, epochs = 128, 10

# The held-out split is passed as validation data; the return value of fit()
# is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Attempt 2: The full version of the dataset**

In [11]:
# Load the full dataset and keep only rows labelled 'positive' or 'negative'.
df = pd.read_csv('/content/combined_dataset.csv', engine='python')
df_pos = df[df['Sentiment']=='positive']
df_neg = df[df['Sentiment']=='negative']
df = pd.concat([df_pos, df_neg])

# Shuffle so the classes are interleaved again after the concat. The seed makes the
# shuffle (and therefore the downstream split and metrics) reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [58]:
def sent_changer(sentiment):
    """Map a sentiment label to a binary target.

    Returns 1 when `sentiment` equals the string 'positive' and 0 for any
    other value.
    """
    return 1 if sentiment == 'positive' else 0
# Binary targets: 1 for 'positive', 0 for every other label (via sent_changer).
sentiments = df['Sentiment'].apply(sent_changer)

# Fit the encoder so `Le.classes_` stays available to later cells. Its transformed
# output is deliberately not assigned to `y`: the targets come from `sentiments`.
Le = LabelEncoder()
Le.fit(df['Sentiment'])

# One-hot encode into two columns, matching the 2-unit output layer of the model.
y = to_categorical(sentiments, 2)
y.shape

(71239, 2)

In [59]:
# Vocabulary cap and fixed sequence length for the full dataset.
max_words, max_len = 5000, 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['Review'])

# Reviews -> lists of token ids -> array padded/truncated to `max_len`.
X = pad_sequences(tokenizer.texts_to_sequences(df['Review']), maxlen=max_len)

# Distinct tokens seen during fitting, plus one.
vocab_size = 1 + len(tokenizer.word_index)

In [60]:
# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [61]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
import keras
import tensorflow as tf


# Sequence classifier: Embedding -> LSTM -> Dense.
model = Sequential()
# Token ids (vocabulary capped at `max_words`) are embedded as 200-dimensional vectors.
model.add(Embedding(max_words, 200))
model.add(LSTM(15, dropout=0.3))
# NOTE(review): two sigmoid units with one-hot targets score each class independently;
# softmax + categorical_crossentropy is the conventional pairing -- confirm this is intended.
model.add(Dense(2, activation='sigmoid'))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), 'accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 200)         1000000   
_________________________________________________________________
lstm_10 (LSTM)               (None, 15)                12960     
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 32        
Total params: 1,012,992
Trainable params: 1,012,992
Non-trainable params: 0
_________________________________________________________________
None


In [63]:
# Training hyper-parameters for the full-dataset run.
batch_size, epochs = 128, 3

# The held-out split is passed as validation data; the return value of fit()
# is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [69]:
# Both values were copied by hand from the training log of the previous step,
# as reported after the last epoch.
precision = 0.8577
recall = 0.8578

# F1 is the harmonic mean of precision and recall; shown as a percentage with one decimal.
f1_score = (2 * precision * recall) / (precision + recall)
round(100 * f1_score, 1)

85.8