In [1]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("train_preprocessed.csv")
test_data = pd.read_csv("test_preprocessed.csv")

In [2]:
# Reducing the data as my machine cannot handle big data even with gpu acceleration.
train_data_cut = len(train_data)//10
test_data_cut = len(test_data)//10
train_data = train_data[:train_data_cut]
test_data = test_data[:test_data_cut]

print(f"Length of train data: {len(train_data)}")
print(f"Length of test data: {len(test_data)}")

Length of train data: 359979
Length of test data: 39997


In [3]:
count = 0 
max_words = 0
min_words = 1000
for i in range(len(train_data)):
    max_words = max(max_words, len(train_data['Text'].iloc[i].split())) # checking max no. of words
    min_words = min(min_words, len(train_data['Text'].iloc[i].split())) # checking min no. of words
    
    count += len(train_data['Text'].iloc[i].split())
    # Calculating average no. of words in 2 diff ways
    average_words1 = (max_words + min_words)/2 
    average_words2 = count/len(train_data)

In [4]:
print(f"Maximum number of words:{max_words}")
print(f"Minimum number of words:{min_words}")
print(f"Average number of words if averaged max and min numbers:{average_words1}")
print(f"Total words (just for fun):{count}")
print(f"Average number of words if averaged total words by the len of data:{average_words2}")

Maximum number of words:240
Minimum number of words:5
Average number of words if averaged max and min numbers:122.5
Total words (just for fun):27198819
Average number of words if averaged total words by the len of data:75.55668247314426


### I will be using average_words1 just because it has a bigger number obviously

In [5]:
# Rounding up the average words
average_words = round(average_words1)
average_words

122

In [6]:
# Checking Unique words in the dataset
import re
def unique_words(train_sentence):
    train_sentence = train_sentence.lower()
    train_sentence = re.sub(r'[^\w\s]', '', train_sentence)
    words = train_sentence.split()
    unique_words_set = set(words)
    return unique_words_set

uniqueWords = set()
for i in range(len(train_data)):
    uniqueWords.update(unique_words(train_data['Text'].iloc[i]))

In [7]:
print(f"Number of Unique Words:{len(uniqueWords)}")

Number of Unique Words:433193


In [8]:
# Initializing vectorizer
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
vectorizer_int = TextVectorization(max_tokens = 10000,
                                   standardize = "lower_and_strip_punctuation",
                                   split = "whitespace",
                                   output_mode = "int",
                                   output_sequence_length = 128)
vectorizer_tfidf = TextVectorization(max_tokens = 10000,
                                     standardize = "lower_and_strip_punctuation",
                                     split = "whitespace",
                                     output_mode = "tf_idf")

### After running the fit function I got to know that the model requires it's target to be numeric

In [9]:
# Mapping Positive to 1 and Negative to 0
train_data['Polarity'] = train_data['Polarity'].map({'Positive': 1, 'Negative': 0})
test_data['Polarity'] = test_data['Polarity'].map({'Positive': 1, 'Negative': 0})

In [10]:
# Combining Title and Text columns into a single column
train_data['Feature'] = train_data['Title'] + ' ' + train_data['Text']
test_data['Feature'] = test_data['Title'] + ' ' + test_data['Text']

# Splitting the data into features and target
train_features = train_data['Feature']
train_target = train_data['Polarity']
test_features = test_data['Feature']
test_target = test_data['Polarity']

# Making validation set
val_features = train_features[:int((0.1)*len(train_features))].sample(frac=1, random_state=42).reset_index(drop=True)
val_target = train_target[:int((0.1)*len(train_target))].sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Validation feature data:\n{val_features}")
print(f"Valdiation target data:\n{val_target}")

# Converting to numpy arr for better processing in tf
train_features = train_features.to_numpy()
train_target = train_target.to_numpy()
test_features = test_features.to_numpy()
test_target = test_target.to_numpy()
val_features = val_features.to_numpy()
val_target = val_target.to_numpy()

Validation feature data:
0        Just plain cute... Hugh Jackman is so handsome...
1        This is a good reason why people download musi...
2        Good Value This was ordered for a teenager who...
3        Doesn't even play I think it's not an authenti...
4        It will become a part of you... Sweet Dream Ba...
                               ...                        
35992    THIS GAME SUCKS!! I mean it is really sweet to...
35993    Prehistoric Research Fiction? I can't say enou...
35994    AWFUL!!! THESE DIAPERS ARE HORRIBLE. I ALWAYS ...
35995    This book is a wast of good paper Oh man, wher...
35996    I couldn't put it down!! A set of wonderful st...
Name: Feature, Length: 35997, dtype: object
Valdiation target data:
0        1
1        0
2        1
3        0
4        1
        ..
35992    0
35993    1
35994    0
35995    0
35996    1
Name: Polarity, Length: 35997, dtype: int64


In [11]:
# Adapting the vectorizer to train_features
vectorizer_int.adapt(train_features)
vectorizer_tfidf.adapt(train_features)

In [12]:
sample = 'Bad product. Would not recommend it'
a = vectorizer_int([sample])
b = vectorizer_tfidf([sample])
print(f"Vectorizer int:\n{a}\n")
print(f"Vectorizer tfidf:\n{b}\n")

Vectorizer int:
[[123 110  44  15 141   8   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]

Vectorizer tfidf:
[[0. 0. 0. ... 0. 0. 0.]]



In [13]:
words_in_vocab = vectorizer_int.get_vocabulary()
print(f"Top 10 words:\n{words_in_vocab[:10]}")
print(f"Bottom 10 words:\n{words_in_vocab[-10:]}")

Top 10 words:
['', '[UNK]', 'the', 'and', 'a', 'i', 'to', 'of', 'it', 'this']
Bottom 10 words:
['brewed', 'breathless', 'brando', 'bookthis', 'bonnie', 'blunt', 'behaviors', 'appendix', 'amber', 'adjectives']


In [14]:
print(f"Number of unique words in vocab: {len(words_in_vocab)}")

Number of unique words in vocab: 10000


In [15]:
from tensorflow.keras.layers import Embedding
embedding = Embedding(input_dim = len(words_in_vocab),
                      output_dim = 128,
                      input_length = average_words)

In [16]:
print(f"Original sentence:\n{sample}")
sample_embed_int = embedding(vectorizer_int([sample]))
sample_embed_tfidf = embedding(vectorizer_tfidf([sample]))
print(f"Embedded sentence of 'int' vector:\n{sample_embed_int}")
print(f"Embedded sentence of 'tfidf' vector:\n{sample_embed_tfidf}")

Original sentence:
Bad product. Would not recommend it
Embedded sentence of 'int' vector:
[[[-0.03824536  0.01471433 -0.03776103 ... -0.00113573  0.01162968
   -0.02690892]
  [ 0.03346907 -0.00536389  0.04927898 ...  0.01422555 -0.01565919
   -0.02491096]
  [ 0.0311914   0.02994764  0.01037955 ...  0.0243897   0.02777798
    0.03203355]
  ...
  [ 0.00529462  0.03227958 -0.04427711 ...  0.00457172 -0.03220586
   -0.04457368]
  [ 0.00529462  0.03227958 -0.04427711 ...  0.00457172 -0.03220586
   -0.04457368]
  [ 0.00529462  0.03227958 -0.04427711 ...  0.00457172 -0.03220586
   -0.04457368]]]
Embedded sentence of 'tfidf' vector:
[[[ 0.00529462  0.03227958 -0.04427711 ...  0.00457172 -0.03220586
   -0.04457368]
  [ 0.00529462  0.03227958 -0.04427711 ...  0.00457172 -0.03220586
   -0.04457368]
  [ 0.00529462  0.03227958 -0.04427711 ...  0.00457172 -0.03220586
   -0.04457368]
  ...
  [ 0.00529462  0.03227958 -0.04427711 ...  0.00457172 -0.03220586
   -0.04457368]
  [ 0.00529462  0.03227958 -0

### Training deep learning model

In [17]:
# Training GRU and LSTM layer stacked model
from tensorflow.keras import layers
inputs = layers.Input(shape = (1,), dtype = tf.string)
x = vectorizer_int(inputs)
x = embedding(x)
x = layers.GRU(128, return_sequences = True)(x)
x = layers.LSTM(128, return_sequences = True)(x)
x = layers.GRU(64)(x)
x = layers.Dense(64, activation = 'relu')(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)
RNN1 = tf.keras.Model(inputs, outputs)

In [18]:
RNN1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 128)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 128, 128)          1280000   
                                                                 
 gru (GRU)                   (None, 128, 128)          99072     
                                                                 
 lstm (LSTM)                 (None, 128, 128)          131584    
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                             

In [19]:
# Compiling the model
RNN1.compile(loss = 'binary_crossentropy',
            optimizer = tf.keras.optimizers.Adam(),
            metrics = ['accuracy'])

In [20]:
# Fitting the model
RNN_model_intVect = RNN1.fit(train_features,
                    train_target,
                    epochs = 5,
                    validation_data = (val_features, val_target))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
RNN_model_intVect_pred = RNN1.predict(test_features)
RNN_model_intVect_pred[:10]



array([[0.9980615 ],
       [0.9994343 ],
       [0.00162119],
       [0.22521397],
       [0.9957461 ],
       [0.00746008],
       [0.00326221],
       [0.03149999],
       [0.9908599 ],
       [0.0250385 ]], dtype=float32)

In [33]:
# Converting RNN1 predictions to labels
RNN_model_intVect_pred = tf.squeeze(tf.round(RNN_model_intVect_pred))
RNN_model_intVect_pred[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 0., 0., 1., 0., 0., 0., 1., 0.], dtype=float32)>

In [4]:
# Creating a helper function to evaluate accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def evaluationMetrics(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average = 'weighted')
    
    model_result = {"Accuracy": model_accuracy,
                    "Precision": model_precision,
                    "Recall": model_recall,
                    "F1-Score": model_f1}
    
    return model_result

In [35]:
RNN1_eval = evaluationMetrics(test_target, RNN_model_intVect_pred)
print(f"Result of the model on unseen data: {RNN1_eval}")

Result of the model on unseen data: {'Accuracy': 93.16948771157837, 'Precision': 0.9316955269754872, 'Recall': 0.9316948771157837, 'F1-Score': 0.9316926641262752}


In [32]:
# Save the model in TensorFlow's SavedModel format
RNN1.save('RNN_model', save_format='tf')



INFO:tensorflow:Assets written to: RNN_model\assets


INFO:tensorflow:Assets written to: RNN_model\assets


# Looks like even after training the model on 10% of original data it outperformed the NB models

In [1]:
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
RNN1 = load_model('RNN_model')

In [2]:
# Let's predict on the original test data
test_data_og = pd.read_csv("test_preprocessed.csv")
test_data_og['Feature'] = test_data_og['Title'] + ' ' + test_data_og['Text']
test_features_og = test_data_og['Feature']

RNN_model_intVect_pred_og = RNN1.predict(test_features_og)



In [6]:
# Evaluating 
import tensorflow as tf
RNN_model_intVect_pred_og = tf.squeeze(tf.round(RNN_model_intVect_pred_og))
test_data_og['Polarity'] = test_data_og['Polarity'].map({'Positive': 1, 'Negative': 0})
test_target_og = test_data_og['Polarity']

RNN1_eval_og = evaluationMetrics(test_target_og, RNN_model_intVect_pred_og)
print(f"Result of the model on unseen original data: {RNN1_eval_og}")

Result of the model on unseen original data: {'Accuracy': 93.10683641018461, 'Precision': 0.9310812328837387, 'Recall': 0.9310683641018461, 'F1-Score': 0.9310678455866469}


# The model turned out to be incredible