In [8]:
# import libraries
import tensorflow as tf
from tensorflow import keras # keras is a high level API for tensorflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns # seaborn is a visualization library based on matplotlib
sns.set_style("darkgrid") # set the style of the axes
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split # split data into training and testing set
import re, string, nltk
from nltk.corpus import stopwords # stopwords are the words that do not contribute to the deeper meaning of the phrase
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier # decision tree classifier
from keras.models import Sequential # sequential model
from keras.layers import Dense, LSTM, Bidirectional, Dropout # dropout to tackle overfitting
from keras.layers import Embedding, Flatten, Dense # embedding layer to create word vectors
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer # tfidf vectorizer to convert text into vectors
import warnings # to ignore any warnings
warnings.filterwarnings("ignore")
import nltk # natural language toolkit
nltk.download('punkt') # punkt is a pre-trained model that helps you tokenize words and sentences
!pip install transformers # transformers is a library of state-of-the-art pretrained models for Natural Language Processing (NLP)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




In [9]:
''' Reading the Data '''

# reading the csv file
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IMDB.csv")
df.head(5) # display first 5 rows of the dataset
df.shape # shape of the dataset
df = df[["review","sentiment"]] # selecting only review and sentiment columns
# shape of data
print(f"Data consists of {df.shape[0]} rows and {df.shape[1]} columns.")


Data consists of 50000 rows and 2 columns.


In [10]:
''' PreProcessing '''

# checking for null values
df.isna().sum()
# dropping null values
df = df.dropna()
# checking for null values
df1 = df
df1.shape

def clean_text(df, field):
    df[field] = df[field].str.replace(r"@"," at ") # replacing @ with at
    df[field] = df[field].str.replace("#[^a-zA-Z0-9_]+"," ") # replacing #word with word
    df[field] = df[field].str.replace(r"[^a-zA-Z(),\"'\n_]"," ") # replacing all characters except alphabets, commas, quotations, newlines and underscores
    df[field] = df[field].str.replace(r"http\S+","") # replacing urls with space
    df[field] = df[field].str.lower() # converting text to lowercase
    return df

clean_text(df1,"review")



# function to lemmatize text
def lemmatize(word):
    if word.endswith('s'): # removing plural words
        if word.endswith('ss'):
            return word
        return word[:-1]
    elif word.endswith('ed'): # removing past tense words
        if len(word) > 3 and word[-3] == word[-4]:
            return word[:-3]
        return word[:-2]
    elif word.endswith('ing'):# removing continuous tense words
        if len(word) > 4 and word[-4] == word[-5]:
            return word[:-4]
        return word[:-3]
    else:
        return word # returning the word as it is

# function to preprocess text
def preprocess_text(text):
    text = re.sub(r"won\'t", "will not", text) # replacing won't with will not
    text = re.sub(r"can\'t", "can not", text) # replacing can't with can not
    text = re.sub('[^a-zA-Z0-9]',' ',text) # replacing all characters except alphabets and numbers with space
    text = [lemmatize(word) for word in text.split()] # lemmatizing words
    stop_words = set(['a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'of', 'at', 'by', 'for', 'with', 'about', 'into', 'through', 'during', 'to']) # defining stopwords
    text = ' '.join(text) # joining the words
    return text # returning the text

df1["clean_review"] = df1["review"].apply(preprocess_text) # applying preprocess_text function to review column

df1.head()






Unnamed: 0,review,sentiment,clean_review
0,one of the other reviewers has mentioned that ...,positive,one of the other reviewer ha mention that afte...
1,a wonderful little production br br the...,positive,a wonderful little production br br the film t...
2,i thought this was a wonderful way to spend ti...,positive,i thought thi wa a wonderful way to spend time...
3,basically there's a family where a little boy ...,negative,basically there a family where a little boy j...
4,"petter mattei's ""love in the time of money"" is...",positive,petter mattei love in the time of money i a v...


In [11]:
''' Train Test Split '''

# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(np.array(df["clean_review"]),np.array(df["sentiment"]), test_size=0.25,random_state=42)
print(X_train.shape)
print(X_test.shape)


''' TF-IDF Vectorization '''

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer(use_idf=True, tokenizer=word_tokenize)
# transforming training and testing set to vectors
X_train_tf2 = tfidf2.fit_transform(X_train)
X_test_tf2 = tfidf2.transform(X_test)


from sklearn.model_selection import train_test_split
X = df["clean_review"]
y = df.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
display(X_train.shape)
display(X_test.shape)

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# using tokenizer to transform text messages into training and testing set
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=64)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=64)
X_train_seq_padded[0]


(37500,)
(12500,)


(37500,)

(12500,)

array([  622,   202,   230,   512,    38,  1075,     6,   277,    60,
         152,     8,     8,     6,    21,  1139,     3,   122,  1017,
          54,    54,    16,   393,     6,    63,    43,   243,  1094,
        4604,     7,   164,    22,    35,  2876,     2,     7,     3,
           3,   590,    22,   311,    10,    12,  4799,     7,   109,
         887,     6,    63,  1279,  4003,  1777,     2,     1,  2849,
          56,   394,  2644,     3,   393,     1,  5948,    13, 23742,
         200], dtype=int32)

In [12]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import roc_auc_score
import numpy as np

# Assuming df is your DataFrame and it contains 'clean_review' and 'sentiment' columns
X = df["clean_review"]
y = df.sentiment

# Convert string labels to binary (0 and 1)
y = np.where(y == 'positive', 1, 0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=64)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=64)

# Construct model
model = Sequential()
model.add(Embedding(len(tokenizer.index_word)+1, 64))
model.add(Bidirectional(LSTM(100, dropout=0, recurrent_dropout=0)))
model.add(Dense(128, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.summary()

# Early stopping to prevent overfitting
early_stop = EarlyStopping(monitor="val_loss", patience=5, verbose=True)

# Fit the model
history = model.fit(X_train_seq_padded, y_train, batch_size=64, epochs=15, validation_data=(X_test_seq_padded, y_test), callbacks=[early_stop])
model.save('/content/drive/MyDrive/Colab Notebooks/sentiment_model_BLSTM.h5')
# Evaluate the model
pred_train = model.predict(X_train_seq_padded)
pred_test = model.predict(X_test_seq_padded)
print('LSTM Recurrent Neural Network baseline: ' + str(roc_auc_score(y_train, pred_train)))
print('LSTM Recurrent Neural Network: ' + str(roc_auc_score(y_test, pred_test)))

model.evaluate(X_test_seq_padded, y_test)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          4585152   
                                                                 
 bidirectional (Bidirection  (None, 200)               132000    
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               25728     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 4743009 (18.09 MB)
Trainable params: 4743009 (18.09 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 6: early s



LSTM Recurrent Neural Network baseline: 0.9999186781771309
LSTM Recurrent Neural Network: 0.8852435832768452


[0.9646524786949158, 0.8064799904823303]