In [None]:
# import libraries
import tensorflow as tf
from tensorflow import keras # keras is a high level API for tensorflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns # seaborn is a visualization library based on matplotlib
sns.set_style("darkgrid") # set the style of the axes
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split # split data into training and testing set
import re, string, nltk
from nltk.corpus import stopwords # stopwords are the words that do not contribute to the deeper meaning of the phrase
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier # decision tree classifier
from keras.models import Sequential # sequential model
from keras.layers import Dense, LSTM, Bidirectional, Dropout # dropout to tackle overfitting
from keras.layers import Embedding, Flatten, Dense # embedding layer to create word vectors
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer # tfidf vectorizer to convert text into vectors
import warnings # to ignore any warnings
warnings.filterwarnings("ignore")
import nltk # natural language toolkit
nltk.download('punkt') # punkt is a pre-trained model that helps you tokenize words and sentences
!pip install transformers # transformers is a library of state-of-the-art pretrained models for Natural Language Processing (NLP)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Inst

In [None]:
# --- Reading the data ---

# Load the raw review CSV. The path points into a mounted Google Drive, so this
# cell only runs where that mount exists.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IMDB.csv")

# A bare `df.head(5)` in the middle of a cell is evaluated and thrown away;
# display() is what actually renders the preview.
display(df.head(5))

# Keep only the two columns the rest of the notebook uses.
df = df[["review","sentiment"]]

# Report the resulting shape (this replaces the discarded bare `df.shape`).
print(f"Data consists of {df.shape[0]} rows and {df.shape[1]} columns.")


Data consists of 50000 rows and 2 columns.


In [None]:
# --- Preprocessing: drop incomplete rows ---

# Show the per-column count of missing values. As a bare mid-cell expression
# this result was silently discarded, so render it explicitly.
display(df.isna().sum())

# Remove every row that has a missing value in any column.
df = df.dropna()

# NOTE: df1 is an alias of df, NOT a copy. The cleaning steps below mutate df1
# in place and later cells read the new columns back through `df`, so this
# aliasing is load-bearing -- do not replace it with df.copy() without also
# updating those readers.
df1 = df
display(df1.shape)

def clean_text(df, field):
    r"""Normalise the text column ``df[field]`` in place and return ``df``.

    Steps, in order:

    1. Remove URLs (``http`` followed by any run of non-whitespace).
    2. Spell out ``@`` as the word "at", padded with spaces.
    3. Replace ``#`` followed by a run of non-word characters with a space.
    4. Replace every character that is not an ASCII letter, parenthesis,
       comma, quote, apostrophe, newline or underscore with a space.
    5. Lowercase everything.

    URL removal must happen first: once step 4 has blanked ``:``, ``/`` and
    ``.``, the pattern ``http\S+`` can only match the bare word "http" and the
    rest of the address leaks into the text.

    ``regex=True`` is passed explicitly because the default of
    ``Series.str.replace`` differs between pandas versions; without it the
    patterns may be matched as literal strings and silently do nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    text = df[field]
    text = text.str.replace(r"http\S+", "", regex=True)  # strip URLs before punctuation is blanked
    text = text.str.replace("@", " at ", regex=False)  # plain literal replacement
    text = text.str.replace(r"#[^a-zA-Z0-9_]+", " ", regex=True)  # '#' plus trailing non-word characters
    text = text.str.replace(r"[^a-zA-Z(),\"'\n_]", " ", regex=True)  # everything outside the allowed set
    df[field] = text.str.lower()
    return df

# Clean the raw review column; clean_text mutates df1 in place, so the
# returned frame is intentionally not captured here.
clean_text(df=df1, field="review")



# function to lemmatize text
def lemmatize(word, min_stem_len=3):
    """Strip a common English inflection suffix from ``word``.

    Despite the name this is a small rule-based suffix stripper, not a
    dictionary lemmatizer. Rules, checked in this order:

    * trailing ``s``  -> dropped, unless the word ends in ``ss``, ``us`` or
      ``is`` (e.g. "class", "bus", "this"), which are rarely plurals;
    * trailing ``ed`` -> dropped; if the letter before it is doubled
      ("stopped") one copy of that letter is dropped as well;
    * trailing ``ing`` -> dropped, with the same doubled-letter handling
      ("running" -> "run").

    A suffix is only removed when at least ``min_stem_len`` characters remain.
    Without that guard short words are mangled into fragments or empty
    strings ("is" -> "i", "was" -> "wa", "red" -> "r", "s" -> "").

    Parameters
    ----------
    word : str
        A single token.
    min_stem_len : int, optional
        Minimum number of characters the stripped form must keep; otherwise
        ``word`` is returned unchanged. Defaults to 3.

    Returns
    -------
    str
        The stripped token, or ``word`` itself when no rule applies.
    """
    if word.endswith('s'):
        if word.endswith(('ss', 'us', 'is')):
            return word
        stem = word[:-1]
    elif word.endswith('ed'):
        # word[-3] is the letter right before "ed"; equal neighbours mean doubling.
        doubled = len(word) > 3 and word[-3] == word[-4]
        stem = word[:-3] if doubled else word[:-2]
    elif word.endswith('ing'):
        # word[-4] is the letter right before "ing".
        doubled = len(word) > 4 and word[-4] == word[-5]
        stem = word[:-4] if doubled else word[:-3]
    else:
        return word

    # Refuse to reduce a word to a fragment shorter than the minimum stem.
    return stem if len(stem) >= min_stem_len else word

# function to preprocess text
def preprocess_text(text):
    """Expand two contractions, strip punctuation and suffix-strip each token.

    The steps are:

    1. Rewrite "won't" -> "will not" and "can't" -> "can not" (done before
       punctuation removal, while the apostrophe is still present).
    2. Replace every character that is not an ASCII letter or digit with a
       space.
    3. Split on whitespace and run each token through ``lemmatize``.
    4. Drop tokens that came back empty and join the rest with single spaces.

    No stop-word filtering is performed. An earlier version built a stop-word
    set on every call but never applied it; that dead local has been removed.

    Parameters
    ----------
    text : str
        One document.

    Returns
    -------
    str
        The normalised document, tokens separated by single spaces.
    """
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    tokens = (lemmatize(word) for word in text.split())
    # Skip empty results so the output never contains doubled or trailing spaces.
    return ' '.join(token for token in tokens if token)

# Build the model-ready text column by running every cleaned review through
# preprocess_text, one element at a time.
df1["clean_review"] = df1["review"].map(preprocess_text)

# Preview the raw and processed text side by side.
df1.head()






Unnamed: 0,review,sentiment,clean_review
0,one of the other reviewers has mentioned that ...,positive,one of the other reviewer ha mention that afte...
1,a wonderful little production br br the...,positive,a wonderful little production br br the film t...
2,i thought this was a wonderful way to spend ti...,positive,i thought thi wa a wonderful way to spend time...
3,basically there's a family where a little boy ...,negative,basically there a family where a little boy j...
4,"petter mattei's ""love in the time of money"" is...",positive,petter mattei love in the time of money i a v...


In [None]:
# --- Train / test split, TF-IDF features and padded token sequences ---

# Tokenizer is not imported by the notebook's top import cell, so bring it in here.
from keras.preprocessing.text import Tokenizer

TEST_FRACTION = 0.25  # share of rows held out for testing
SPLIT_SEED = 42       # fixed seed so the split is reproducible
MAX_SEQ_LEN = 64      # every review is padded / truncated to this many tokens

# One split feeds both feature pipelines below. The cell used to split twice
# (once on numpy arrays, once on Series) with the same seed, which selected
# the same rows and only served to overwrite the first result.
X = df["clean_review"]
y = df.sentiment
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
display(X_train.shape)
display(X_test.shape)

# --- TF-IDF vectorization ---
# Fit the vocabulary and IDF weights on the training texts only, then reuse
# them to transform the test texts.
tfidf2 = TfidfVectorizer(use_idf=True, tokenizer=word_tokenize)
X_train_tf2 = tfidf2.fit_transform(X_train)
X_test_tf2 = tfidf2.transform(X_test)

# --- Integer sequences for the neural models ---
# The word index is likewise learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQ_LEN)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQ_LEN)

# Show one encoded review as a sanity check.
X_train_seq_padded[0]


(37500,)
(12500,)


(37500,)

(12500,)

array([  622,   202,   230,   512,    38,  1075,     6,   277,    60,
         152,     8,     8,     6,    21,  1139,     3,   122,  1017,
          54,    54,    16,   393,     6,    63,    43,   243,  1094,
        4604,     7,   164,    22,    35,  2876,     2,     7,     3,
           3,   590,    22,   311,    10,    12,  4799,     7,   109,
         887,     6,    63,  1279,  4003,  1777,     2,     1,  2849,
          56,   394,  2644,     3,   393,     1,  5948,    13, 23742,
         200], dtype=int32)

In [None]:
# Hybrid CNN-RNN model.
#
# Two branches read the same padded token sequence -- a 1-D convolutional
# branch and a stacked bidirectional LSTM branch -- and their dense outputs
# are concatenated before a single sigmoid unit.
#
# NOTE: this cell rebinds `df`, `tokenizer`, `X_train`, `X_test`, `y_train`
# and `y_test`, replacing any values earlier cells assigned to those names.

# importing libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

import time

VOCAB_SIZE = 5000     # tokenizer keeps the most frequent words; also the embedding input_dim
MAX_LEN = 100         # every review is padded / truncated to this many tokens
EMBED_DIM = 64        # embedding width used by both branches
VAL_FRACTION = 0.1    # share of the training rows held out for early stopping
SPLIT_SEED = 42

st_time  = time.time()

# Load and preprocess data
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Amazon_review.csv')
reviews = df['review'].values
# assumes 'sentiment' already holds numeric 0/1 labels, as binary_crossentropy
# requires -- TODO confirm against the CSV
labels = df['sentiment'].values

# Turn each review into a fixed-length sequence of word indices.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=SPLIT_SEED)

# Carve a validation set out of the training rows. Early stopping needs
# held-out data to monitor, and using the test set for that would let the test
# data pick the stopping epoch and bias the accuracy reported at the end.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VAL_FRACTION, random_state=SPLIT_SEED)

# CNN architecture
input_cnn = Input(shape=(MAX_LEN,)) # input layer
embedding_cnn = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM)(input_cnn) # embedding layer
conv1d_cnn = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_cnn) # convolutional layer
max_pool_cnn = MaxPooling1D(pool_size=2)(conv1d_cnn) # max pooling layer
flatten_cnn = tf.keras.layers.Flatten()(max_pool_cnn) # flatten layer
dense_cnn = Dense(32, activation='relu')(flatten_cnn) # dense layer

# RNN architecture
input_rnn = Input(shape=(MAX_LEN,)) # input layer
embedding_rnn = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM)(input_rnn) # embedding layer
lstm_rnn = Bidirectional(LSTM(64, return_sequences=True))(embedding_rnn) # first LSTM keeps the full sequence for the next layer
lstm_rnn = Bidirectional(LSTM(32))(lstm_rnn) # second LSTM returns only its final output
dense_rnn = Dense(32, activation='relu')(lstm_rnn) # dense layer

# Merge CNN and RNN outputs
merged = tf.keras.layers.concatenate([dense_cnn, dense_rnn], axis=-1)
output = Dense(1, activation='sigmoid')(merged)

# Build model. It has two inputs, so the same array is passed twice wherever
# the model is fed.
model = Model(inputs=[input_cnn, input_rnn], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Set up early stopping. restore_best_weights rolls the model back to the epoch
# with the lowest validation loss; otherwise the weights kept are those from
# `patience` epochs after the best one.
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# Train model
model.fit([X_train, X_train], y_train, validation_data=([X_val, X_val], y_val), epochs=20, batch_size=64, callbacks=[early_stop])


model.save('/content/drive/MyDrive/Colab Notebooks/sentiment_model_Hybrid.h5')

# Evaluate model on the test set, which played no part in training or early stopping.
loss, accuracy = model.evaluate([X_test, X_test], y_test, verbose=0)
print('Test accuracy:', accuracy)

en_time = time.time()

# Wall-clock seconds for the whole cell: load, train, save and evaluate.
t_time = en_time - st_time

print('\n')

print(f'total time taken for running hybrid model CNN+ RNN is : {t_time}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 5: early stopping


  saving_api.save_model(


Test accuracy: 0.8721725940704346


total time taken for running hybrid model CNN+ RNN is : 291.09597086906433
