PS: Classification using a Deep Neural Network - binary classification using deep neural networks. Example: classify movie reviews into "positive" reviews and "negative" reviews, based only on the text content of the reviews. Use the IMDB dataset.

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load the IMDb reviews dataset (one text column `review`, one label column `sentiment`).
# pandas must be pointed at the *raw* file: a github.com/<user>/<repo>/blob/... URL
# returns the HTML viewer page, which read_csv cannot parse as CSV.
DATA_URL = (
    "https://raw.githubusercontent.com/tejwhatsnew/LP5/"
    "781acea9e3da70e0fa6a12461c44175ce76feb70/DL/IMDB_Dataset.csv"
)
df = pd.read_csv(DATA_URL)

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
max_words = 1000      # Vocabulary size: the tokenizer keeps only the most frequent words (ids < max_words)
max_len = 150         # Every review is padded / truncated to exactly this many token ids

# Build the word -> integer-id vocabulary from the review texts.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['review'])

# Convert each review to a list of token ids; words outside the vocabulary are dropped.
sequences = tokenizer.texts_to_sequences(df['review'])

# pad_sequences defaults to padding='pre' and truncating='pre': short reviews are
# left-padded with 0 and long reviews keep their LAST max_len tokens.
X = pad_sequences(sequences, maxlen=max_len)

# Encode the labels as 1 (positive) / 0 (negative). Series.map yields NaN for any
# value missing from the mapping, so fail loudly instead of training on NaN labels.
labels = df['sentiment'].map({'positive': 1, 'negative': 0})
if labels.isna().any():
    unexpected = sorted(df.loc[labels.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels: {unexpected}")
y = np.array(labels, dtype=np.int64)

In [None]:
# Inspect the padded token-id matrix (one row per review, max_len columns).
X

array([[433,  14,  12, ...,  16, 125, 486],
       [  0,   0,   0, ...,  23,  69, 221],
       [  0,   0,   0, ...,  63,  16, 350],
       ...,
       [262, 764, 177, ...,  16,   2,   2],
       [  0,   0,   0, ...,  67, 739,  42],
       [  0,   0,   0, ..., 794,  11,  17]], dtype=int32)

In [None]:
# Inspect the binary label vector (1 = positive, 0 = negative).
y

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# Hold out 20% of the reviews as a test set; random_state pins the shuffle so the
# same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the neural network model
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All" (GPU kernels may still
# introduce small nondeterminism). Requires TensorFlow >= 2.7.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Map each token id (0 .. max_words-1) to a trainable 32-dimensional vector.
    Embedding(input_dim=max_words, output_dim=32, input_length=max_len),
    # Collapse the (max_len, 32) embedding matrix into one feature vector per review.
    Flatten(),
    # Small hidden layer.
    Dense(16, activation='relu'),
    # Single sigmoid unit: outputs a value in (0, 1) for the binary label.
    Dense(1, activation='sigmoid')
])

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss for the single
# sigmoid output, and accuracy reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)

In [None]:
# Fit for 5 epochs in mini-batches of 128 reviews; 20% of the training data is
# held out for per-epoch validation. The returned History object keeps the
# per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the trained model on the held-out test set and report both metrics
# (loss first, then accuracy, in the order given to compile()).
test_loss, test_accuracy = model.evaluate(X_test, y_test)
for label, value in (("Test Loss:", test_loss), ("Test Accuracy:", test_accuracy)):
    print(label, value)

Test Loss: 0.48288047313690186
Test Accuracy: 0.8307999968528748
