https://www.kaggle.com/code/ahmedgaitani/imdb-simple-rnn-code

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
import os

pandas and numpy: Used for data manipulation and numerical operations.

tensorflow: Deep learning library used to build and train the RNN model.

sklearn: Used for splitting the dataset and calculating accuracy.

In [4]:
os.chdir('E:\Python code\IBM 文本分类数据')

file_name = 'IMDB Dataset.csv'
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Preprocessing the Data


In [5]:
sentences = df['review'].values
labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [7]:
labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

Tokenization and Padding

In [8]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

Tokenizer: Converts text into sequences of integers, where each integer represents a word in the vocabulary.

num_words=5000: Limits the tokenizer to the top 10,000 most frequent words.

fit_on_texts: Learns the vocabulary from the sentences.

texts_to_sequences: Transforms each review into a sequence of integers.

In [9]:
maxlen = 200
X = pad_sequences(sequences, maxlen=maxlen)
y = np.array(labels)

Splitting the Data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=21)

Building the RNN Model

In [12]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SimpleRNN(64, return_sequences=False, 
               kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(32, activation='relu', 
                kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

Sequential(): Initializes a linear stack of layers for the model.

Embedding(input_dim=5000, output_dim=128): Maps 5,000 unique words to dense 128-dimensional vectors.

SimpleRNN(64, return_sequences=False): Adds a Simple RNN layer with 64 units and applies L2 regularization to reduce overfitting.

Dropout(0.7): Drops 70% of the neurons randomly during training to prevent overfitting.

Dense(32, activation='relu'): Adds a fully connected layer with 32 units, ReLU activation, and L2 regularization.

Dense(1, activation='sigmoid'): Adds an output layer with 1 unit for binary classification using sigmoid activation.

# Compiling the Model

In [13]:
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='binary_crossentropy', metrics=['accuracy'])

optimizer=Adam(learning_rate=0.0001): Uses the Adam optimizer with a learning rate of 0.0001 for efficient gradient descent.

loss='binary_crossentropy': Loss function for binary classification.

metrics=['accuracy']: Tracks the accuracy during training and evaluation.

# Training the Model

In [14]:
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, 
                    epochs=10, batch_size=128, 
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 62ms/step - accuracy: 0.4977 - loss: 1.8847 - val_accuracy: 0.5838 - val_loss: 1.5753
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 61ms/step - accuracy: 0.5597 - loss: 1.4946 - val_accuracy: 0.7417 - val_loss: 1.2384
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 60ms/step - accuracy: 0.7245 - loss: 1.1773 - val_accuracy: 0.8365 - val_loss: 0.9336
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 58ms/step - accuracy: 0.8202 - loss: 0.9286 - val_accuracy: 0.8390 - val_loss: 0.7774
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 59ms/step - accuracy: 0.8623 - loss: 0.7563 - val_accuracy: 0.8553 - val_loss: 0.6527
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 59ms/step - accuracy: 0.8808 - loss: 0.6417 - val_accuracy: 0.8715 - val_loss: 0.5571
Epoch 7/10
[1m3

EarlyStopping(monitor='val_loss'): Monitors the validation loss during training and stops if it doesn't improve.

patience=3: Stops training if the validation loss doesn't improve for 3 consecutive epochs.

fit: Trains the model using the training data. The validation data is used to evaluate the model during training.

epochs=10: The model will go through the entire dataset 10 times.

batch_size=128: The number of samples processed before the model is updated.

# Evaluating the Model

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.8748 - loss: 0.4188
Test Accuracy: 0.8737


In [16]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f'Accuracy Score: {accuracy_score(y_test, y_pred):.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step
Accuracy Score: 0.8737


predict: Generates predictions for the test data.

y_pred: Converts probabilities to binary predictions (0 or 1).

accuracy_score: Computes the accuracy between the true labels and the predicted labels.