https://www.kaggle.com/code/mohamedtarek77/imdb-lstm-and-dnn

In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [1]:
import os
import pandas as pd

In [2]:
os.chdir('E:\Python code\IBM 文本分类数据')

file_name = 'IMDB Dataset.csv'
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# 数据处理

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    processed_text = ' '.join(tokens)
    return processed_text
df['processed_review'] = df['review'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...


In [6]:
df.drop('review', axis=1, inplace=True)

In [7]:
X = df.drop('sentiment', axis=1)
y = df.drop('processed_review', axis=1)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [10]:
X_train = X_train['processed_review'].tolist()
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train_sequences = word_tokenizer.texts_to_sequences(X_train)
X_test_sequences = word_tokenizer.texts_to_sequences(X_test)
max_length = max([len(seq) for seq in X_train_sequences])
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')
vocab_length = len(word_tokenizer.word_index) + 1
print(f"Vocabulary Length: {vocab_length}")
print(f"X_train_padded Shape: {X_train_padded.shape}")
print(f"X_test_padded Shape: {X_test_padded.shape}")

Vocabulary Length: 192333
X_train_padded Shape: (40000, 1429)
X_test_padded Shape: (1, 1429)


In [12]:
X_train_padded, X_test_padded, y_train, y_test = train_test_split(X_train_padded, y_train, test_size=0.2, random_state=21)

In [13]:
print(f"X_train_padded Shape: {X_train_padded.shape}")
print(f"y_train Shape: {y_train.shape}")
print(f"X_test_padded Shape: {X_test_padded.shape}")
print(f"y_test Shape: {y_test.shape}")

X_train_padded Shape: (32000, 1429)
y_train Shape: (32000,)
X_test_padded Shape: (8000, 1429)
y_test Shape: (8000,)


# Using DNN

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout
embedding_dim = 100  
max_length = 1429 
model = Sequential([
    Embedding(input_dim=vocab_length, output_dim=embedding_dim, input_length=max_length),        
    Dense(64, activation='relu'),    
    Dropout(0.7),    
    Dense(32, activation='relu'),    
    Dropout(0.7),
    Flatten(),    
    Dense(1, activation='sigmoid')
])



In [15]:
model.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

In [16]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [17]:
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=200, validation_data=(X_test_padded, y_test),
                    callbacks=[early_stopping])

Epoch 1/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 238ms/step - accuracy: 0.5188 - loss: 0.6915 - val_accuracy: 0.7894 - val_loss: 0.4976
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 234ms/step - accuracy: 0.8223 - loss: 0.3977 - val_accuracy: 0.8830 - val_loss: 0.2883
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 235ms/step - accuracy: 0.9353 - loss: 0.1709 - val_accuracy: 0.8879 - val_loss: 0.2957
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 241ms/step - accuracy: 0.9655 - loss: 0.0926 - val_accuracy: 0.8791 - val_loss: 0.3481
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 237ms/step - accuracy: 0.9791 - loss: 0.0596 - val_accuracy: 0.8759 - val_loss: 0.4090
