In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout, Embedding,  BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

import warnings
# Suppress every Python warning for the rest of the session. This keeps the
# cell outputs short but also hides deprecation notices from the libraries
# imported above.
warnings.filterwarnings('ignore')

In [2]:
# Load the review dataset from the CSV file that sits next to the notebook
# and report its (rows, columns) shape as a quick sanity check.
DATASET_PATH = "Amazon-Product-Review-Sentiment-Analysis-using-RNN-Dataset.csv"
data = pd.read_csv(DATASET_PATH)
print(data.shape)


(25000, 2)


In [3]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     24999 non-null  object
 1   Sentiment  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [4]:
# Exploratory data analysis (EDA): count the missing values per column,
# drop every row that contains one, then confirm nothing is left.
nulls_before = data.isna().sum()
print("Null Values:\n", nulls_before)

data = data.dropna()

nulls_after = data.isna().sum()
print("Null Values after dropping:\n", nulls_after)

Null Values:
 Review       1
Sentiment    0
dtype: int64
Null Values after dropping:
 Review       0
Sentiment    0
dtype: int64


In [6]:
# Count the reviews per sentiment label to check the class balance.
data['Sentiment'].value_counts()

1    5000
2    5000
3    5000
4    5000
5    4999
Name: Sentiment, dtype: int64

In [7]:
# Text cleaning: fetch the NLTK resources used by clean_reviews below.
# The English stop-word list is stored as a set for membership tests.
# NOTE(review): word_tokenize needs tokenizer data; which resource name the
# installed NLTK version expects ("punkt" here) should be confirmed.
nltk.download("stopwords") 
stop_words = set(stopwords.words('english')) 
nltk.download("punkt") 
def clean_reviews(text):
    """Turn one raw review string into a list of cleaned, lowercase tokens.

    Steps, in order:
      1. remove HTML tags (anything between ``<`` and the next ``>``),
      2. remove every character that is not a letter, digit or whitespace,
      3. remove runs of digits,
      4. lowercase the text,
      5. tokenize with ``word_tokenize`` and drop tokens found in the
         module-level ``stop_words`` set.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Non-greedy match so that each tag is removed on its own instead of
    # everything between the first '<' and the last '>'.
    text = re.sub(r'<.*?>', '', text)

    # Negated character class: keep letters, digits and whitespace only.
    # The previous pattern started with '~' instead of '^', which made it a
    # positive class and stripped all the words, leaving only punctuation.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Numbers carry no sentiment here, so drop them as well.
    text = re.sub(r'\d+', '', text)
    text = text.lower()

    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]
# Replace every raw review with the token list returned by clean_reviews.
# NOTE(review): clean_reviews passes its argument to re.sub, so re-running
# this cell on the already tokenized column is expected to fail; reload the
# data first.
data['Review'] = data['Review'].apply(clean_reviews) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\石天辰\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\石天辰\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [8]:
# Fit the vocabulary on the cleaned reviews and map each review to a
# sequence of integer word ids.
tokenizer = Tokenizer()
reviews_to_list = data['Review'].tolist()
tokenizer.fit_on_texts(reviews_to_list)

text_sequences = tokenizer.texts_to_sequences(reviews_to_list)

# Despite its name, max_words is the fixed sequence length (in tokens) that
# every review is padded or truncated to; later cells reuse it under this name.
max_words = 500

# Pad once. The earlier version padded the already padded array a second
# time and rebound text_sequences to it, which changed nothing but hid
# which variable held the model input.
padded_sequences = pad_sequences(text_sequences, maxlen=max_words)
X = padded_sequences

# One-hot encode the sentiment label into Sentiment_1 ... Sentiment_5.
# The guard keeps the cell re-runnable: once the Sentiment column has been
# expanded, a second get_dummies call on it would raise a KeyError.
if 'Sentiment' in data.columns:
    data = pd.get_dummies(data, columns=['Sentiment'])

y = data[['Sentiment_1', 'Sentiment_2', 'Sentiment_3', 'Sentiment_4', 'Sentiment_5']]
print(X.shape, y.shape)

(24999, 500) (24999, 5)


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(19999, 500) (5000, 500) (19999, 5) (5000, 5)


In [10]:
# Baseline model: embedding -> two stacked SimpleRNN layers -> softmax over
# the five sentiment classes.
rnn = Sequential(name="Simple_RNN")

# The vocabulary size is len(word_index) + 1 so that the padding id 0 also
# gets a row. Note that the embedding dimension is set to max_words (500)
# as well, i.e. the same number is used for sequence length and vector size.
rnn.add(Embedding(len(tokenizer.word_index) + 1, max_words, input_length=max_words))

# The first recurrent layer returns the full sequence so the second one can
# consume it; the second returns only its final state for the classifier.
rnn.add(SimpleRNN(128, activation='relu', return_sequences=True))
rnn.add(SimpleRNN(64, activation='relu', return_sequences=False))
rnn.add(Dense(5, activation='softmax'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
rnn.summary()

None


In [11]:
# Configure the baseline for multi-class classification on one-hot labels.
rnn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

# Train for two epochs. The test split is also passed as validation data, so
# the per-epoch validation numbers and the final score come from the same
# samples.
history = rnn.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=2,
    verbose=1,
    validation_data=(X_test, y_test),
)

rnn_score = rnn.evaluate(X_test, y_test, verbose=1)
print("Simple_RNN Score-->", rnn_score)

Epoch 1/2
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 1s/step - accuracy: 0.2383 - loss: 1.5920 - val_accuracy: 0.2576 - val_loss: 1.5722
Epoch 2/2
[1m220/313[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m2:00[0m 1s/step - accuracy: 0.2710 - loss: 1.5670

KeyboardInterrupt: 

In [None]:
# LSTM classifier: embedding -> LSTM -> dense head with batch normalisation
# and dropout -> softmax over the five sentiment classes.
model = Sequential(name="LSTM_Model")
model.add(Embedding(len(tokenizer.word_index) + 1, max_words, input_length=max_words))

# Only the final LSTM state is passed on to the dense head.
model.add(LSTM(150, return_sequences=False))

# The BatchNormalization layers were previously created but never added to
# the model (a bare BatchNormalization() call), so they had no effect.
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Hidden dense layer with ReLU activation.
model.add(Dense(50, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Output layer: one softmax unit per sentiment class.
model.add(Dense(5, activation='softmax'))

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

In [None]:
# Configure the LSTM classifier for multi-class classification on one-hot
# labels.
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

# Train the LSTM model for three epochs. As with the baseline, the test
# split is reused as validation data. This rebinds `history`, so the plots
# further down describe this run.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=3,
    verbose=1,
    validation_data=(X_test, y_test),
)

# Evaluate on the test split and print the result.
lstm_score = model.evaluate(X_test, y_test, verbose=1)
print("LSTM model Score---> ", lstm_score)

In [None]:
# Plot the training history of the most recent fit() call: loss on the left,
# accuracy on the right, each with its validation counterpart.
metrics = history.history
epochs = history.epoch

plt.figure(figsize=(10, 5))

# Training and validation loss. Each curve gets its own plot() call with an
# explicit x axis; the previous plt.plot(x, y1, y2) form drew the second
# curve against its implicit index, which only lined up because the epochs
# start at 0.
plt.subplot(1, 2, 1)
plt.plot(epochs, metrics['loss'], label='loss')
plt.plot(epochs, metrics['val_loss'], label='val_loss')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Training and validation accuracy.
plt.subplot(1, 2, 2)
plt.plot(epochs, metrics['accuracy'], label='accuracy')
plt.plot(epochs, metrics['val_accuracy'], label='val_accuracy')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Accuracy')

# Keep the two panels from overlapping and render without the trailing
# text repr of the last call.
plt.tight_layout()
plt.show()

In [None]:
# Convert the one-hot test labels and the predicted class probabilities to
# class indices (0-based, in the column order of y_test).
y_true = np.argmax(y_test.values, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=1)

# Confusion matrix. num_classes is fixed to the number of label columns so
# the matrix keeps its full size even if the highest class index never
# occurs in y_true or y_pred.
n_classes = y_test.shape[1]
cm = tf.math.confusion_matrix(y_true, y_pred, num_classes=n_classes)

# Label the axes with the actual ratings (the suffix of each Sentiment_<n>
# column) rather than the 0-based class indices.
rating_labels = [str(column).split('_')[-1] for column in y_test.columns]

# Plotting the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='g',
            xticklabels=rating_labels, yticklabels=rating_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split. Class index i stands
# for rating i + 1 (column Sentiment_<i + 1>), so the rows are named by
# rating instead of by 0-based index. Passing `labels` explicitly keeps all
# classes in the report even if one of them never shows up.
class_indices = list(range(y_test.shape[1]))
rating_names = [f"Rating {index + 1}" for index in class_indices]

report = classification_report(
    y_true,
    y_pred,
    labels=class_indices,
    target_names=rating_names,
)
print(report)

In [None]:
def predict_review_rating(text):
    """Predict the rating (1-5) for a single raw review string.

    The review goes through the same pipeline as the training data:
    ``clean_reviews`` -> ``tokenizer.texts_to_sequences`` ->
    ``pad_sequences`` to ``max_words`` tokens. Previously the raw string was
    handed to the tokenizer directly, so inference text was preprocessed
    differently from the text the tokenizer and model were fitted on.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    integer
        Index of the most probable class plus one, i.e. the rating that
        corresponds to the ``Sentiment_<n>`` label columns.
    """
    tokens = clean_reviews(text)
    text_sequences_test = tokenizer.texts_to_sequences([tokens])
    testing = pad_sequences(text_sequences_test, maxlen=max_words)
    y_pred_test = np.argmax(model.predict(testing), axis=1)
    # Class indices are 0-based while ratings start at 1.
    return y_pred_test[0] + 1

# Smoke test: run the trained model on two hand-written reviews, one clearly
# negative and one clearly positive, and print the predicted rating for each.
rating1 = predict_review_rating('Worst product')
print("The rating according to the review is: ", rating1)

rating2 = predict_review_rating('Awesome product,  I will recommend this to other users.')
print("The rating according to the review is: ", rating2)