In [1]:
import random

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

In [151]:
df = pd.read_csv('genshin_review.csv')

In [153]:
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    words = nltk.wordpunct_tokenize(text)
    
    # Lowercase and lemmatize
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)
    
    return text

df['review'] = df['review'].apply(clean_text)

In [154]:
df['rating'] = df['rating'].apply(lambda x: 1 if x > 3 else 0)

In [155]:
num_zeros = (df['rating'] == 0).sum()
print(num_zeros)

336


In [207]:
# Separate majority and minority classes
df_majority = df[df.rating==1]
df_minority = df[df.rating==0]

# Downsample majority class
from sklearn.utils import resample
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=336,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled.rating.value_counts()
df_downsampled.reset_index(drop=True, inplace=True)

In [208]:
X_train, X_test, y_train, y_test = train_test_split(df_downsampled['review'], df_downsampled['rating'], test_size=0.25, random_state=25)

In [209]:
tokenizer = Tokenizer(num_words=10000, oov_token='OOV')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Padding sequences
X_train = pad_sequences(X_train, padding='post')
X_test = pad_sequences(X_test, padding='post')

In [210]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [211]:
checkpoint = ModelCheckpoint('model-{epoch:03d}-{accuracy:03f}-{val_accuracy:03f}.keras', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')
model.fit(X_train, y_train, epochs=15, validation_data=(X_test, y_test), callbacks=[checkpoint])

Epoch 1/15
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.4669 - loss: 0.6947
Epoch 1: val_loss improved from inf to 0.69549, saving model to model-001-0.490079-0.464286.keras
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 133ms/step - accuracy: 0.4683 - loss: 0.6947 - val_accuracy: 0.4643 - val_loss: 0.6955
Epoch 2/15
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.5414 - loss: 0.6913
Epoch 2: val_loss did not improve from 0.69549
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/step - accuracy: 0.5399 - loss: 0.6914 - val_accuracy: 0.4643 - val_loss: 0.6962
Epoch 3/15
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.4928 - loss: 0.6942
Epoch 3: val_loss improved from 0.69549 to 0.69333, saving model to model-003-0.503968-0.470238.keras
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 99ms/step - accuracy: 0.4934 - 

<keras.src.callbacks.history.History at 0x22bf77ecc10>

In [226]:
def predict_mood(review):
    review = clean_text(review)
    review = tokenizer.texts_to_sequences([review])
    review = pad_sequences(review, padding='post')
    prediction = model.predict(review)
    print(prediction)
    if 0.5 <= prediction < 0.51:
        return 'Neutral/Not graded'
    elif prediction >= 0.51:
        return 'Positive'
    else:
        return 'Negative'
    
import random

random_index = random.randint(0, 650)
review = df_downsampled['review'][random_index]
rating = df_downsampled['rating'][random_index]
# 
# print(f"Review: {review}")
# print(f"Rating: {rating}")
# print(predict_mood(review))
message = "I want to kill myself when I play this crap"
print(message)
print(predict_mood(message))

I want to kill myself when I play this crap
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[[0.49822283]]
Negative


In [222]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.6488


In [223]:
from sklearn.metrics import f1_score

# Get predicted values
y_pred = model.predict(X_test)
y_pred = np.where(y_pred >= 0.5, 1, 0)  # convert probabilities to binary output

# Compute F1 score
f1 = f1_score(y_test, y_pred)
print("Testing F1 Score: {:.4f}".format(f1))

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
Testing F1 Score: 0.6550


In [224]:
from sklearn.metrics import recall_score

# Compute recall
recall = recall_score(y_test, y_pred)
print("Testing Recall: {:.4f}".format(recall))

Testing Recall: 0.7273
