# Monster Hunter Franchise Review Sentiment Analysis
---

## 1. Read Scrapping Results

In [None]:
import pandas as pd

In [9]:
column_names = ['url', 'review', 'is_recommended']

df_mhwilds = pd.read_csv("./data/mhwilds-reviews.csv", header=None, names=column_names)
df_mhwilds.head(3)

Unnamed: 0,url,review,is_recommended
0,https://steamcommunity.com/id/xSinek/recommend...,Monster Hunter Wilds: A Majestic Hunt Marred b...,Recommended
1,https://steamcommunity.com/id/ze11an/recommend...,"Ride monster,Life good,Monster fight back,Kill...",Recommended
2,https://steamcommunity.com/profiles/7656119829...,If this review gets 1 like I will get an Arkve...,Recommended


In [10]:
df_mhrise = pd.read_csv("./data/mhrise-reviews.csv", header=None, names=column_names)
df_mhrise.head(3)

Unnamed: 0,url,review,is_recommended
0,https://steamcommunity.com/profiles/7656119816...,I've been playing for 3 Hours and its already ...,Recommended
1,https://steamcommunity.com/id/metaLfaceshriLL/...,I LOVE KILLING ENDANGERED SPECIES,Recommended
2,https://steamcommunity.com/profiles/7656119831...,Many monsters think they can outsmart me with ...,Recommended


In [15]:
df_monsterhunter = pd.concat([df_mhwilds, df_mhrise], ignore_index=True)
df_monsterhunter

Unnamed: 0,url,review,is_recommended
0,https://steamcommunity.com/id/xSinek/recommend...,Monster Hunter Wilds: A Majestic Hunt Marred b...,Recommended
1,https://steamcommunity.com/id/ze11an/recommend...,"Ride monster,Life good,Monster fight back,Kill...",Recommended
2,https://steamcommunity.com/profiles/7656119829...,If this review gets 1 like I will get an Arkve...,Recommended
3,https://steamcommunity.com/profiles/7656119948...,My grandma runs better than this game,Not Recommended
4,https://steamcommunity.com/id/kirigherkins/rec...,very immersive game. you can cook a well-done ...,Recommended
...,...,...,...
18595,https://steamcommunity.com/profiles/7656119798...,DRM that breaks the game.,Not Recommended
18596,https://steamcommunity.com/profiles/7656119805...,Capcom decided to break another game that work...,Not Recommended
18597,https://steamcommunity.com/id/sopheon/recommen...,New DRM added years after release left the gam...,Not Recommended
18598,https://steamcommunity.com/profiles/7656119820...,Adding DRM to a game that came out two years a...,Not Recommended


In [16]:
df_monsterhunter.describe()

Unnamed: 0,url,review,is_recommended
count,18600,18581,18600
unique,18598,17860,2
top,https://steamcommunity.com/profiles/7656119807...,yes,Recommended
freq,2,46,9888


## 2. Text Cleaning / Preprocessing

In [95]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split

In [59]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [80]:
df_prep = df_monsterhunter.copy()

In [81]:
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18600 entries, 0 to 18599
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             18600 non-null  object
 1   review          18581 non-null  object
 2   is_recommended  18600 non-null  object
dtypes: object(3)
memory usage: 436.1+ KB


In [82]:
df_prep.url = df_prep.url.astype("string")
df_prep.review = df_prep.review.astype("string")
df_prep.is_recommended = df_prep.is_recommended.astype("string")

df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18600 entries, 0 to 18599
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             18600 non-null  string
 1   review          18581 non-null  string
 2   is_recommended  18600 non-null  string
dtypes: string(3)
memory usage: 436.1 KB


* Handle Negations (Critical for Sentiment)

In [83]:
def handle_negations(text):
    negation_words = r'\b(?:not|no|never|cannot|can\'t|won\'t|didn\'t|doesn\'t|don\'t|isn\'t|aren\'t|wasn\'t|weren\'t|shouldn\'t|couldn\'t|wouldn\'t|haven\'t|hasn\'t|hadn\'t)\b'
    pattern = re.compile(rf'({negation_words})\s+([a-zA-Z]+)')
    return pattern.sub(lambda m: f"{m.group(1)}_{m.group(2)}", text)

* Strip HTML tags, URLs, and special characters (e.g., \n, emojis)

In [84]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = handle_negations(text)
    text = re.sub(r'[^a-zA-Z_\s]', '', text)  # Keep only letters and spaces
    return text.strip()

* Lowercase conversion

In [85]:
df_prep['cleaned_review'] = df_prep['review'].apply(lambda x: clean_text(str(x)).lower())

In [86]:
df_prep[["review", "cleaned_review"]].head(3)

Unnamed: 0,review,cleaned_review
0,Monster Hunter Wilds: A Majestic Hunt Marred b...,monster hunter wilds a majestic hunt marred by...
1,"Ride monster,Life good,Monster fight back,Kill...",ride monsterlife goodmonster fight backkill mo...
2,If this review gets 1 like I will get an Arkve...,if this review gets like i will get an arkvel...


* Tokenization 

In [87]:
df_prep['tokens'] = df_prep['cleaned_review'].apply(word_tokenize)
df_prep[["cleaned_review", "tokens"]].head(3)

Unnamed: 0,cleaned_review,tokens
0,monster hunter wilds a majestic hunt marred by...,"[monster, hunter, wilds, a, majestic, hunt, ma..."
1,ride monsterlife goodmonster fight backkill mo...,"[ride, monsterlife, goodmonster, fight, backki..."
2,if this review gets like i will get an arkvel...,"[if, this, review, gets, like, i, will, get, a..."


* Lemmatization/Stemming

In [88]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [89]:
df_prep['lemmatized'] = df_prep['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x if word not in stop_words])
df_prep[["tokens", "lemmatized"]].head(3)

Unnamed: 0,tokens,lemmatized
0,"[monster, hunter, wilds, a, majestic, hunt, ma...","[monster, hunter, wild, majestic, hunt, marred..."
1,"[ride, monsterlife, goodmonster, fight, backki...","[ride, monsterlife, goodmonster, fight, backki..."
2,"[if, this, review, gets, like, i, will, get, a...","[review, get, like, get, arkveld, tattoo, fore..."


* Labeling with VADER

In [90]:
sid = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = sid.polarity_scores(text)
    if scores['compound'] > 0.05:
        return 'positive'
    elif scores['compound'] < -0.05:
        return 'negative'
    else:
        return 'neutral'

In [91]:
df_prep['sentiment'] = df_prep['cleaned_review'].apply(get_sentiment)
df_prep.head(3)

Unnamed: 0,url,review,is_recommended,cleaned_review,tokens,lemmatized,sentiment
0,https://steamcommunity.com/id/xSinek/recommend...,Monster Hunter Wilds: A Majestic Hunt Marred b...,Recommended,monster hunter wilds a majestic hunt marred by...,"[monster, hunter, wilds, a, majestic, hunt, ma...","[monster, hunter, wild, majestic, hunt, marred...",positive
1,https://steamcommunity.com/id/ze11an/recommend...,"Ride monster,Life good,Monster fight back,Kill...",Recommended,ride monsterlife goodmonster fight backkill mo...,"[ride, monsterlife, goodmonster, fight, backki...","[ride, monsterlife, goodmonster, fight, backki...",negative
2,https://steamcommunity.com/profiles/7656119829...,If this review gets 1 like I will get an Arkve...,Recommended,if this review gets like i will get an arkvel...,"[if, this, review, gets, like, i, will, get, a...","[review, get, like, get, arkveld, tattoo, fore...",positive


In [93]:
df_prep["is_recommended"].value_counts()

is_recommended
Recommended        9888
Not Recommended    8712
Name: count, dtype: Int64

In [92]:
df_prep["sentiment"].value_counts()

sentiment
positive    10620
negative     4223
neutral      3757
Name: count, dtype: int64

In [98]:
x_train, x_test, y_train, y_test = train_test_split(df_prep["cleaned_review"], df_prep["sentiment"], test_size=0.2, random_state=42)

In [104]:
print(f"Train data: {x_train.shape}")
print(f"Test data: {x_test.shape}")

Train data: (14880,)
Test data: (3720,)


## Feature Extraction

In [153]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np

### 1. TF-IDF (Term Frequency-Inverse Document Frequency)

In [131]:
tfidf = TfidfVectorizer(max_features=5000)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [108]:
print(f"Train data: {x_train_tfidf.shape}")
print(f"Test data: {x_test_tfidf.shape}")

Train data: (14880, 5000)
Test data: (3720, 5000)


### 2. Word Embeddings (Word2Vec)

In [130]:
tokenized_train = [text.split() for text in x_train]
tokenized_test = [text.split() for text in x_test]

w2v_model = Word2Vec(tokenized_train, vector_size=300, window=5, min_count=1, workers=4)

In [112]:
def average_word_vectors(text, model, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in text:
        if word in model.wv:
            feature_vector = np.add(feature_vector, model.wv[word])
            n_words += 1
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

In [113]:
x_train_w2v = np.array([average_word_vectors(text, w2v_model, 300) for text in tokenized_train])
x_test_w2v = np.array([average_word_vectors(text, w2v_model, 300) for text in tokenized_test])

In [114]:
print(f"Train data: {x_train_w2v.shape}")
print(f"Test data: {x_test_w2v.shape}")

Train data: (14880, 300)
Test data: (3720, 300)


In [154]:
tfidf_ngram = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
x_train_tfidf_ngram = tfidf.fit_transform(x_train)
x_test_tfidf_ngram = tfidf.transform(x_test)

In [155]:
print(f"Train data: {x_train_tfidf_ngram.shape}")
print(f"Test data: {x_test_tfidf_ngram.shape}")

Train data: (14880, 5000)
Test data: (3720, 5000)


## Train Model

**1. Logistic Regression**

In [117]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000, multi_class='multinomial')

**2. Random Forest**

In [118]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [119]:
#PUT
lr.fit(x_train_tfidf, y_train)
print("TF-IDF + LR Accuracy:", lr.score(x_test_tfidf, y_test))

TF-IDF + LR Accuracy: 0.8516129032258064


In [120]:
rf.fit(x_train_tfidf, y_train)
print("TF-IDF + RF Accuracy:", rf.score(x_test_tfidf, y_test))

TF-IDF + RF Accuracy: 0.7768817204301075


In [127]:
lr.fit(x_train_w2v, y_train)
print("Word2Vec + LR Accuracy:", lr.score(x_test_w2v, y_test))

Word2Vec + LR Accuracy: 0.7123655913978495


In [128]:
rf.fit(x_train_w2v, y_train)
print("Word2Vec + RF Accuracy:", rf.score(x_test_w2v, y_test))

Word2Vec + RF Accuracy: 0.7247311827956989


In [156]:
lr.fit(x_train_tfidf_ngram, y_train)
print("N-grams + LR Accuracy:", lr.score(x_test_tfidf_ngram, y_test))

N-grams + LR Accuracy: 0.8516129032258064


In [157]:
rf.fit(x_train_tfidf_ngram, y_train)
print("N-grams + RF Accuracy:", rf.score(x_test_tfidf_ngram, y_test))

N-grams + RF Accuracy: 0.7768817204301075


In [164]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(x_train_w2v.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # 3 classes: positive, negative, neutral
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_w2v, y_train_encoded, epochs=10, batch_size=32)

2025-05-07 18:50:14.555923: W tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc:40] 'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_UNSUPPORTED_PTX_VERSION'

2025-05-07 18:50:14.555946: W tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc:40] 'cuModuleGetFunction(&function, module, kernel_name)' failed with 'CUDA_ERROR_INVALID_HANDLE'

2025-05-07 18:50:14.555954: W tensorflow/core/framework/op_kernel.cc:1844] INTERNAL: 'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, 0, reinterpret_cast<CUstream>(stream), params, nullptr)' failed with 'CUDA_ERROR_INVALID_HANDLE'


InternalError: {{function_node __wrapped__Cast_device_/job:localhost/replica:0/task:0/device:GPU:0}} 'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, 0, reinterpret_cast<CUstream>(stream), params, nullptr)' failed with 'CUDA_ERROR_INVALID_HANDLE' [Op:Cast] name: 

In [163]:
import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
