# Monster Hunter Franchise Review Sentiment Analysis
---

## 1. Read Scrapping Results

In [1]:
import pandas as pd

In [2]:
column_names = ['url', 'review', 'is_recommended']

df_mhwilds = pd.read_csv("./data/mhwilds-reviews.csv", header=None, names=column_names)
df_mhwilds.head(3)

Unnamed: 0,url,review,is_recommended
0,https://steamcommunity.com/id/xSinek/recommend...,Monster Hunter Wilds: A Majestic Hunt Marred b...,Recommended
1,https://steamcommunity.com/id/ze11an/recommend...,"Ride monster,Life good,Monster fight back,Kill...",Recommended
2,https://steamcommunity.com/profiles/7656119829...,If this review gets 1 like I will get an Arkve...,Recommended


In [3]:
df_mhrise = pd.read_csv("./data/mhrise-reviews.csv", header=None, names=column_names)
df_mhrise.head(3)

Unnamed: 0,url,review,is_recommended
0,https://steamcommunity.com/profiles/7656119816...,I've been playing for 3 Hours and its already ...,Recommended
1,https://steamcommunity.com/id/metaLfaceshriLL/...,I LOVE KILLING ENDANGERED SPECIES,Recommended
2,https://steamcommunity.com/profiles/7656119831...,Many monsters think they can outsmart me with ...,Recommended


In [4]:
df_monsterhunter = pd.concat([df_mhwilds, df_mhrise], ignore_index=True)
df_monsterhunter

Unnamed: 0,url,review,is_recommended
0,https://steamcommunity.com/id/xSinek/recommend...,Monster Hunter Wilds: A Majestic Hunt Marred b...,Recommended
1,https://steamcommunity.com/id/ze11an/recommend...,"Ride monster,Life good,Monster fight back,Kill...",Recommended
2,https://steamcommunity.com/profiles/7656119829...,If this review gets 1 like I will get an Arkve...,Recommended
3,https://steamcommunity.com/profiles/7656119948...,My grandma runs better than this game,Not Recommended
4,https://steamcommunity.com/id/kirigherkins/rec...,very immersive game. you can cook a well-done ...,Recommended
...,...,...,...
18595,https://steamcommunity.com/profiles/7656119798...,DRM that breaks the game.,Not Recommended
18596,https://steamcommunity.com/profiles/7656119805...,Capcom decided to break another game that work...,Not Recommended
18597,https://steamcommunity.com/id/sopheon/recommen...,New DRM added years after release left the gam...,Not Recommended
18598,https://steamcommunity.com/profiles/7656119820...,Adding DRM to a game that came out two years a...,Not Recommended


In [5]:
df_monsterhunter.describe()

Unnamed: 0,url,review,is_recommended
count,18600,18581,18600
unique,18598,17860,2
top,https://steamcommunity.com/profiles/7656119807...,yes,Recommended
freq,2,46,9888


## 2. Text Cleaning / Preprocessing

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
from tqdm import trange
tqdm.pandas()

In [7]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [8]:
df_prep = df_monsterhunter.copy()

In [9]:
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18600 entries, 0 to 18599
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             18600 non-null  object
 1   review          18581 non-null  object
 2   is_recommended  18600 non-null  object
dtypes: object(3)
memory usage: 436.1+ KB


In [10]:
df_prep.url = df_prep.url.astype("string")
df_prep.review = df_prep.review.astype("string")
df_prep.is_recommended = df_prep.is_recommended.astype("string")

df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18600 entries, 0 to 18599
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             18600 non-null  string
 1   review          18581 non-null  string
 2   is_recommended  18600 non-null  string
dtypes: string(3)
memory usage: 436.1 KB


* Strip HTML tags, URLs, and special characters (e.g., \n, emojis)

In [11]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z_\s]', '', text)  # Keep only letters and spaces
    return text.strip()

* Lowercase conversion

In [12]:
df_prep['cleaned_review'] = df_prep['review'].apply(lambda x: clean_text(str(x)).lower())

In [13]:
df_prep[["review", "cleaned_review"]].head(3)

Unnamed: 0,review,cleaned_review
0,Monster Hunter Wilds: A Majestic Hunt Marred b...,monster hunter wilds a majestic hunt marred by...
1,"Ride monster,Life good,Monster fight back,Kill...",ride monsterlife goodmonster fight backkill mo...
2,If this review gets 1 like I will get an Arkve...,if this review gets like i will get an arkvel...


* Tokenization 

In [14]:
df_prep['tokens'] = df_prep['cleaned_review'].apply(word_tokenize)
df_prep[["cleaned_review", "tokens"]].head(3)

Unnamed: 0,cleaned_review,tokens
0,monster hunter wilds a majestic hunt marred by...,"[monster, hunter, wilds, a, majestic, hunt, ma..."
1,ride monsterlife goodmonster fight backkill mo...,"[ride, monsterlife, goodmonster, fight, backki..."
2,if this review gets like i will get an arkvel...,"[if, this, review, gets, like, i, will, get, a..."


* Lemmatization/Stemming

In [15]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [16]:
df_prep['lemmatized'] = df_prep['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x if word not in stop_words])
df_prep[["tokens", "lemmatized"]].head(3)

Unnamed: 0,tokens,lemmatized
0,"[monster, hunter, wilds, a, majestic, hunt, ma...","[monster, hunter, wild, majestic, hunt, marred..."
1,"[ride, monsterlife, goodmonster, fight, backki...","[ride, monsterlife, goodmonster, fight, backki..."
2,"[if, this, review, gets, like, i, will, get, a...","[review, get, like, get, arkveld, tattoo, fore..."


In [17]:
df_prep['cleaned_review'] = df_prep['lemmatized'].apply(lambda x: ' '.join(x))
df_prep[["cleaned_review", "lemmatized"]].head(3)

Unnamed: 0,cleaned_review,lemmatized
0,monster hunter wild majestic hunt marred techn...,"[monster, hunter, wild, majestic, hunt, marred..."
1,ride monsterlife goodmonster fight backkill mo...,"[ride, monsterlife, goodmonster, fight, backki..."
2,review get like get arkveld tattoo forehead,"[review, get, like, get, arkveld, tattoo, fore..."


In [18]:
df_prep = df_prep[df_prep['cleaned_review'] != '']

* Labeling Review Score Based

In [19]:
sid = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = sid.polarity_scores(text)
    if scores['compound'] > 0.05:
        return 'positive'
    elif scores['compound'] < -0.05:
        return 'negative'
    else:
        return 'neutral'

df_prep['sentiment'] = df_prep['cleaned_review'].progress_apply(get_sentiment)
df_prep.head(3)

100%|██████████| 18397/18397 [00:04<00:00, 3925.54it/s]


Unnamed: 0,url,review,is_recommended,cleaned_review,tokens,lemmatized,sentiment
0,https://steamcommunity.com/id/xSinek/recommend...,Monster Hunter Wilds: A Majestic Hunt Marred b...,Recommended,monster hunter wild majestic hunt marred techn...,"[monster, hunter, wilds, a, majestic, hunt, ma...","[monster, hunter, wild, majestic, hunt, marred...",positive
1,https://steamcommunity.com/id/ze11an/recommend...,"Ride monster,Life good,Monster fight back,Kill...",Recommended,ride monsterlife goodmonster fight backkill mo...,"[ride, monsterlife, goodmonster, fight, backki...","[ride, monsterlife, goodmonster, fight, backki...",negative
2,https://steamcommunity.com/profiles/7656119829...,If this review gets 1 like I will get an Arkve...,Recommended,review get like get arkveld tattoo forehead,"[if, this, review, gets, like, i, will, get, a...","[review, get, like, get, arkveld, tattoo, fore...",positive


In [20]:
df_prep["sentiment"].value_counts()

sentiment
positive    10847
negative     4040
neutral      3510
Name: count, dtype: int64

In [21]:
x_train, x_test, y_train, y_test = train_test_split(df_prep["cleaned_review"], df_prep["sentiment"], test_size=0.2, random_state=0)

In [22]:
print(f"Train data: {x_train.shape}")
print(f"Test data: {x_test.shape}")

Train data: (14717,)
Test data: (3680,)


In [23]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [24]:
label_encoder.classes_

array(['negative', 'neutral', 'positive'], dtype=object)

In [25]:
print(f"Train label: {list(y_train[:3])}")
print(f"Train encoded label: {y_train_encoded[:3]}")

Train label: ['neutral', 'positive', 'neutral']
Train encoded label: [1 2 1]


In [26]:
print(f"Train label: {list(y_test[:3])}")
print(f"Train encoded label: {y_test_encoded[:3]}")

Train label: ['positive', 'positive', 'negative']
Train encoded label: [2 2 0]


## 3. Feature Extraction

In [27]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [28]:
def encode_texts_2d(texts, tokenizer, model, max_length=128, batch_size=32):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )

    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])
    loader = DataLoader(dataset, batch_size=batch_size)

    model.eval()
    all_embeddings = []

    with torch.no_grad():
        for batch_ids, batch_mask in tqdm(loader, desc="Encoding Batches"):
            batch_ids = batch_ids.to(device)
            batch_mask = batch_mask.to(device)
            outputs = model(input_ids=batch_ids, attention_mask=batch_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
            all_embeddings.append(cls_embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)  # Shape: (num_texts, hidden_size)


def encode_texts_3d(texts, tokenizer, model, max_length=128, batch_size=32):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )

    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])
    loader = DataLoader(dataset, batch_size=batch_size)

    model.eval()
    all_token_embeddings = []

    with torch.no_grad():
        for batch_ids, batch_mask in tqdm(loader, desc="Encoding Batches"):
            batch_ids = batch_ids.to(device)
            batch_mask = batch_mask.to(device)
            outputs = model(input_ids=batch_ids, attention_mask=batch_mask)
            token_embeddings = outputs.last_hidden_state
            all_token_embeddings.append(token_embeddings.cpu())

    return torch.cat(all_token_embeddings, dim=0)  # Shape: (num_texts, seq_len, hidden_size)

In [29]:
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
    !nvidia-smi
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
print(device)

NVIDIA H100 80GB HBM3
Sun May 25 05:48:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:66:00.0 Off |                    0 |
| N/A   37C    P0             73W /  700W |       4MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                          

### 1. TF-IDF (Term Frequency-Inverse Document Frequency)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [31]:
print(f"Train data: {x_train_tfidf.shape}")
print(f"Test data: {x_test_tfidf.shape}")

Train data: (14717, 5000)
Test data: (3680, 5000)


### 2. BERT (Bidirectional Encoder Representations from Transformers)

In [32]:
from transformers import BertTokenizer, BertModel

# Load tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
bert = BertModel.from_pretrained('bert-large-uncased-whole-word-masking').to(device)

for param in bert.parameters():
    param.requires_grad = False
    
bert.eval()

  from .autonotebook import tqdm as notebook_tqdm


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [33]:
x_train_bert = encode_texts_3d(list(x_train), bert_tokenizer, bert)
x_test_bert = encode_texts_3d(list(x_test), bert_tokenizer, bert)

Encoding Batches: 100%|██████████| 460/460 [00:09<00:00, 46.91it/s]
Encoding Batches: 100%|██████████| 115/115 [00:01<00:00, 64.04it/s]


In [34]:
print(f"Train data: {x_train_bert.shape}")
print(f"Test data: {x_test_bert.shape}")

Train data: torch.Size([14717, 128, 1024])
Test data: torch.Size([3680, 128, 1024])


### 3. XLM-RoBERTa (Cross-lingual Language Model RoBERTa)

In [35]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel

xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
xlmroberta = XLMRobertaModel.from_pretrained('xlm-roberta-large').to(device)

for param in xlmroberta.parameters():
    param.requires_grad = False
    
xlmroberta.eval()

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, eleme

In [36]:
x_train_xlmroberta = encode_texts_3d(list(x_train), xlmroberta_tokenizer, xlmroberta)
x_test_xlmroberta = encode_texts_3d(list(x_test), xlmroberta_tokenizer, xlmroberta)

Encoding Batches: 100%|██████████| 460/460 [00:07<00:00, 60.59it/s]
Encoding Batches: 100%|██████████| 115/115 [00:01<00:00, 68.32it/s]


In [37]:
print(f"Train data: {x_train_xlmroberta.shape}")
print(f"Test data: {x_test_xlmroberta.shape}")

Train data: torch.Size([14717, 128, 1024])
Test data: torch.Size([3680, 128, 1024])


## 4. Train and Test Model

In [50]:
import joblib

**1. Logistic Regression**

In [38]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000, multi_class='multinomial')

**2. LSTM Classifier Head**

In [39]:
import torch.nn as nn

class LSTMClassifierHead(nn.Module):
    def __init__(self, input_dim, num_classes, num_layers=1, hidden_dim = 128, bidirectional=True):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim

        self.classifier = nn.Sequential(
            nn.Linear(lstm_output_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        
        if self.lstm.bidirectional:
            h_n = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h_n = h_n[-1] 
        
        return self.classifier(h_n)

### TF-IDF + Logistic Regression

In [40]:
lr.fit(x_train_tfidf, y_train)
print("TF-IDF + LR Train Accuracy:", lr.score(x_train_tfidf, y_train))

TF-IDF + LR Train Accuracy: 0.9148603655636339


In [41]:
print("TF-IDF + LR Test Accuracy:", lr.score(x_test_tfidf, y_test))

TF-IDF + LR Test Accuracy: 0.8548913043478261


In [51]:
joblib.dump(lr, "models/tfidf_logreg.pkl")

['models/tfidf_logreg.pkl']

### BERT + LSTM Classifier Head

In [44]:
lstm_bert = LSTMClassifierHead(input_dim=x_train_bert.shape[-1], num_classes=3, bidirectional=False).to(device)
lstm_bert

LSTMClassifierHead(
  (lstm): LSTM(1024, 128, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=128, out_features=3, bias=True)
  )
)

In [45]:
train_ds = TensorDataset(x_train_bert, torch.tensor(y_train_encoded))
test_ds  = TensorDataset(x_test_bert,  torch.tensor(y_test_encoded))

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=32, num_workers=4, pin_memory=True)

optimizer = torch.optim.Adam(lstm_bert.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 100
for epoch in trange(num_epochs, desc="Training Epochs"):
    lstm_bert.train()
    train_loss, train_correct, train_total = 0.0, 0, 0

    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)

        preds = lstm_bert(Xb)
        loss = criterion(preds, yb)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(lstm_bert.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item() * yb.size(0)
        train_correct += (preds.argmax(dim=1) == yb).sum().item()
        train_total += yb.size(0)

    # Evaluation on test set
    lstm_bert.eval()
    test_loss_total, test_correct, test_total =  0.0, 0, 0

    with torch.no_grad():
        for Xb, yb in test_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            preds = lstm_bert(Xb)
            loss = criterion(preds, yb)
            test_loss_total += loss.item() * yb.size(0)
            test_correct += (preds.argmax(dim=1) == yb).sum().item()
            test_total += yb.size(0)
            
    train_acc = test_correct / test_total
    train_loss = test_loss_total / test_total
    test_acc = train_correct / train_total
    test_loss = train_loss / train_total
    
    if (epoch + 1) % 10 == 0:
        print(f"\nEpoch {epoch + 1}/{num_epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

Training Epochs:  10%|█         | 10/100 [00:58<08:40,  5.78s/it]


Epoch 10/100 | Train Loss: 0.7696 | Train Acc: 0.8027 | Test Loss: 0.0001 | Test Acc: 0.9355


Training Epochs:  20%|██        | 20/100 [01:56<07:43,  5.79s/it]


Epoch 20/100 | Train Loss: 1.5124 | Train Acc: 0.7997 | Test Loss: 0.0001 | Test Acc: 0.9836


Training Epochs:  30%|███       | 30/100 [02:53<06:49,  5.85s/it]


Epoch 30/100 | Train Loss: 1.6038 | Train Acc: 0.7984 | Test Loss: 0.0001 | Test Acc: 0.9893


Training Epochs:  40%|████      | 40/100 [03:52<05:52,  5.87s/it]


Epoch 40/100 | Train Loss: 1.8310 | Train Acc: 0.8011 | Test Loss: 0.0001 | Test Acc: 0.9908


Training Epochs:  50%|█████     | 50/100 [04:49<04:45,  5.71s/it]


Epoch 50/100 | Train Loss: 1.7942 | Train Acc: 0.7976 | Test Loss: 0.0001 | Test Acc: 0.9925


Training Epochs:  60%|██████    | 60/100 [05:47<03:52,  5.82s/it]


Epoch 60/100 | Train Loss: 1.6300 | Train Acc: 0.7965 | Test Loss: 0.0001 | Test Acc: 0.9916


Training Epochs:  70%|███████   | 70/100 [06:45<02:54,  5.83s/it]


Epoch 70/100 | Train Loss: 1.8069 | Train Acc: 0.8000 | Test Loss: 0.0001 | Test Acc: 0.9941


Training Epochs:  80%|████████  | 80/100 [07:44<01:58,  5.90s/it]


Epoch 80/100 | Train Loss: 1.9584 | Train Acc: 0.7981 | Test Loss: 0.0001 | Test Acc: 0.9950


Training Epochs:  90%|█████████ | 90/100 [08:43<00:58,  5.85s/it]


Epoch 90/100 | Train Loss: 1.9894 | Train Acc: 0.8052 | Test Loss: 0.0001 | Test Acc: 0.9942


Training Epochs: 100%|██████████| 100/100 [09:41<00:00,  5.82s/it]


Epoch 100/100 | Train Loss: 1.7667 | Train Acc: 0.7924 | Test Loss: 0.0001 | Test Acc: 0.9957





In [53]:
torch.save(lstm_bert, "./models/lstm_bert.pth")

### XLM-RoBERTa + LSTM Classifier Head

In [47]:
lstm_xlmroberta = LSTMClassifierHead(input_dim=x_train_xlmroberta.shape[-1], num_classes=3, bidirectional=False).to(device)
lstm_xlmroberta

LSTMClassifierHead(
  (lstm): LSTM(1024, 128, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=128, out_features=3, bias=True)
  )
)

In [48]:
train_ds = TensorDataset(x_train_xlmroberta, torch.tensor(y_train_encoded))
test_ds  = TensorDataset(x_test_xlmroberta,  torch.tensor(y_test_encoded))

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=32, num_workers=4, pin_memory=True)

optimizer = torch.optim.Adam(lstm_xlmroberta.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 100
for epoch in trange(num_epochs, desc="Training Epochs"):
    lstm_xlmroberta.train()
    train_loss, train_correct, train_total = 0.0, 0, 0

    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)

        preds = lstm_xlmroberta(Xb)
        loss = criterion(preds, yb)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(lstm_xlmroberta.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item() * yb.size(0)
        train_correct += (preds.argmax(dim=1) == yb).sum().item()
        train_total += yb.size(0)

    # Evaluation on test set
    lstm_xlmroberta.eval()
    test_loss_total, test_correct, test_total =  0.0, 0, 0

    with torch.no_grad():
        for Xb, yb in test_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            preds = lstm_xlmroberta(Xb)
            loss = criterion(preds, yb)
            test_loss_total += loss.item() * yb.size(0)
            test_correct += (preds.argmax(dim=1) == yb).sum().item()
            test_total += yb.size(0)
            
    train_acc = test_correct / test_total
    train_loss = test_loss_total / test_total
    test_acc = train_correct / train_total
    test_loss = train_loss / train_total
    
    if (epoch + 1) % 10 == 0:
        print(f"\nEpoch {epoch + 1}/{num_epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

Training Epochs:  10%|█         | 10/100 [00:57<08:37,  5.75s/it]


Epoch 10/100 | Train Loss: 0.5815 | Train Acc: 0.7633 | Test Loss: 0.0000 | Test Acc: 0.7968


Training Epochs:  20%|██        | 20/100 [01:54<07:39,  5.74s/it]


Epoch 20/100 | Train Loss: 0.7417 | Train Acc: 0.7595 | Test Loss: 0.0001 | Test Acc: 0.8305


Training Epochs:  30%|███       | 30/100 [02:54<07:05,  6.08s/it]


Epoch 30/100 | Train Loss: 0.8040 | Train Acc: 0.7560 | Test Loss: 0.0001 | Test Acc: 0.8591


Training Epochs:  40%|████      | 40/100 [03:53<05:50,  5.85s/it]


Epoch 40/100 | Train Loss: 0.9659 | Train Acc: 0.7315 | Test Loss: 0.0001 | Test Acc: 0.8893


Training Epochs:  50%|█████     | 50/100 [04:53<04:58,  5.97s/it]


Epoch 50/100 | Train Loss: 1.0459 | Train Acc: 0.7457 | Test Loss: 0.0001 | Test Acc: 0.9079


Training Epochs:  60%|██████    | 60/100 [05:52<03:58,  5.96s/it]


Epoch 60/100 | Train Loss: 1.2853 | Train Acc: 0.7413 | Test Loss: 0.0001 | Test Acc: 0.9358


Training Epochs:  70%|███████   | 70/100 [06:51<02:57,  5.91s/it]


Epoch 70/100 | Train Loss: 1.3424 | Train Acc: 0.7543 | Test Loss: 0.0001 | Test Acc: 0.9506


Training Epochs:  80%|████████  | 80/100 [07:50<01:57,  5.88s/it]


Epoch 80/100 | Train Loss: 1.6967 | Train Acc: 0.7410 | Test Loss: 0.0001 | Test Acc: 0.9650


Training Epochs:  90%|█████████ | 90/100 [08:49<00:59,  5.96s/it]


Epoch 90/100 | Train Loss: 1.7035 | Train Acc: 0.7326 | Test Loss: 0.0001 | Test Acc: 0.9669


Training Epochs: 100%|██████████| 100/100 [09:49<00:00,  5.89s/it]


Epoch 100/100 | Train Loss: 1.7476 | Train Acc: 0.7459 | Test Loss: 0.0001 | Test Acc: 0.9651





In [54]:
torch.save(lstm_xlmroberta, "./models/lstm_xlmroberta.pth")

## 5. Inference

In [84]:
import re
import torch
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel

class SentimentAnalysis():
    def __init__(self, model_path: str, bert_path: str = "bert-large-uncased-whole-word-masking"):
        nltk.download("stopwords")
        nltk.download("wordnet")
        nltk.download('punkt_tab')
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.label_map = {0: "negative", 1: "neutral", 2: "positive"}

        self.classifier = torch.load(model_path, map_location=self.device)
        self.classifier.eval().to(self.device)

        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_path)
        self.bert = BertModel.from_pretrained(bert_path).to(self.device)
        
        self.stop_words = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()

    def __preprocessing(self, text: str):
        # Clean text
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
        text = text.lower().strip()

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        lemmatized = [
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if word not in self.stop_words
        ]

        return " ".join(lemmatized)
        
        
    def predict(self, text: str):
        text = self.__preprocessing(text)

        encoded = self.bert_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(self.device)

        with torch.no_grad():
            outputs = self.bert(**encoded)
            cls_embedding = outputs.last_hidden_state[:, 0, :]
            logits = self.classifier(cls_embedding.unsqueeze(0))
            probs = F.softmax(logits, dim=1)
            conf, pred_id = torch.max(probs, dim=1)

            label = self.label_map[pred_id.item()]
            confidence = conf.item()

        return label, confidence

In [106]:
!nvidia-smi

Sun May 25 07:18:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:66:00.0 Off |                    0 |
| N/A   39C    P0            121W /  700W |    8086MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [107]:
sentiment_analysis = SentimentAnalysis("./models/lstm_bert.pth")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [109]:
%%time
text = "Good Game, I recommend you guys to try it!"
pred, conf = sentiment_analysis.predict(text)

separator_len = max(50, len(text) + 20)
separator = "=" * separator_len

print(separator)
print(f"Input Text     : {text}")
print(f"Predicted Label: {pred.capitalize()}")
print(f"Confidence     : {conf:.2%}")
print(separator)

Input Text     : Good Game, I recommend you guys to try it!
Predicted Label: Positive
Confidence     : 97.55%
CPU times: user 8.3 ms, sys: 377 µs, total: 8.68 ms
Wall time: 8.03 ms
