# Proyek NLP Submission - Review Pokemon Unite (Playstore)


## Library

In [None]:
!pip install Sastrawi
# !pip install transformers torch

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import csv
import requests
from tqdm import tqdm
from io import StringIO

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from wordcloud import WordCloud

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Data Understanding

In [None]:
reviews_df = pd.read_csv('reviews.csv')

In [None]:
# reviews_df = reviews_df.iloc[:10000]

In [None]:
reviews_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,5bd4f771-7e1e-4474-a201-eeafa9f11f8c,Siann,https://play-lh.googleusercontent.com/a-/ALV-U...,tolong kasih sistem hukuman untuk tim yang men...,1,0,,2025-03-24 19:57:01,,,
1,342a67ae-a063-4b89-82d9-6a7f9ef2a2ff,Tria Haikal Ramdani,https://play-lh.googleusercontent.com/a-/ALV-U...,nice game,5,0,1.18.1.1,2025-03-24 16:37:49,,,1.18.1.1
2,ce76028c-4a55-437f-93d3-1ce7eba4e6a9,adnayaka reshwara,https://play-lh.googleusercontent.com/a/ACg8oc...,udh aku kasih nama nya kok tetep gak bisa masu...,3,0,,2025-03-24 13:58:05,,,
3,d50611c3-074b-40e3-81b5-e916ffc22447,Vons One,https://play-lh.googleusercontent.com/a-/ALV-U...,semoga pokemon yang lainnya cepat di tambahkan,5,0,,2025-03-24 13:53:07,,,
4,d13b7fdc-c697-40c4-a30e-3812cc318dda,Jomblo ngenes,https://play-lh.googleusercontent.com/a/ACg8oc...,makin diupdate makin berat.padahal jaringan bu...,1,9,1.18.1.1,2025-03-24 13:19:41,,,1.18.1.1
...,...,...,...,...,...,...,...,...,...,...,...
49995,1a7b1b33-43e0-4be8-b0be-f032f82e7923,ROACH JOESTAR 2025,https://play-lh.googleusercontent.com/a-/ALV-U...,Yang penting game ini ada pengaturan kek mirip...,5,0,,2021-11-19 09:29:08,,,
49996,68679900-0d74-48ed-8dd1-f39ec59e4b1b,Darwan Gemilang,https://play-lh.googleusercontent.com/a/ACg8oc...,Aku belum coba,5,0,,2021-11-19 09:22:29,,,
49997,78437906-6c1a-4d4f-a7cb-8c04dcdf30d7,nadine mikayla,https://play-lh.googleusercontent.com/a-/ALV-U...,Bagus sekali,5,0,1.2.1.2,2021-11-19 09:18:35,,,1.2.1.2
49998,5d864d91-c001-4c2c-963c-04557ccf137f,ninda sofiya,https://play-lh.googleusercontent.com/a/ACg8oc...,Game nya bagus banget aku lebih suka pokemon y...,5,0,,2021-11-19 09:11:58,,,


In [None]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   reviewId              50000 non-null  object 
 1   userName              50000 non-null  object 
 2   userImage             50000 non-null  object 
 3   content               50000 non-null  object 
 4   score                 50000 non-null  int64  
 5   thumbsUpCount         50000 non-null  int64  
 6   reviewCreatedVersion  29339 non-null  object 
 7   at                    50000 non-null  object 
 8   replyContent          0 non-null      float64
 9   repliedAt             0 non-null      float64
 10  appVersion            29339 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 4.2+ MB


In [None]:
selected_columns = ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'at']
review_df = reviews_df[selected_columns]

review_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,at
0,5bd4f771-7e1e-4474-a201-eeafa9f11f8c,Siann,https://play-lh.googleusercontent.com/a-/ALV-U...,tolong kasih sistem hukuman untuk tim yang men...,1,0,2025-03-24 19:57:01
1,342a67ae-a063-4b89-82d9-6a7f9ef2a2ff,Tria Haikal Ramdani,https://play-lh.googleusercontent.com/a-/ALV-U...,nice game,5,0,2025-03-24 16:37:49
2,ce76028c-4a55-437f-93d3-1ce7eba4e6a9,adnayaka reshwara,https://play-lh.googleusercontent.com/a/ACg8oc...,udh aku kasih nama nya kok tetep gak bisa masu...,3,0,2025-03-24 13:58:05
3,d50611c3-074b-40e3-81b5-e916ffc22447,Vons One,https://play-lh.googleusercontent.com/a-/ALV-U...,semoga pokemon yang lainnya cepat di tambahkan,5,0,2025-03-24 13:53:07
4,d13b7fdc-c697-40c4-a30e-3812cc318dda,Jomblo ngenes,https://play-lh.googleusercontent.com/a/ACg8oc...,makin diupdate makin berat.padahal jaringan bu...,1,9,2025-03-24 13:19:41
...,...,...,...,...,...,...,...
49995,1a7b1b33-43e0-4be8-b0be-f032f82e7923,ROACH JOESTAR 2025,https://play-lh.googleusercontent.com/a-/ALV-U...,Yang penting game ini ada pengaturan kek mirip...,5,0,2021-11-19 09:29:08
49996,68679900-0d74-48ed-8dd1-f39ec59e4b1b,Darwan Gemilang,https://play-lh.googleusercontent.com/a/ACg8oc...,Aku belum coba,5,0,2021-11-19 09:22:29
49997,78437906-6c1a-4d4f-a7cb-8c04dcdf30d7,nadine mikayla,https://play-lh.googleusercontent.com/a-/ALV-U...,Bagus sekali,5,0,2021-11-19 09:18:35
49998,5d864d91-c001-4c2c-963c-04557ccf137f,ninda sofiya,https://play-lh.googleusercontent.com/a/ACg8oc...,Game nya bagus banget aku lebih suka pokemon y...,5,0,2021-11-19 09:11:58


In [None]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reviewId       50000 non-null  object
 1   userName       50000 non-null  object
 2   userImage      50000 non-null  object
 3   content        50000 non-null  object
 4   score          50000 non-null  int64 
 5   thumbsUpCount  50000 non-null  int64 
 6   at             50000 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.7+ MB


## Preprocessing

In [None]:
slangwords = {
    "@": "di", "abis": "habis", "wtb": "beli", "masi": "masih",
    "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal",
    "gamenya": "game", "btw": "ngomong-ngomong", "tp": "tapi"
}

stemmer = StemmerFactory().create_stemmer()

def get_stopwords():
    stop_id = set(stopwords.words('indonesian'))
    stop_en = set(stopwords.words('english'))
    custom_stop = {
        'iyaa', 'yaa', 'gak', 'nya', 'na', 'sih', 'ku', 'di', 'ga', 'ya',
        'loh', 'kah', 'woi', 'woii', 'woy', 'nih', 'kan', 'deh', 'dong',
        'si', 'aj', 'aja', 'banget', 'bgt', 'nya', 'lah'
    }
    return stop_id.union(stop_en).union(custom_stop)

ALL_STOPWORDS = get_stopwords()

def clean_text(text):
    text = str(text)
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)
    text = re.sub(r'RT\s+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.replace('\n', ' ').strip()
    return text

def lowercase(text):
    return text.lower()

def tokenize(text):
    return text.split()

def replace_slang(tokens):
    return [slangwords.get(word, word) for word in tokens]

def remove_stopwords(tokens):
    return [word for word in tokens if word not in ALL_STOPWORDS]

def detect_negation(words):
    result = []
    skip = False
    for i in range(len(words)):
        if skip:
            skip = False
            continue
        if words[i] in ['tidak', 'gak', 'ga', 'nggak', 'bukan']:
            if i + 1 < len(words):
                result.append(words[i] + '_' + words[i + 1])
                skip = True
        else:
            result.append(words[i])
    return result

def stemming(tokens):
    return [stemmer.stem(word) for word in tokens]

def to_sentence(tokens):
    return ' '.join(tokens)

def preprocess(text, apply_stemming=False):
    text = clean_text(text)
    text = lowercase(text)
    tokens = tokenize(text)
    tokens = replace_slang(tokens)
    tokens = detect_negation(tokens)
    tokens = remove_stopwords(tokens)
    if apply_stemming:
        tokens = stemming(tokens)
    return to_sentence(tokens)

In [None]:
review_df['text_akhir'] = review_df['content'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['text_akhir'] = review_df['content'].apply(preprocess)


In [None]:
review_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,at,text_akhir
0,5bd4f771-7e1e-4474-a201-eeafa9f11f8c,Siann,https://play-lh.googleusercontent.com/a-/ALV-U...,tolong kasih sistem hukuman untuk tim yang men...,1,0,2025-03-24 19:57:01,tolong kasih sistem hukuman tim menolak berker...
1,342a67ae-a063-4b89-82d9-6a7f9ef2a2ff,Tria Haikal Ramdani,https://play-lh.googleusercontent.com/a-/ALV-U...,nice game,5,0,2025-03-24 16:37:49,nice game
2,ce76028c-4a55-437f-93d3-1ce7eba4e6a9,adnayaka reshwara,https://play-lh.googleusercontent.com/a/ACg8oc...,udh aku kasih nama nya kok tetep gak bisa masu...,3,0,2025-03-24 13:58:05,udh kasih nama tetep gak_bisa masuk invalid to...
3,d50611c3-074b-40e3-81b5-e916ffc22447,Vons One,https://play-lh.googleusercontent.com/a-/ALV-U...,semoga pokemon yang lainnya cepat di tambahkan,5,0,2025-03-24 13:53:07,semoga pokemon cepat tambahkan
4,d13b7fdc-c697-40c4-a30e-3812cc318dda,Jomblo ngenes,https://play-lh.googleusercontent.com/a/ACg8oc...,makin diupdate makin berat.padahal jaringan bu...,1,9,2025-03-24 13:19:41,diupdate beratpadahal jaringan game lancar lag...
...,...,...,...,...,...,...,...,...
49995,1a7b1b33-43e0-4be8-b0be-f032f82e7923,ROACH JOESTAR 2025,https://play-lh.googleusercontent.com/a-/ALV-U...,Yang penting game ini ada pengaturan kek mirip...,5,0,2021-11-19 09:29:08,game pengaturan kek mobile legend maksudnya pa...
49996,68679900-0d74-48ed-8dd1-f39ec59e4b1b,Darwan Gemilang,https://play-lh.googleusercontent.com/a/ACg8oc...,Aku belum coba,5,0,2021-11-19 09:22:29,coba
49997,78437906-6c1a-4d4f-a7cb-8c04dcdf30d7,nadine mikayla,https://play-lh.googleusercontent.com/a-/ALV-U...,Bagus sekali,5,0,2021-11-19 09:18:35,bagus
49998,5d864d91-c001-4c2c-963c-04557ccf137f,ninda sofiya,https://play-lh.googleusercontent.com/a/ACg8oc...,Game nya bagus banget aku lebih suka pokemon y...,5,0,2021-11-19 09:11:58,game bagus suka pokemon kayak bantal pikachu


## Labeling dan Schema 0: Indobert

In [None]:
print(torch.cuda.is_available())  # Harus True
print(torch.cuda.get_device_name(0))  # Nama GPU

True
Tesla T4


In [13]:
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True,
    max_length=512
)

def classify_sentiment_bert_batch(texts):
    results = sentiment_pipeline(texts)
    labels = [r['label'] for r in results]
    scores = [r['score'] for r in results]
    return labels, scores


batch_size = 64
labels, scores = [], []

print("Mulai proses analisis sentimen IndoBERT...")

for i in tqdm(range(0, len(review_df), batch_size)):
    batch = review_df['content'].iloc[i:i+batch_size].tolist()
    batch_labels, batch_scores = classify_sentiment_bert_batch(batch)
    labels.extend(batch_labels)
    scores.extend(batch_scores)

review_df['bert_label'] = labels
review_df['bert_score'] = scores

print("Selesai!")

Using device: GPU


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/808k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/467k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cuda:0


Mulai proses analisis sentimen IndoBERT...


  1%|▏         | 10/782 [00:20<23:01,  1.79s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 782/782 [08:18<00:00,  1.57it/s]

Selesai!



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['bert_label'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['bert_score'] = scores


In [14]:
review_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,at,text_akhir,bert_label,bert_score
0,5bd4f771-7e1e-4474-a201-eeafa9f11f8c,Siann,https://play-lh.googleusercontent.com/a-/ALV-U...,tolong kasih sistem hukuman untuk tim yang men...,1,0,2025-03-24 19:57:01,tolong kasih sistem hukuman tim menolak berker...,neutral,0.969630
1,342a67ae-a063-4b89-82d9-6a7f9ef2a2ff,Tria Haikal Ramdani,https://play-lh.googleusercontent.com/a-/ALV-U...,nice game,5,0,2025-03-24 16:37:49,nice game,positive,0.995360
2,ce76028c-4a55-437f-93d3-1ce7eba4e6a9,adnayaka reshwara,https://play-lh.googleusercontent.com/a/ACg8oc...,udh aku kasih nama nya kok tetep gak bisa masu...,3,0,2025-03-24 13:58:05,udh kasih nama tetep gak_bisa masuk invalid to...,negative,0.598562
3,d50611c3-074b-40e3-81b5-e916ffc22447,Vons One,https://play-lh.googleusercontent.com/a-/ALV-U...,semoga pokemon yang lainnya cepat di tambahkan,5,0,2025-03-24 13:53:07,semoga pokemon cepat tambahkan,neutral,0.887500
4,d13b7fdc-c697-40c4-a30e-3812cc318dda,Jomblo ngenes,https://play-lh.googleusercontent.com/a/ACg8oc...,makin diupdate makin berat.padahal jaringan bu...,1,9,2025-03-24 13:19:41,diupdate beratpadahal jaringan game lancar lag...,negative,0.998724
...,...,...,...,...,...,...,...,...,...,...
49995,1a7b1b33-43e0-4be8-b0be-f032f82e7923,ROACH JOESTAR 2025,https://play-lh.googleusercontent.com/a-/ALV-U...,Yang penting game ini ada pengaturan kek mirip...,5,0,2021-11-19 09:29:08,game pengaturan kek mobile legend maksudnya pa...,negative,0.757291
49996,68679900-0d74-48ed-8dd1-f39ec59e4b1b,Darwan Gemilang,https://play-lh.googleusercontent.com/a/ACg8oc...,Aku belum coba,5,0,2021-11-19 09:22:29,coba,negative,0.979086
49997,78437906-6c1a-4d4f-a7cb-8c04dcdf30d7,nadine mikayla,https://play-lh.googleusercontent.com/a-/ALV-U...,Bagus sekali,5,0,2021-11-19 09:18:35,bagus,positive,0.976862
49998,5d864d91-c001-4c2c-963c-04557ccf137f,ninda sofiya,https://play-lh.googleusercontent.com/a/ACg8oc...,Game nya bagus banget aku lebih suka pokemon y...,5,0,2021-11-19 09:11:58,game bagus suka pokemon kayak bantal pikachu,positive,0.997962


In [15]:
print(review_df['bert_label'].value_counts())

bert_label
positive    28619
negative    18030
neutral      3351
Name: count, dtype: int64


## Schema 1: XGBoost, TF-IDF, 80:20

In [16]:
X = review_df['content']
y = review_df['bert_label']

tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
X_tfidf = tfidf.fit_transform(X)

features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

features_df

Unnamed: 0,ada,afk,agar,aja,aku,akun,and,apa,atau,bagus,...,udh,unite,untuk,update,waktu,wifi,ya,yang,yg,you
0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.399172,0.0,0.0,0.0,0.0,0.329746,0.0,0.0
1,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.00000,0.0,0.0,0.000000,0.209563,0.0,0.0,0.0,0.000000,0.000000,...,0.326616,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.361515,0.0,0.0
4,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.28382,0.0,0.0,0.328045,0.137210,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.164198,0.0,0.0,0.0,0.0,0.271280,0.0,0.0
49996,0.00000,0.0,0.0,0.000000,0.547948,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
49997,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.444172,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
49998,0.00000,0.0,0.0,0.000000,0.287672,0.0,0.0,0.0,0.407961,0.178180,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.284380,0.0,0.0


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

In [21]:
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

xgb_model.fit(X_train, y_train)

# Prediksi
y_pred_train_xgb = xgb_model.predict(X_train)
y_pred_test_xgb = xgb_model.predict(X_test)

# Evaluasi akurasi
accuracy_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
accuracy_test_xgb = accuracy_score(y_test, y_pred_test_xgb)

print('XGBoost - accuracy_train:', accuracy_train_xgb)
print('XGBoost - accuracy_test:', accuracy_test_xgb)

Parameters: { "use_label_encoder" } are not used.



XGBoost - accuracy_train: 0.810475
XGBoost - accuracy_test: 0.7848


## Schema 2: Random Forest, TF-IDF, 80:20

In [22]:
X = review_df['content']
y = review_df['bert_label']

tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
X_tfidf = tfidf.fit_transform(X)

features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

features_df

Unnamed: 0,ada,afk,agar,aja,aku,akun,and,apa,atau,bagus,...,udh,unite,untuk,update,waktu,wifi,ya,yang,yg,you
0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.399172,0.0,0.0,0.0,0.0,0.329746,0.0,0.0
1,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.00000,0.0,0.0,0.000000,0.209563,0.0,0.0,0.0,0.000000,0.000000,...,0.326616,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.361515,0.0,0.0
4,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.28382,0.0,0.0,0.328045,0.137210,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.164198,0.0,0.0,0.0,0.0,0.271280,0.0,0.0
49996,0.00000,0.0,0.0,0.000000,0.547948,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
49997,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.444172,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
49998,0.00000,0.0,0.0,0.000000,0.287672,0.0,0.0,0.0,0.407961,0.178180,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.284380,0.0,0.0


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [24]:
random_forest = RandomForestClassifier()

random_forest.fit(X_train.toarray(), y_train)

y_pred_train_rf = random_forest.predict(X_train.toarray())
y_pred_test_rf = random_forest.predict(X_test.toarray())

accuracy_train_rf = accuracy_score(y_pred_train_rf, y_train)
accuracy_test_rf = accuracy_score(y_pred_test_rf, y_test)

print('Random Forest - accuracy_train:', accuracy_train_rf)
print('Random Forest - accuracy_test:', accuracy_test_rf)

Random Forest - accuracy_train: 0.91525
Random Forest - accuracy_test: 0.7863


## Schema 3: Random Forest, BoW, 80:20

In [25]:
le = LabelEncoder()
review_df['label_encoded'] = le.fit_transform(review_df['bert_label'])

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
X = review_df['content']
y = review_df['bert_label']

bow_vectorizer = CountVectorizer(max_features=1000)
X_bow = bow_vectorizer.fit_transform(X)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

In [28]:
random_forest = RandomForestClassifier()

random_forest.fit(X_train.toarray(), y_train)

y_pred_train_rf = random_forest.predict(X_train.toarray())
y_pred_test_rf = random_forest.predict(X_test.toarray())

accuracy_train_rf = accuracy_score(y_pred_train_rf, y_train)
accuracy_test_rf = accuracy_score(y_pred_test_rf, y_test)

print('Random Forest - accuracy_train:', accuracy_train_rf)
print('Random Forest - accuracy_test:', accuracy_test_rf)

Random Forest - accuracy_train: 0.9486
Random Forest - accuracy_test: 0.8063


In [33]:
!pip install pipreqs

Collecting pipreqs
  Downloading pipreqs-0.5.0-py3-none-any.whl.metadata (7.9 kB)
Collecting docopt==0.6.2 (from pipreqs)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipython==8.12.3 (from pipreqs)
  Downloading ipython-8.12.3-py3-none-any.whl.metadata (5.7 kB)
Collecting yarg==0.1.9 (from pipreqs)
  Downloading yarg-0.1.9-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting jedi>=0.16 (from ipython==8.12.3->pipreqs)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting stack-data (from ipython==8.12.3->pipreqs)
  Downloading stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)
Collecting executing>=1.2.0 (from stack-data->ipython==8.12.3->pipreqs)
  Downloading executing-2.2.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting asttokens>=2.1.0 (from stack-data->ipython==8.12.3->pipreqs)
  Downloading asttokens-3.0.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pure-eval (from stack-data->ipython==8.12.3->pipr

In [37]:
!pip freeze > requirements.txt