# Preprocesing

## Import Library

In [1]:
# Download NLTK data (tokenizer dan stopwords)
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
# from google.colab import drive

# Load spaCy model bahasa Inggris
nlp = spacy.load("en_core_web_sm")

# Load stopword bahasa Inggris dari NLTK
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [2]:
# Mount Google Drive and load data
# drive.mount('/content/drive')
# file_path = '/content/drive/MyDrive/Dataset/ChatGPT_Reviews99k_Cleaned.csv'
file_path = '/kaggle/input/sentiment-analysist/chatGPT_clean_reviews.csv'
df = pd.read_csv(file_path)

## Preprocessed

In [3]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Hapus angka
    text = re.sub(r'\d+', '', text)

    # Hapus tanda baca
    text = re.sub(r'[^\w\s]', '', text)

    # Hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenisasi
    tokens = word_tokenize(text)

    # Hapus stopword
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    doc = nlp(' '.join(tokens))
    lemmas = [token.lemma_ for token in doc]

    return ' '.join(lemmas)


In [4]:
df['preprocessed'] = df['content'].astype(str).apply(preprocess_text)

## Labeling

In [5]:
# Install & import VADER (sudah tersedia di NLTK)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Inisialisasi analyzer
vader = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
def get_vader_sentiment(text):
    scores = vader.polarity_scores(text)
    compound = scores['compound']

    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['vader_sentiment'] = df['content'].apply(get_vader_sentiment)


In [7]:
df['vader_sentiment'].value_counts()

vader_sentiment
positive    79455
negative    12250
neutral      7295
Name: count, dtype: int64

## Export Data

In [8]:
df.to_csv('ChatGPT_Review99k_Preprocesed.csv', index=False)

# Scheme I - Bi-GRU + GloVe + class_weight

## Import Library

In [9]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from imblearn.over_sampling import SMOTE
from google.colab import drive
from sklearn.utils.class_weight import compute_class_weight
np.random.seed(112)
tf.random.set_seed(112)

2025-07-19 03:32:26.230534: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752895946.505153      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752895946.580570      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Prepare Data

In [10]:
# Mount Google Drive and load data
# drive.mount('/content/drive')
# file_path = '/content/drive/MyDrive/Dataset/ChatGPT_Review99k_Preprocessed.csv'
file_path = '/kaggle/working/ChatGPT_Review99k_Preprocesed.csv'
df = pd.read_csv(file_path)

# Prepare data and labels
df = df[['preprocessed', 'vader_sentiment']].copy()
df.dropna(subset=['preprocessed'], inplace=True)
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['vader_sentiment'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99000 entries, 0 to 98999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   preprocessed       99000 non-null  object
 1   vader_sentiment    99000 non-null  object
 2   sentiment_encoded  99000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


## Tokenize Text Data

In [11]:
print("\n Tokenizing text data...")
# Define parameters
MAX_WORDS = 10000 # Maximum number of words to keep
MAX_SEQUENCE_LENGTH = 150 # Maximum length of all sequences

# Create and fit the tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<oov>")
tokenizer.fit_on_texts(df['preprocessed'].values)
word_index = tokenizer.word_index

# Convert text to padded sequences
sequences = tokenizer.texts_to_sequences(df['preprocessed'].values)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')


 Tokenizing text data...


## Load Pre-Trained GLOVE Embeddings

In [12]:
print("\n Loading GloVe Embeddings...")
# IMPORTANT: Download GloVe embeddings first, e.g., 'glove.6B.100d.txt'
# from https://nlp.stanford.edu/projects/glove/
# GLOVE_PATH = '/content/drive/MyDrive/Dataset/glove.6B.100d.txt'
GLOVE_PATH = '/kaggle/input/glove6b/glove.6B.100d.txt'
EMBEDDING_DIM = 100 # Must match the GloVe file's dimension

embeddings_index = {}
try:
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f"Found {len(embeddings_index)} word vectors.")
except FileNotFoundError:
    print("GloVe file not found. Please update GLOVE_PATH.")
    # Create an empty dictionary if the file is not found
    embeddings_index = {}

# Create the embedding matrix
embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < MAX_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector



 Loading GloVe Embeddings...
Found 400000 word vectors.


## Splitting Data

In [13]:
from sklearn.model_selection import train_test_split

X = padded_sequences
y = df['sentiment_encoded'].values

# Split data (ensure stratification to maintain distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (79200, 150)
Testing data shape: (19800, 150)


## Class Weight

In [14]:
# Menghitung bobot agar kelas minoritas mendapat perhatian lebih
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
# Mengubahnya menjadi format dictionary yang dibutuhkan Keras
class_weight_dict = dict(enumerate(class_weights))

print("Class Weights yang akan digunakan:", class_weight_dict)

Class Weights yang akan digunakan: {0: 2.693877551020408, 1: 4.523646333104867, 2: 0.4153294317538229}


In [15]:
model = Sequential([
    Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=True, # Tetap fine-tuning untuk performa terbaik
        mask_zero=True
    ),
    Bidirectional(GRU(128, return_sequences=True)),
    Bidirectional(GRU(64, return_sequences=False)),
    Dropout(0.5), # Menambahkan dropout untuk mencegah overfitting
    Dense(3, activation='softmax') # Langsung ke output layer
])

# Compile model seperti biasa
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

I0000 00:00:1752895973.574786      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1752895973.575523      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


## Training

In [16]:
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
    class_weight=class_weight_dict, # <-- INI KUNCINYA
    verbose=2
)

final_train_accuracy = history.history['accuracy'][-1]
final_val_accuracy = history.history['val_accuracy'][-1]
final_train_loss = history.history['loss'][-1]
final_val_loss = history.history['val_loss'][-1]

# Tampilkan hasilnya dengan format yang mudah dibaca
print(f"Akurasi Pelatihan Final: {final_train_accuracy*100:.2f}%")
print(f"Loss Pelatihan Final: {final_train_loss:.4f}")
print(f"Akurasi Validasi Final: {final_val_accuracy*100:.2f}%")
print(f"Loss Validasi Final: {final_val_loss:.4f}")

Epoch 1/10


I0000 00:00:1752895981.795657      98 cuda_dnn.cc:529] Loaded cuDNN version 90300


1238/1238 - 28s - 23ms/step - accuracy: 0.8331 - loss: 0.5275 - val_accuracy: 0.8845 - val_loss: 0.3420
Epoch 2/10
1238/1238 - 20s - 16ms/step - accuracy: 0.8892 - loss: 0.3724 - val_accuracy: 0.8957 - val_loss: 0.3040
Epoch 3/10
1238/1238 - 20s - 16ms/step - accuracy: 0.9044 - loss: 0.3188 - val_accuracy: 0.9085 - val_loss: 0.2738
Epoch 4/10
1238/1238 - 20s - 16ms/step - accuracy: 0.9156 - loss: 0.2683 - val_accuracy: 0.9081 - val_loss: 0.2841
Epoch 5/10
1238/1238 - 19s - 16ms/step - accuracy: 0.9266 - loss: 0.2188 - val_accuracy: 0.9119 - val_loss: 0.2824
Epoch 6/10
1238/1238 - 19s - 16ms/step - accuracy: 0.9379 - loss: 0.1708 - val_accuracy: 0.9130 - val_loss: 0.3054
Epoch 7/10
1238/1238 - 19s - 15ms/step - accuracy: 0.9455 - loss: 0.1392 - val_accuracy: 0.9172 - val_loss: 0.3204
Epoch 8/10
1238/1238 - 19s - 16ms/step - accuracy: 0.9539 - loss: 0.1152 - val_accuracy: 0.9169 - val_loss: 0.3372
Epoch 9/10
1238/1238 - 19s - 16ms/step - accuracy: 0.9625 - loss: 0.0939 - val_accuracy: 0.

## Testing

In [17]:
print("\n 4. Mengevaluasi model...")
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nFinal Test Accuracy: {accuracy*100:.2f}%")


 4. Mengevaluasi model...

Final Test Accuracy: 91.06%


## Inference

In [18]:
new_reviews = [
    "This app is absolutely fantastic! It works perfectly and has all the features I need.",
    "I'm really disappointed with the latest update. It's buggy and crashes all the time.",
    "The application is okay, it does what it says but nothing more.",
    "I hate this, it's the worst experience I've ever had.",
    "Wow, I love it! So useful and easy to use.",
    "just it"
]

# 1. Pra-pemrosesan data baru (HARUS SAMA DENGAN DATA LATIH)
#    a. Tokenisasi -> mengubah teks menjadi sekuens integer
processed_sequences = tokenizer.texts_to_sequences(new_reviews)

#    b. Padding -> menyeragamkan panjang sekuens
processed_padded = tf.keras.preprocessing.sequence.pad_sequences(
    processed_sequences,
    maxlen=MAX_SEQUENCE_LENGTH, # Gunakan max_length yang sama
    padding='post',
    truncating='post'
)

# 2. Lakukan Prediksi
print("\n Melakukan prediksi pada ulasan baru...")
predictions = model.predict(processed_padded)

# 3. Ubah hasil prediksi (probabilitas) menjadi label kelas
#    a. Ambil indeks kelas dengan probabilitas tertinggi
predicted_class_indices = np.argmax(predictions, axis=1)

#    b. Gunakan label_encoder untuk mengubah indeks kembali ke label teks
predicted_class_labels = label_encoder.inverse_transform(predicted_class_indices)


# 4. Tampilkan Hasil
print("\n--- Hasil Inferensi ---")
for i, review in enumerate(new_reviews):
    print(f"Ulasan: \"{review}\"")
    print(f"Prediksi Sentimen: {predicted_class_labels[i].capitalize()}")
    print("-" * 20)



 Melakukan prediksi pada ulasan baru...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 507ms/step

--- Hasil Inferensi ---
Ulasan: "This app is absolutely fantastic! It works perfectly and has all the features I need."
Prediksi Sentimen: Positive
--------------------
Ulasan: "I'm really disappointed with the latest update. It's buggy and crashes all the time."
Prediksi Sentimen: Negative
--------------------
Ulasan: "The application is okay, it does what it says but nothing more."
Prediksi Sentimen: Negative
--------------------
Ulasan: "I hate this, it's the worst experience I've ever had."
Prediksi Sentimen: Negative
--------------------
Ulasan: "Wow, I love it! So useful and easy to use."
Prediksi Sentimen: Positive
--------------------
Ulasan: "just it"
Prediksi Sentimen: Neutral
--------------------


# Scheme II - BERT Fine-Tuning + class_weight

## Import Library

In [19]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

In [20]:
# Cek CUDA (GPU Nvidia)
if torch.cuda.is_available():
    print(f" Available GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print(" GPU not Avaiable")
    device = torch.device("cpu")

 Available GPU: Tesla T4


## Prepare Data

In [21]:
# Mount Google Drive and load data
# from google.colab import drive
# drive.mount('/content/drive')
# file_path = '/content/drive/MyDrive/Dataset/ChatGPT_Review99k_Preprocessed.csv'
file_path = '/kaggle/working/ChatGPT_Review99k_Preprocesed.csv'
df = pd.read_csv(file_path)

# Prepare data and labels
df = df[['preprocessed', 'vader_sentiment']].copy()
df.dropna(subset=['preprocessed'], inplace=True)
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['vader_sentiment'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99000 entries, 0 to 98999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   preprocessed       99000 non-null  object
 1   vader_sentiment    99000 non-null  object
 2   sentiment_encoded  99000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


## Define column names and create label maps

In [22]:
TEXT_COLUMN = 'preprocessed'
LABEL_COLUMN = 'sentiment_encoded'
ORIGINAL_LABEL_COLUMN = 'vader_sentiment'

# Create mappings from existing encoded data
unique_labels_df = df[[LABEL_COLUMN, ORIGINAL_LABEL_COLUMN]].drop_duplicates().sort_values(LABEL_COLUMN)
id2label = dict(zip(unique_labels_df[LABEL_COLUMN], unique_labels_df[ORIGINAL_LABEL_COLUMN]))
label2id = dict(zip(unique_labels_df[ORIGINAL_LABEL_COLUMN], unique_labels_df[LABEL_COLUMN]))
num_labels = len(id2label)

print(f"ID to Label Map: {id2label}")
print(f"Number of Labels: {num_labels}")

ID to Label Map: {0: 'negative', 1: 'neutral', 2: 'positive'}
Number of Labels: 3


## Split Data Train, Validation, and Test

In [23]:
# training+validation (80%) dan test (20%)
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2, # 20% untuk data test
    random_state=112,
    stratify=df[LABEL_COLUMN]
)

# training (80% dari sisa) dan validation (20% dari sisa)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2, # 20% dari 80% data awal untuk validasi
    random_state=112,
    stratify=train_val_df[LABEL_COLUMN]
)

print(f"Ukuran Data Training: {len(train_df)}")
print(f"Ukuran Data Validasi: {len(val_df)}")
print(f"Ukuran Data Test: {len(test_df)}")

Ukuran Data Training: 63360
Ukuran Data Validasi: 15840
Ukuran Data Test: 19800


## Calculate Class Weights for Imbalance

In [24]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df[LABEL_COLUMN]),
    y=train_df[LABEL_COLUMN].to_numpy()
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print(f"\nCalculated Class Weights: {class_weights}")


Calculated Class Weights: [2.69387755 4.52345256 0.41533107]


## Tokenization and Dataset Creation

In [25]:
# --- Load Tokenizer ---
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# --- Tokenize text and create Datasets ---
train_encodings = tokenizer(train_df[TEXT_COLUMN].tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_df[TEXT_COLUMN].tolist(), truncation=True, padding=True, max_length=128)

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_df[LABEL_COLUMN].tolist())
val_dataset = SentimentDataset(val_encodings, val_df[LABEL_COLUMN].tolist())

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Training

In [26]:
# --- Load Pre-trained Model ---
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# --- Define Metrics ---
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# --- Custom Trainer for Weighted Loss ---
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        logits = outputs.get('logits')
        labels = inputs.get('labels')

        # Ambil device dari input tensor 'labels', bukan dari 'model.device'.
        # bekerja di single-GPU (Colab) maupun multi-GPU (Kaggle).
        device = labels.device
        
        # Pindahkan class weights ke device yang benar
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# --- Define Training Arguments (Dioptimasi dengan FP16) ---
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=100,
    report_to="none",
    fp16=True,
)

# --- Instantiate and Run Trainer ---
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4279,0.394368,0.906313,0.910963,0.920632,0.906313
2,0.3075,0.4251,0.923737,0.925993,0.930707,0.923737




TrainOutput(global_step=3960, training_loss=0.4055168310801188, metrics={'train_runtime': 1915.5935, 'train_samples_per_second': 66.152, 'train_steps_per_second': 2.067, 'total_flos': 8335433073623040.0, 'train_loss': 0.4055168310801188, 'epoch': 2.0})

In [27]:
# Jalankan evaluasi secara eksplisit pada validation set
final_metrics = trainer.evaluate()

# Cetak hasilnya
print("\n--- Metrik Evaluasi Final ---")
print(final_metrics)

accuracy = final_metrics['eval_accuracy']


print(f"\n Akurasi Final: {accuracy:.4f}")





--- Metrik Evaluasi Final ---
{'eval_loss': 0.4251001179218292, 'eval_accuracy': 0.9237373737373737, 'eval_f1': 0.9259931836137475, 'eval_precision': 0.9307069876573975, 'eval_recall': 0.9237373737373737, 'eval_runtime': 65.952, 'eval_samples_per_second': 240.175, 'eval_steps_per_second': 1.88, 'epoch': 2.0}

 Akurasi Final: 0.9237


## Testing

In [28]:
# Tokenisasi data test
test_encodings = tokenizer(test_df[TEXT_COLUMN].tolist(), truncation=True, padding=True, max_length=128)
test_dataset = SentimentDataset(test_encodings, test_df[LABEL_COLUMN].tolist())

# --- Prediksi pada Data Test ---
test_predictions = trainer.predict(test_dataset)

# metrikMdari hasil prediksi
test_metrics = test_predictions.metrics
print("\n--- Testing Final Result---")
for key, value in test_metrics.items():
    print(f"{key}: {value:.4f}")




--- Testing Final Result---
test_loss: 0.4137
test_accuracy: 0.9228
test_f1: 0.9252
test_precision: 0.9301
test_recall: 0.9228
test_runtime: 82.8030
test_samples_per_second: 239.1220
test_steps_per_second: 1.8720


## Inference

In [29]:
def predict_sentiment(text):
    """
    Fungsi untuk memprediksi sentimen dari sebuah teks tunggal.
    """
    # Masukkan model ke mode evaluasi
    trainer.model.eval()

    # Tokenisasi input teks dan ubah menjadi PyTorch tensor
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Pindahkan semua tensor ke device yang sesuai (GPU/CPU)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Lakukan prediksi tanpa menghitung gradient untuk efisiensi
    with torch.no_grad():
        outputs = trainer.model(**inputs)

    # Ambil logits (skor mentah) dari output model
    logits = outputs.logits

    # Dapatkan ID kelas dengan skor tertinggi
    predicted_class_id = torch.argmax(logits, dim=1).item()

    # Kembalikan label kategorikal (contoh: 'positif') menggunakan map id2label
    return id2label[predicted_class_id]

In [30]:
# List review baru yang akan diprediksi
new_reviews = [
    "This app is absolutely fantastic! It works perfectly and has all the features I need.",
    "I'm really disappointed with the latest update. It's buggy and crashes all the time.",
    "The application is okay, it does what it says but nothing more.",
    "I hate this, it's the worst experience I've ever had.",
    "Wow, I love it! So useful and easy to use.",
    "Just it"
]

# Loop melalui setiap review di dalam list
for review in new_reviews:
    # Panggil fungsi prediksi yang sudah kita buat sebelumnya
    predicted_label = predict_sentiment(review)

    # Cetak hasilnya
    print(f"Review: \"{review}\"")
    print(f"Prediksi: {predicted_label}\n")

Review: "This app is absolutely fantastic! It works perfectly and has all the features I need."
Prediksi: positive

Review: "I'm really disappointed with the latest update. It's buggy and crashes all the time."
Prediksi: negative

Review: "The application is okay, it does what it says but nothing more."
Prediksi: positive

Review: "I hate this, it's the worst experience I've ever had."
Prediksi: negative

Review: "Wow, I love it! So useful and easy to use."
Prediksi: positive

Review: "Just it"
Prediksi: neutral



# Scheme III - RoBERTa + class_weight

In [31]:
# !pip install transformers[torch] datasets pandas scikit-learn imbalanced-learn -q

In [32]:
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight # Import for class weights
from google.colab import drive # Import for Google Drive

# Import the model class designed for fine-tuning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.optim import AdamW # Correctly import AdamW from PyTorch

## Configuration and Data Preparation

In [33]:
class Config:
    MODEL_NAME = 'roberta-base'
    MAX_LENGTH = 128  # Max sequence length for RoBERTa
    # Reduce batch size for full fine-tuning to avoid memory issues on T4 GPU
    BATCH_SIZE = 16
    EPOCHS = 2        # 2 epochs is standard for full fine-tuning
    LR = 2e-5         # A standard learning rate for fine-tuning
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = Config()
print(f"Using device: {config.DEVICE}")

# --- Mount Google Drive and Load Your Data ---
# drive.mount('/content/drive')
# file_path = '/content/drive/MyDrive/Dataset/ChatGPT_Review99k_Preprocessed.csv'
file_path = '/kaggle/working/ChatGPT_Review99k_Preprocesed.csv'
df = pd.read_csv(file_path)

# --- Prepare data columns ---
# Select the necessary columns and handle any potential missing values
df = df[['preprocessed', 'vader_sentiment']].copy()
df.dropna(subset=['preprocessed'], inplace=True)

# Rename columns to match what the rest of the script expects ('text' and 'label')
df.rename(columns={'preprocessed': 'text'}, inplace=True)

print("Initial Data Distribution:")
print(df['vader_sentiment'].value_counts())
print("-" * 30)

# --- Map Labels to Integers ---
# This map should correspond to the unique values in your 'vader_sentiment' column
label_map = {'positive': 0, 'negative': 1, 'neutral': 2}
df['label'] = df['vader_sentiment'].map(label_map)

# --- Split Data into Train, Validation, and Test sets ---
# training+validation set and a test set.
df_train_val, df_test = train_test_split(
    df,
    test_size=0.2,  # 20% for the final test set
    random_state=112,
    stratify=df['label']
)

# Training and validation .
df_train, df_val = train_test_split(
    df_train_val,
    test_size=0.2, # This makes the validation set 10% of the original total
    random_state=112,
    stratify=df_train_val['label']
)


print(f"Training set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")
print(f"Test set size: {len(df_test)}")
print("-" * 30)

Using device: cuda
Initial Data Distribution:
vader_sentiment
positive    79455
negative    12250
neutral      7295
Name: count, dtype: int64
------------------------------
Training set size: 63360
Validation set size: 15840
Test set size: 19800
------------------------------


## PyTorch Dataset for Fine-Tuning

In [34]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- Initialize Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

# --- Create Datasets ---
train_dataset = SentimentDataset(
    texts=df_train.text.to_numpy(),
    labels=df_train.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=config.MAX_LENGTH
)

val_dataset = SentimentDataset(
    texts=df_val.text.to_numpy(),
    labels=df_val.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=config.MAX_LENGTH
)

test_dataset = SentimentDataset(
    texts=df_test.text.to_numpy(),
    labels=df_test.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=config.MAX_LENGTH
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Handle Class Imbalance with Weighted Loss

In [35]:
print("Calculating class weights for handling imbalance...")
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(df_train.label),
    y=df_train.label.to_numpy()
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(config.DEVICE)
print(f"Calculated Class Weights: {weights_tensor.cpu().numpy()}")
print("-" * 30)

Calculating class weights for handling imbalance...
Calculated Class Weights: [0.41533107 2.6938775  4.5234528 ]
------------------------------


## Define and Train the Full Model

In [36]:
# --- Load Model for Sequence Classification ---
model = AutoModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    num_labels=len(label_map)
).to(config.DEVICE)

# --- Dataloaders ---
train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE)

# --- Optimizer and Scheduler ---
optimizer = AdamW(model.parameters(), lr=config.LR)
total_steps = len(train_dataloader) * config.EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
# --- Instantiate Loss Function with Class Weights ---
loss_fn = nn.CrossEntropyLoss(weight=weights_tensor).to(config.DEVICE)

# --- Training Loop ---
print("Starting end-to-end fine-tuning...")
for epoch in range(config.EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{config.EPOCHS}"):
        input_ids = batch['input_ids'].to(config.DEVICE)
        attention_mask = batch['attention_mask'].to(config.DEVICE)
        labels = batch['labels'].to(config.DEVICE)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Calculate custom weighted loss
        loss = loss_fn(outputs.logits, labels)
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} | Average Training Loss: {avg_train_loss:.4f}")

    # --- Validation after each epoch ---
    print("Evaluating on the validation set...")
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating"):
            input_ids = batch['input_ids'].to(config.DEVICE)
            attention_mask = batch['attention_mask'].to(config.DEVICE)
            labels = batch['labels'].to(config.DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # --- HIGHLIGHT ACCURACY ---
    val_accuracy = accuracy_score(all_labels, all_preds)
    reverse_label_map = {v: k for k, v in label_map.items()}
    target_names = [reverse_label_map[i] for i in range(len(label_map))]
    print(classification_report(all_labels, all_preds, target_names=target_names))
    print("="*50)
    print(f" Validation Accuracy for Epoch {epoch + 1}: {val_accuracy:.4f}")
    print("="*50)
    print("-" * 30)

print("Training complete.")
print("-" * 30)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting end-to-end fine-tuning...


Epoch 1/2:   0%|          | 0/3960 [00:00<?, ?it/s]

Epoch 1 | Average Training Loss: 0.5458
Evaluating on the validation set...


Validating:   0%|          | 0/990 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    positive       0.97      0.94      0.96     12713
    negative       0.70      0.83      0.76      1960
     neutral       0.78      0.82      0.80      1167

    accuracy                           0.92     15840
   macro avg       0.82      0.86      0.84     15840
weighted avg       0.93      0.92      0.92     15840

 Validation Accuracy for Epoch 1: 0.9179
------------------------------


Epoch 2/2:   0%|          | 0/3960 [00:00<?, ?it/s]

Epoch 2 | Average Training Loss: 0.4367
Evaluating on the validation set...


Validating:   0%|          | 0/990 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    positive       0.97      0.96      0.96     12713
    negative       0.75      0.81      0.78      1960
     neutral       0.88      0.79      0.83      1167

    accuracy                           0.93     15840
   macro avg       0.87      0.86      0.86     15840
weighted avg       0.93      0.93      0.93     15840

 Validation Accuracy for Epoch 2: 0.9317
------------------------------
Training complete.
------------------------------


## Final Evaluation on Test Set

In [37]:
print("Performing final evaluation on the test set...")
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(config.DEVICE)
        attention_mask = batch['attention_mask'].to(config.DEVICE)
        labels = batch['labels'].to(config.DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# --- Print Classification Report for Test Set ---
print("\nTest Set Classification Report:")
target_names = [reverse_label_map[i] for i in range(len(label_map))]
print(classification_report(all_labels, all_preds, target_names=target_names))

test_accuracy = accuracy_score(all_labels, all_preds)
print("="*50)
print(f"Final Test Set Accuracy: {test_accuracy:.4f}")
print("="*50)


Performing final evaluation on the test set...


Testing:   0%|          | 0/1238 [00:00<?, ?it/s]


Test Set Classification Report:
              precision    recall  f1-score   support

    positive       0.96      0.96      0.96     15891
    negative       0.76      0.81      0.78      2450
     neutral       0.88      0.78      0.82      1459

    accuracy                           0.93     19800
   macro avg       0.87      0.85      0.86     19800
weighted avg       0.93      0.93      0.93     19800

Final Test Set Accuracy: 0.9305


## Inference

In [38]:
print("\n" + "="*30)
print("Inference Section")
print("="*30)

# Create a reverse map to decode predictions
reverse_label_map = {v: k for k, v in label_map.items()}

def predict_sentiment(texts, model, tokenizer, device, max_len=128):
    """
    Predicts the sentiment for a list of texts.

    Args:
        texts (list of str): The new texts to analyze.
        model: The fine-tuned model.
        tokenizer: The tokenizer.
        device: The device to run the model on (e.g., 'cuda' or 'cpu').
        max_len (int): The maximum sequence length.

    Returns:
        list of str: The predicted sentiment labels.
    """
    model.eval() # Set the model to evaluation mode

    predictions = []

    with torch.no_grad(): # No need to calculate gradients for inference
        for text in texts:
            encoding = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=max_len,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )

            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Get the prediction with the highest probability
            pred_index = torch.argmax(logits, dim=1).item()
            predicted_label = reverse_label_map[pred_index]
            predictions.append(predicted_label)

    return predictions

# --- Example Usage ---
new_reviews = [
    "This app is absolutely fantastic! It works perfectly and has all the features I need.",
    "I'm really disappointed with the latest update. It's buggy and crashes all the time.",
    "The application is okay, it does what it says but nothing more.",
    "I hate this, it's the worst experience I've ever had.",
    "Wow, I love it! So useful and easy to use.",
    "just it"
]

predicted_sentiments = predict_sentiment(new_reviews, model, tokenizer, config.DEVICE, config.MAX_LENGTH)

for review, sentiment in zip(new_reviews, predicted_sentiments):
    print(f"Review: '{review}'\nPredicted Sentiment: {sentiment}\n")



Inference Section
Review: 'This app is absolutely fantastic! It works perfectly and has all the features I need.'
Predicted Sentiment: positive

Review: 'I'm really disappointed with the latest update. It's buggy and crashes all the time.'
Predicted Sentiment: negative

Review: 'The application is okay, it does what it says but nothing more.'
Predicted Sentiment: positive

Review: 'I hate this, it's the worst experience I've ever had.'
Predicted Sentiment: negative

Review: 'Wow, I love it! So useful and easy to use.'
Predicted Sentiment: positive

Review: 'just it'
Predicted Sentiment: neutral

