<a href="https://colab.research.google.com/github/tuwidzz/sentimentanalysis/blob/main/sentimentanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

tf.__version__

'2.17.1'

In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf

# Hyperparameter configuration
EPOCH = 50  # Reduce epochs to prevent overfitting
BATCH = 32
LEARNING_RATE = 2e-5  # Lower learning rate for fine-tuning
MODEL_PATH = "models/my-albert-202501031441.h5"  # Path to save model weights

# Ensure the directory exists
os.makedirs('models', exist_ok=True)

# Baca dataset dari CSV
file_path = 'dataset/pilkada_sentiment_dataset.csv'
df = pd.read_csv(file_path, on_bad_lines='skip')

# Persiapkan data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].values,
    df['sentiment'].values,
    test_size=0.2,
    random_state=42
)

# Unduh ALBERT Pre-trained Model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')

# Preprocessing Data
max_length = 500

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=max_length, return_tensors='tf')

# Ekstrak Array NumPy
train_input_ids = train_encodings['input_ids'].numpy()
train_attention_mask = train_encodings['attention_mask'].numpy()

test_input_ids = test_encodings['input_ids'].numpy()
test_attention_mask = test_encodings['attention_mask'].numpy()

# Konversi label sentimen menjadi bentuk numerik
label_mapping = {
    'very positive': 0,
    'positive': 1,
    'neutral': 2,
    'negative': 3,
    'very negative': 4
}
train_labels_numeric = [label_mapping.get(label, 0) for label in train_labels]
test_labels_numeric = [label_mapping.get(label, 0) for label in test_labels]

# Buat tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices(((train_input_ids, train_attention_mask), train_labels_numeric))
test_dataset = tf.data.Dataset.from_tensor_slices(((test_input_ids, test_attention_mask), test_labels_numeric))

# Training Model
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(train_dataset.batch(BATCH), epochs=EPOCH)

# Evaluasi Model
eval_results = model.evaluate(test_dataset.batch(BATCH))
print("Test loss:", eval_results[0])
print("Test accuracy:", eval_results[1])

# Prediksi dengan Model yang Telah Dilatih
new_texts = ['Bangga sekali Calon ini bisa Merangkul', 'Keren Sekali Visi Misinya', 'Hebat Sekali Kerjanya']
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=max_length, return_tensors='tf')

new_input_ids = new_encodings['input_ids'].numpy()
new_attention_mask = new_encodings['attention_mask'].numpy()

predictions = model.predict([new_input_ids, new_attention_mask])
logits = predictions.logits
predicted_labels = tf.argmax(logits, axis=1).numpy()
predicted_sentiments = [list(label_mapping.keys())[list(label_mapping.values()).index(label)] for label in predicted_labels]
print("Predicted sentiments:", predicted_sentiments)

# Simpan model di folder models
model.save_weights(MODEL_PATH)
print(f"Model weights saved to {MODEL_PATH}")

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test loss: nan
Test accuracy: 0.23157894611358643
Predicted sentiments: ['very positive', 'very positive', 'very positive']
Model weights saved to models/my-albert-202501031441.h5
