In [None]:
""" Train and Predict for Multiclass Sentiment Analysis """
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import numpy as np
import os

# File paths and configurations
FILE_PATH = 'datasets/datasets.csv'      # CSV read with pandas; must have 'reviews' and 'sentiment' columns
MODEL_PATH = "model/my-albert.h5"        # where the fine-tuned weights are written
BASE_PRETRAINED_MODEL = 'albert-base-v2'
# Passed to the tokenizer as `max_length`; longer reviews are truncated to it.
MAX_LENGTH = 512
EPOCH = 10            # upper bound on epochs; early stopping may end training sooner
BATCH = 32
LEARNING_RATE = 2e-5  # Adam learning rate
# Sentiment string -> integer class id used as the training target.
LABEL_MAPPING = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
# Derived from the mapping so the classifier size cannot drift out of sync
# with the label set when a sentiment is added or removed.
NUM_LABELS = len(LABEL_MAPPING)

# Ensure directory exists for saving model weights.
# os.path.dirname() returns '' for a bare file name and os.makedirs('') raises,
# so only create the directory when the path actually has one.
_model_dir = os.path.dirname(MODEL_PATH)
if _model_dir:
    os.makedirs(_model_dir, exist_ok=True)

# Load dataset
df = pd.read_csv(FILE_PATH)

# Validate label mapping: every value in the 'sentiment' column must have a
# class id, otherwise the label conversion further down would fail.
_unknown_labels = set(df['sentiment'].unique()) - set(LABEL_MAPPING)
if _unknown_labels:
    raise ValueError("Dataset contains labels not defined in LABEL_MAPPING.")

# Split data: 80% train / 20% test with a fixed seed.
# The split is stratified on the sentiment so both parts keep the same class
# proportions; this keeps every class present for the class-weight and
# classification-report steps that follow.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['reviews'].values,
    df['sentiment'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'].values,
)

# Load the pretrained tokenizer and a sequence-classification model that is
# configured with num_labels=NUM_LABELS.
tokenizer = AlbertTokenizer.from_pretrained(BASE_PRETRAINED_MODEL)
model = TFAlbertForSequenceClassification.from_pretrained(
    BASE_PRETRAINED_MODEL,
    num_labels=NUM_LABELS,
)

# Tokenize both splits with the same settings: truncate to MAX_LENGTH, pad
# within each call, and return TensorFlow tensors.
_tokenize_options = dict(
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors='tf',
)
train_encodings = tokenizer(list(train_texts), **_tokenize_options)
test_encodings = tokenizer(list(test_texts), **_tokenize_options)

# Translate sentiment strings into their integer class ids
train_labels_numeric = np.array(list(map(LABEL_MAPPING.__getitem__, train_labels)))
test_labels_numeric = np.array(list(map(LABEL_MAPPING.__getitem__, test_labels)))

# Compute class weights to handle imbalanced data.
# Only the class ids that actually occur in the training labels are passed in,
# and compute_class_weight returns one weight per entry of `classes`, in order.
_present_classes = np.unique(train_labels_numeric)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=_present_classes,
    y=train_labels_numeric
)
# Key each weight by its real class id rather than by its position in the
# array: if a class is missing from the training split, positions and ids no
# longer line up and enumerate() would attach weights to the wrong classes.
class_weights_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(_present_classes, class_weights)
}

# Prepare datasets.
# Each element is ((input_ids, attention_mask), label); elements are grouped
# into batches of BATCH.
# The training set is shuffled with a buffer covering the whole split, so
# batches differ between epochs (Dataset.shuffle reshuffles on every iteration
# by default). Without this every epoch sees identical batches in identical order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((
        (train_encodings['input_ids'], train_encodings['attention_mask']),
        train_labels_numeric
    ))
    .shuffle(buffer_size=max(len(train_labels_numeric), 1), seed=42)
    .batch(BATCH)
)

# The evaluation set keeps its order so predictions stay aligned with
# test_labels_numeric.
test_dataset = tf.data.Dataset.from_tensor_slices((
    (test_encodings['input_ids'], test_encodings['attention_mask']),
    test_labels_numeric
)).batch(BATCH)

# Stop training once val_loss has not improved for 3 consecutive epochs and
# roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Compile model: Adam optimizer, sparse categorical cross-entropy computed on
# raw logits (from_logits=True), accuracy as the reported metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# Train model.
# NOTE(review): validation_data is the test split, so early stopping selects
# weights on the same data that is later reported as test performance.
_fit_options = {
    'validation_data': test_dataset,
    'epochs': EPOCH,
    'class_weight': class_weights_dict,
    'callbacks': [early_stopping],
}
history = model.fit(train_dataset, **_fit_options)

# Evaluate on the test dataset; the result is indexed as [loss, accuracy]
eval_results = model.evaluate(test_dataset)
test_loss = eval_results[0]
print(f"Test loss: {test_loss}")
test_accuracy = eval_results[1]
print(f"Test accuracy: {test_accuracy}")

# Persist the trained weights to MODEL_PATH
model.save_weights(MODEL_PATH)

# Run the trained model on a few hand-written sample reviews
new_texts = [
    'It’s too early to tell any difference, but I don’t mind the texture.',
    'My skin looks much brighter and feels soft. I’m really impressed with this serum.',
    'It’s okay but not worth the price. There are better alternatives out there.'
]
new_encodings = tokenizer(
    new_texts,
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors='tf',
)

# Inputs are passed in the order [input_ids, attention_mask]
_new_inputs = [new_encodings['input_ids'], new_encodings['attention_mask']]
predictions = model.predict(_new_inputs)
# The predicted class id is the index of the largest logit in each row
predicted_labels = tf.argmax(predictions.logits, axis=1).numpy()

# Invert LABEL_MAPPING (class id -> sentiment name) and look up each prediction
inverse_label_mapping = dict(zip(LABEL_MAPPING.values(), LABEL_MAPPING.keys()))
predicted_sentiments = list(map(inverse_label_mapping.__getitem__, predicted_labels))

# Show every sample next to its predicted sentiment
for text, sentiment in zip(new_texts, predicted_sentiments):
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

# Classification report on the test set
_test_outputs = model.predict([test_encodings['input_ids'], test_encodings['attention_mask']])
test_preds = tf.argmax(_test_outputs.logits, axis=1).numpy()
print("\nClassification Report on Test Set:")
# `labels` is given explicitly so each class id is paired with its name from
# LABEL_MAPPING. Without it, classification_report infers the classes from the
# data and raises ValueError when a class is absent from both the test labels
# and the predictions, because target_names would no longer match in length.
# zero_division=0 reports 0 for such a class without emitting a warning.
print(classification_report(
    test_labels_numeric,
    test_preds,
    labels=list(LABEL_MAPPING.values()),
    target_names=list(LABEL_MAPPING.keys()),
    zero_division=0,
))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.01838313415646553
Test accuracy: 1.0
Text: It’s too early to tell any difference, but I don’t mind the texture.
Predicted Sentiment: Neutral

Text: My skin looks much brighter and feels soft. I’m really impressed with this serum.
Predicted Sentiment: Positive

Text: It’s okay but not worth the price. There are better alternatives out there.
Predicted Sentiment: Neutral


Classification Report on Test Set:
              precision    recall  f1-score   support

    Positive       1.00      1.00      1.00        28
     Neutral       1.00      1.00      1.00        24
    Negative       1.00      1.00      1.00        23

    accuracy                           1.00        75
   macro avg       1.00      1.00      1.00        75
weighted avg       1.00      1.00      1.00        75



In [None]:
"""Load the saved ALBERT weights and predict sentiments for sample reviews."""

from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf

# Inference configuration
# Sentiment string -> integer class id; the model's predicted ids are decoded
# back to names through this mapping.
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
# Alternative six-class mapping, kept for reference:
# label_mapping = {'Very Positive': 0, 'Very Negative': 1, 'Mixed': 2, 'Positive': 3, 'Negative': 4, 'Neutral': 5}
# albert-base-v2 supports at most 512 positions and the training cell also
# tokenizes with max_length=512. A larger value (previously 1000) would let
# reviews longer than the model can handle pass through truncation.
MAX_LENGTH = 512
# Follows the active mapping so the classifier size always matches the labels.
NUM_LABELS = len(label_mapping)
MODEL_PATH = 'model/my-albert.h5'
BASE_PRETRAINED_MODEL = 'albert-base-v2'


# Fetch the pretrained ALBERT tokenizer and classifier, then overwrite the
# model's weights with the ones stored at MODEL_PATH.
tokenizer = AlbertTokenizer.from_pretrained(BASE_PRETRAINED_MODEL)
model = TFAlbertForSequenceClassification.from_pretrained(BASE_PRETRAINED_MODEL, num_labels=NUM_LABELS)
model.load_weights(MODEL_PATH) # type: ignore

# Sample reviews to classify
new_texts = ["This moisturizer is fantastic! It leaves my skin feeling hydrated and glowing all day. I’ve never been so happy with a product before.",
             "Absolutely love this serum! My skin feels soft and looks radiant. I’ve already recommended it to all my friends!",
             "The product is okay. It moisturizes well, but I didn’t notice any significant improvement in my skin texture.",
             "This product made my skin break out terribly. I wouldn’t recommend it to anyone with sensitive skin.",
             "It’s not bad, but it’s not amazing either. It’s a decent product if you’re looking for something basic."]
new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# Convert the TensorFlow tensors returned by the tokenizer to NumPy arrays
new_input_ids = new_encodings['input_ids'].numpy()
new_attention_mask = new_encodings['attention_mask'].numpy()


# Take the logits from the TFSequenceClassifierOutput and turn them into predictions
predictions = model.predict([new_input_ids, new_attention_mask]) # type: ignore
logits = predictions.logits
predicted_labels = tf.argmax(logits, axis=1).numpy()
# Build the class id -> sentiment name lookup once instead of rebuilding the
# key/value lists and scanning them with .index() for every prediction.
inverse_label_mapping = {class_id: name for name, class_id in label_mapping.items()}
predicted_sentiments = [inverse_label_mapping[int(label)] for label in predicted_labels]
print(f'Predicted sentiments: {predicted_sentiments}')

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted sentiments: ['Positive', 'Positive', 'Neutral', 'Negative', 'Neutral']
