## Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from LSTM_implementation import LSTMModel
import re
from typing import List
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('dataset/NusaX-sentiment/train.csv')
val = pd.read_csv('dataset/NusaX-sentiment/valid.csv')
test = pd.read_csv('dataset/NusaX-sentiment/test.csv')

In [3]:
train.head()

Unnamed: 0,id,text,label
0,219,Nikmati cicilan 0% hingga 12 bulan untuk pemes...,neutral
1,209,Kue-kue yang disajikan bikin saya bernostalgia...,positive
2,436,Ibu pernah bekerja di grab indonesia,neutral
3,394,Paling suka banget makan siang di sini ayam sa...,positive
4,592,Pelayanan bus DAMRI sangat baik,positive


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      500 non-null    int64 
 1   text    500 non-null    object
 2   label   500 non-null    object
dtypes: int64(1), object(2)
memory usage: 11.8+ KB


## Data Preprocessing

In [5]:
class TextTokenization:
    
    def __init__(self, 
                 max_words: int = 4000,
                 max_sequence_length: int = 100,
                 oov_token: str = "<OOV>",
                 padding: str = 'post',
                 truncating: str = 'post'):

        self.max_words = max_words
        self.max_sequence_length = max_sequence_length
        self.oov_token = oov_token
        self.padding = padding
        self.truncating = truncating
        
        # Initialize tokenizer
        self.tokenizer = Tokenizer(
            num_words=max_words,
            oov_token=oov_token,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        )
        
        # Attributes to be set during fitting
        self.vocab_size = None
        self.word_index = None
        self.is_fitted = False
        
        # Store preprocessing statistics
        self.text_stats = {
            'original_lengths': [],
            'cleaned_lengths': [],
            'total_texts': 0,
            'unique_words': 0
        }
    
    def clean_text(self, text: str) -> str:
        
        if not isinstance(text, str):
            return ""
        
        # Store original length
        original_length = len(text.split())
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        text = re.sub(r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove mentions and hashtags (but keep the text part)
        text = re.sub(r'[@#]\w+', '', text)
        
        # Remove numbers (optional - you might want to keep them)
        text = re.sub(r'\d+', '', text)
        
        # Remove special characters but keep Indonesian characters and spaces
        text = re.sub(r'[^\w\s]', ' ', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Remove very short words (less than 2 characters)
        words = text.split()
        words = [word for word in words if len(word) >= 2]
        text = ' '.join(words)
        
        # Store cleaned length
        cleaned_length = len(text.split())
        self.text_stats['original_lengths'].append(original_length)
        self.text_stats['cleaned_lengths'].append(cleaned_length)
        
        return text
    
    def preprocess_texts(self, texts: List[str]) -> List[str]:
        
        cleaned_texts = []
        for i, text in enumerate(texts):
            if i % 1000 == 0 and i > 0:
                print(f"Processed {i}/{len(texts)} texts")
            
            cleaned_text = self.clean_text(text)
            cleaned_texts.append(cleaned_text)
        
        self.text_stats['total_texts'] += len(texts)
        print(f"Preprocessing completed. Total texts processed: {self.text_stats['total_texts']}")
        
        return cleaned_texts
    
    def fit(self, texts: List[str]) -> 'TextTokenization':
        
        # Preprocess texts
        cleaned_texts = self.preprocess_texts(texts)
        
        # Fit tokenizer
        self.tokenizer.fit_on_texts(cleaned_texts)
        
        # Store tokenizer information
        self.word_index = self.tokenizer.word_index
        self.vocab_size = min(len(self.word_index) + 1, self.max_words)
        self.is_fitted = True
        
        # Calculate statistics
        self.text_stats['unique_words'] = len(self.word_index)
        
        print(f"Tokenizer fitted successfully!")
        print(f"- Total unique words: {len(self.word_index)}")
        print(f"- Vocabulary size (with limit): {self.vocab_size}")
        print(f"- OOV token: {self.oov_token}")
        
        return self
    
    def transform(self, texts: List[str]) -> np.ndarray:
        
        if not self.is_fitted:
            raise ValueError("Tokenizer must be fitted before transforming. Call fit() first.")
        
        print(f"Transforming {len(texts)} texts to sequences...")
        
        # Preprocess texts
        cleaned_texts = self.preprocess_texts(texts)
        
        # Convert to sequences
        sequences = self.tokenizer.texts_to_sequences(cleaned_texts)
        
        # Pad sequences
        padded_sequences = pad_sequences(
            sequences,
            maxlen=self.max_sequence_length,
            padding=self.padding,
            truncating=self.truncating
        )
        
        print(f"Transformation completed. Output shape: {padded_sequences.shape}")
        
        return padded_sequences
    
    def fit_transform(self, texts: List[str]) -> np.ndarray:
        
        return self.fit(texts).transform(texts)

In [6]:
print("Label unique values:", train['label'].unique())
print("Number of unique labels:", len(train['label'].unique()))

tokenizer = TextTokenization(max_words=10000, max_sequence_length=1000)
encoder = LabelEncoder()
# Tokenization and encoding
def preprocess_dataset(df, tokenizer, encoder, is_train=False):
    df_copy = df.copy()
    
    if is_train:
        # For training data, fit tokenizer and encoder
        X = tokenizer.fit_transform(df_copy['text'].tolist())
        encoder.fit(df_copy['label'])
        y = encoder.transform(df_copy['label'])
    else:
        # For validation/test data, only transform
        X = tokenizer.transform(df_copy['text'].tolist())
        y = encoder.transform(df_copy['label'])
    
    return X, y


print("Preprocessing training data...")
X_train, y_train = preprocess_dataset(train, tokenizer, encoder, is_train=True)

print("Preprocessing validation data...")
X_val, y_val = preprocess_dataset(val, tokenizer, encoder, is_train=False)

print("Preprocessing test data...")
X_test, y_test = preprocess_dataset(test, tokenizer, encoder, is_train=False)

# Verify data types and shapes
print(f"\nData shapes and types:")
print(f"X_train: {X_train.shape}, dtype: {X_train.dtype}")
print(f"y_train: {y_train.shape}, dtype: {y_train.dtype}")
print(f"X_val: {X_val.shape}, dtype: {X_val.dtype}")
print(f"y_val: {y_val.shape}, dtype: {y_val.dtype}")
print(f"X_test: {X_test.shape}, dtype: {X_test.dtype}")
print(f"y_test: {y_test.shape}, dtype: {y_test.dtype}")

Label unique values: ['neutral' 'positive' 'negative']
Number of unique labels: 3
Preprocessing training data...
Preprocessing completed. Total texts processed: 500
Tokenizer fitted successfully!
- Total unique words: 2732
- Vocabulary size (with limit): 2733
- OOV token: <OOV>
Transforming 500 texts to sequences...
Preprocessing completed. Total texts processed: 1000
Transformation completed. Output shape: (500, 1000)
Preprocessing validation data...
Transforming 100 texts to sequences...
Preprocessing completed. Total texts processed: 1100
Transformation completed. Output shape: (100, 1000)
Preprocessing test data...
Transforming 400 texts to sequences...
Preprocessing completed. Total texts processed: 1500
Transformation completed. Output shape: (400, 1000)

Data shapes and types:
X_train: (500, 1000), dtype: int32
y_train: (500,), dtype: int32
X_val: (100, 1000), dtype: int32
y_val: (100,), dtype: int32
X_test: (400, 1000), dtype: int32
y_test: (400,), dtype: int32


In [None]:
# Eksperimen 1: Pengaruh Jumlah Layer LSTM
print("\n=== Eksperimen 1: Pengaruh Jumlah Layer LSTM ===")

lstm_layer_variations = [1, 2, 3]
lstm_layer_results = []

for num_layers in lstm_layer_variations:
    print(f"\nTraining LSTM with {num_layers} layers...")
    
    model = LSTMModel(
        vocab_size=tokenizer.vocab_size,
        embedding_dim=32,
        lstm_units=16,
        num_classes=len(encoder.classes_),
        num_lstm_layers=num_layers,
        bidirectional=False,  # Unidirectional untuk eksperimen ini
        dropout_rate=0.3,
        learning_rate=0.01
    )
    
    # Training
    history = model.fit(
        X_train, y_train,
        X_val, y_val,
        epochs=10,
        batch_size=32,
        verbose=True
    )
    
    # Evaluasi
    results = model.evaluate(X_test, y_test)
    
    lstm_layer_results.append({
        'num_layers': num_layers,
        'history': history,
        'test_results': results,
        'model': model
    })
    
    print(f"Results for {num_layers} layers:")
    print(f"Test Accuracy: {results['test_accuracy']:.4f}")
    print(f"Macro F1-Score: {results['macro_f1_score']:.4f}")

# Plot hasil eksperimen 1
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
for result in lstm_layer_results:
    plt.plot(result['history']['train_loss'], label=f"{result['num_layers']} layers")
plt.title('Training Loss vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.subplot(1, 3, 2)
for result in lstm_layer_results:
    plt.plot(result['history']['val_loss'], label=f"{result['num_layers']} layers")
plt.title('Validation Loss vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.subplot(1, 3, 3)
f1_scores = [result['test_results']['macro_f1_score'] for result in lstm_layer_results]
layer_counts = [result['num_layers'] for result in lstm_layer_results]
plt.bar(layer_counts, f1_scores)
plt.title('Macro F1-Score by Number of LSTM Layers')
plt.xlabel('Number of LSTM Layers')
plt.ylabel('Macro F1-Score')
plt.grid(True, axis='y')

plt.tight_layout()
plt.show()


=== Eksperimen 1: Pengaruh Jumlah Layer LSTM ===

Training LSTM with 1 layers...
Epoch 1/10 - train_loss: 1.0855 - train_acc: 0.3840 - val_loss: 1.0797 - val_acc: 0.3800
Epoch 2/10 - train_loss: 1.0789 - train_acc: 0.3500 - val_loss: 1.0779 - val_acc: 0.3800
Epoch 3/10 - train_loss: 1.0809 - train_acc: 0.3780 - val_loss: 1.0781 - val_acc: 0.3800
Epoch 4/10 - train_loss: 1.0829 - train_acc: 0.3840 - val_loss: 1.0786 - val_acc: 0.3800
Epoch 5/10 - train_loss: 1.0818 - train_acc: 0.3700 - val_loss: 1.0790 - val_acc: 0.3800
Epoch 6/10 - train_loss: 1.0805 - train_acc: 0.3760 - val_loss: 1.0785 - val_acc: 0.3800
Epoch 7/10 - train_loss: 1.0801 - train_acc: 0.3840 - val_loss: 1.0783 - val_acc: 0.3800
Epoch 8/10 - train_loss: 1.0774 - train_acc: 0.3840 - val_loss: 1.0779 - val_acc: 0.3800
Epoch 9/10 - train_loss: 1.0774 - train_acc: 0.3780 - val_loss: 1.0779 - val_acc: 0.3800
Epoch 10/10 - train_loss: 1.0782 - train_acc: 0.3880 - val_loss: 1.0779 - val_acc: 0.3800


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for 1 layers:
Test Accuracy: 0.3825
Macro F1-Score: 0.1844

Training LSTM with 2 layers...
Epoch 1/10 - train_loss: 1.0736 - train_acc: 0.4100 - val_loss: 1.1119 - val_acc: 0.3800
Epoch 2/10 - train_loss: 1.0823 - train_acc: 0.4000 - val_loss: 1.0849 - val_acc: 0.3800
Epoch 3/10 - train_loss: 1.0814 - train_acc: 0.3780 - val_loss: 1.0782 - val_acc: 0.3800
Epoch 4/10 - train_loss: 1.0786 - train_acc: 0.3720 - val_loss: 1.0779 - val_acc: 0.3800
Epoch 5/10 - train_loss: 1.0795 - train_acc: 0.3840 - val_loss: 1.0780 - val_acc: 0.3800
Epoch 6/10 - train_loss: 1.0779 - train_acc: 0.3780 - val_loss: 1.0782 - val_acc: 0.3800
Epoch 7/10 - train_loss: 1.0804 - train_acc: 0.3780 - val_loss: 1.0781 - val_acc: 0.3800
Epoch 8/10 - train_loss: 1.0799 - train_acc: 0.3840 - val_loss: 1.0785 - val_acc: 0.3800
Epoch 9/10 - train_loss: 1.0797 - train_acc: 0.3840 - val_loss: 1.0782 - val_acc: 0.3800
Epoch 10/10 - train_loss: 1.0830 - train_acc: 0.3780 - val_loss: 1.0795 - val_acc: 0.3800
