<a href="https://colab.research.google.com/github/wiwindaaulia/Tugas_UAS_NLP/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np


#Dataprosesing

In [3]:
import pandas as pd

# Baca dataset
file_path = "/content/mobil_listrik.csv"
df = pd.read_csv(file_path)

# Tampilkan beberapa baris pertama
print(df.head())

# Cek informasi dataset
print(df.info())


                  id_komentar        nama_akun                    tanggal  \
0  Ugzbll5eyrIy3-gdUUJ4AaABAg          Sqn Ldr  2023-08-06 12:54:49+00:00   
1  UgzEDUiV3OTrV943p8p4AaABAg       lushen ace  2023-08-04 12:16:23+00:00   
2  UgwqJqu6JMF4EH2CsVV4AaABAg  Fatih Al-Ayyubi  2023-08-04 10:17:57+00:00   
3  UgyYicCMR1rKwuOj2Y14AaABAg        yp office  2023-08-04 08:29:54+00:00   
4  UgxKAcLuAwZOQK6es-x4AaABAg    Lembur Kuring  2023-08-04 07:55:37+00:00   

                                       text_cleaning sentimen  
0  saran sih bikin   harga ionic sama kayak brio ...  positif  
1  problem subsidi kualitas diturunin harga dinai...  negatif  
2  baik kualitas kembang dulu baik kualitas motor...  positif  
3       model jelek kwalitas buruk harga mahal croot  negatif  
4  syarat   ngaco woy anak muda   blom punya ruma...  negatif  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1517 entries, 0 to 1516
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
--- 

In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd

# Pastikan DataFrame memiliki kolom numerik sebelum preprocessing
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

if numerical_cols:  # Cek apakah ada kolom numerik
    # Mengisi nilai yang hilang hanya pada kolom numerik dengan median
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

    # Pastikan semua nilai numerik bertipe float64
    df[numerical_cols] = df[numerical_cols].astype(float)

    # Normalisasi fitur numerik
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Encoding fitur kategorikal (hanya untuk kolom non-numerik)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Konversi ke string sebelum encoding
    label_encoders[col] = le  # Simpan encoder untuk decoding nanti

print(df.head())


   id_komentar  nama_akun  tanggal  text_cleaning  sentimen
0         1345        979     1513           1263         2
1         1222       1365     1512           1200         0
2          348        376     1511            251         2
3          984       1495     1510           1016         0
4          527        616     1509           1395         0


#Menyiapkan Data untuk Model Transformer

In [5]:
from sklearn.model_selection import train_test_split

# Pilih target dan fitur
target_col = "text_cleaning"  # Gantilah dengan kolom target yang sesuai
X = df.drop(columns=[target_col])
y = df[target_col]

# Membagi dataset menjadi 80% training dan 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Jumlah data train: {X_train.shape[0]}, data test: {X_test.shape[0]}")


Jumlah data train: 1213, data test: 304


#Membangun Model Transformer

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention

# Fungsi untuk membuat blok Transformer
def transformer_block(inputs, num_heads, ff_dim, dropout_rate=0.1):
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)(inputs, inputs)
    attn_output = Dropout(dropout_rate)(attn_output)
    attn_output = LayerNormalization(epsilon=1e-6)(inputs + attn_output)

    ffn_output = Dense(ff_dim, activation="relu")(attn_output)
    ffn_output = Dense(inputs.shape[-1])(ffn_output)
    ffn_output = Dropout(dropout_rate)(ffn_output)
    return LayerNormalization(epsilon=1e-6)(attn_output + ffn_output)

# Membangun model Transformer
def build_transformer(input_shape, num_heads=4, ff_dim=32):
    inputs = Input(shape=input_shape)
    x = transformer_block(inputs, num_heads, ff_dim)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1)(x)  # Output regresi

    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model

# Buat model dengan ukuran input sesuai dengan fitur
model = build_transformer(input_shape=(X_train.shape[1],))

# Tampilkan arsitektur model
model.summary()


#Melatih Model

In [12]:
def build_transformer(input_shape, num_heads=4, ff_dim=32):
    inputs = Input(shape=(1, input_shape))  # Mengubah shape input jadi (batch, 1, feature)

    # Blok Transformer
    x = transformer_block(inputs, num_heads, ff_dim)

    # Flatten sebelum masuk ke Dense
    x = tf.keras.layers.Flatten()(x)

    x = Dense(64, activation="relu")(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1)(x)  # Output regresi

    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model

# Bangun ulang model dengan shape input baru
model = build_transformer(X_train.shape[1])

# Tambahkan dimensi ekstra sebelum melatih model
X_train_expanded = tf.expand_dims(X_train, axis=1)
X_test_expanded = tf.expand_dims(X_test, axis=1)

# Melatih model
history = model.fit(X_train_expanded, y_train, epochs=50, batch_size=16, validation_data=(X_test_expanded, y_test))


Epoch 1/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 747975.6250 - mae: 744.3054 - val_loss: 760084.8750 - val_mae: 763.8817
Epoch 2/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 785789.1250 - mae: 764.5760 - val_loss: 749277.8125 - val_mae: 756.8422
Epoch 3/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 723799.6250 - mae: 730.6461 - val_loss: 721495.6875 - val_mae: 738.7479
Epoch 4/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 705724.9375 - mae: 715.3154 - val_loss: 672497.6250 - val_mae: 706.5395
Epoch 5/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 630841.6250 - mae: 661.4368 - val_loss: 601962.1875 - val_mae: 659.1578
Epoch 6/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 581775.9375 - mae: 629.3238 - val_loss: 515997.4688 - val_mae: 601.1434
Epoch 7/50


#Evaluasi Model

In [14]:
# Evaluasi model
loss, mae = model.evaluate(X_test_expanded, y_test)
print(f"Mean Absolute Error (MAE): {mae}")  # Tambahkan tanda kurung tutup di sini


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 179037.7031 - mae: 360.0507  
Mean Absolute Error (MAE): 358.3233642578125


#Membandingkan dengan Model Encoder-Decoder

In [16]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense

def build_encoder_decoder(input_shape):
    inputs = Input(shape=input_shape)
    encoded = LSTM(32, return_sequences=True)(inputs)
    encoded = LSTM(16)(encoded)

    decoded = Dense(32, activation="relu")(encoded)
    outputs = Dense(1)(decoded)  # Output regresi

    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model

# Perbaikan: Tambahkan dimensi waktu (timesteps = 1)
X_train_expanded = tf.expand_dims(X_train, axis=1)
X_test_expanded = tf.expand_dims(X_test, axis=1)

# Buat model Encoder-Decoder
model_enc_dec = build_encoder_decoder(input_shape=(1, X_train.shape[1]))

# Latih model
history_enc_dec = model_enc_dec.fit(X_train_expanded, y_train, epochs=50, batch_size=16, validation_data=(X_test_expanded, y_test))

# Evaluasi model
loss_enc_dec, mae_enc_dec = model_enc_dec.evaluate(X_test_expanded, y_test)
print(f"Mean Absolute Error (Encoder-Decoder): {mae_enc_dec}")


Epoch 1/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - loss: 772110.4375 - mae: 759.8014 - val_loss: 763890.4375 - val_mae: 766.3649
Epoch 2/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 752660.0625 - mae: 742.3045 - val_loss: 755659.6875 - val_mae: 760.9849
Epoch 3/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 750066.6875 - mae: 743.6945 - val_loss: 735357.9375 - val_mae: 747.7571
Epoch 4/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 772148.3125 - mae: 754.5152 - val_loss: 712486.1875 - val_mae: 732.8893
Epoch 5/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 695352.8750 - mae: 708.9644 - val_loss: 687228.9375 - val_mae: 716.2730
Epoch 6/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 719922.1250 - mae: 719.7678 - val_loss: 659348.0000 - val_mae: 697.7869
Epoch 7/50
[1