In [68]:
# Import dependencies & libraries
import os

from google.colab import files
import pandas as pd
from scipy.stats import linregress
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

In [69]:
# Upload train.csv & test.csv before running the cells that follow.
# The value returned by files.upload() is kept in `uploaded`.
uploaded = files.upload()

In [71]:
# (Optional) download the cleaned CSV files.
# train_cleaned.csv / test_cleaned.csv are written by the to_csv calls in the
# cleaning cell, so they only exist after that cell has run. Missing files are
# reported and skipped instead of aborting the notebook run.
for cleaned_csv in ("train_cleaned.csv", "test_cleaned.csv"):
    if os.path.exists(cleaned_csv):
        files.download(cleaned_csv)
    else:
        print(f"{cleaned_csv} not found - run the cleaning cell first, then re-run this cell.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Fungsi u/: cleaning dataset
def clean_data(df, popularity_fill=None):
    """Return a cleaned copy of a song dataframe; `df` itself is left untouched.

    Cleaning steps:
      * the "instrumentalness" and "key" columns are dropped;
      * missing values in "Popularity" are replaced by `popularity_fill`.

    Args:
        df: DataFrame containing at least the columns "Popularity",
            "instrumentalness" and "key".
        popularity_fill: Value used for missing "Popularity" entries. When None
            (the default) the mean of `df['Popularity']`, rounded to one
            decimal, is used. Pass a value computed on the training set to
            clean another split with the same statistic.

    Returns:
        A new DataFrame with the two columns removed and "Popularity" filled.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    if popularity_fill is None:
        popularity_fill = round(df['Popularity'].mean(), 1)

    # drop() and assign() both return new frames, so the caller's `df` is never
    # modified and re-running a cell that calls this function is idempotent.
    return (
        df
        .drop(columns=["instrumentalness", "key"])
        .assign(Popularity=lambda frame: frame['Popularity'].fillna(popularity_fill))
    )

In [None]:
# Fungsi u/: formatting input & output menjadi tensor
def make_tensor(dataset, scaler=None):
    """Convert the numeric columns of `dataset` into one float64 tensor.

    Only columns of dtype float64 and int64 are used; columns of any other
    dtype are ignored. The float64 columns are passed through a scaler and
    placed first; the int64 columns follow unscaled, in their original
    relative order.

    Args:
        dataset: pandas DataFrame to convert.
        scaler: Optional, already fitted scaler exposing `transform`. When
            given, it is applied to the float64 columns as-is, which allows
            validation/test data to be scaled with training-set statistics.
            When None (the default) a new StandardScaler is fitted on
            `dataset` itself.

    Returns:
        A torch.float64 tensor of shape (rows, n_float_columns + n_int_columns).
    """
    # Split the columns by dtype
    columns_int64 = dataset.select_dtypes(include=['int64']).columns
    columns_float64 = dataset.select_dtypes(include=['float64']).columns

    # Normalise the float columns
    if scaler is None:
        scaler = StandardScaler()
        normalized_float_array = scaler.fit_transform(dataset[columns_float64])
    else:
        normalized_float_array = scaler.transform(dataset[columns_float64])

    # Build both parts directly as float64 so the concatenation below does not
    # depend on torch.cat promoting mixed int64/float64 inputs.
    float_tensor = torch.tensor(normalized_float_array, dtype=torch.float64)
    int_tensor = torch.tensor(dataset[columns_int64].values, dtype=torch.float64)

    # Both parts must describe the same rows
    assert int_tensor.shape[0] == float_tensor.shape[0]

    # Concatenate column-wise: float columns first, int columns last
    return torch.cat((float_tensor, int_tensor), dim=1)

In [None]:
# Fungsi membagi train set menjadi train & validation set
def split_train_validation(df):
    """Split `df` into a training part (80%) and a validation part (20%).

    The split is stratified on the 'Class' column and uses a fixed
    random_state of 42. The number of rows in each part is printed.

    Returns:
        Tuple (training part, validation part).
    """
    parts = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['Class'],
    )
    training_part, validation_part = parts

    # Report how many rows ended up in each part
    print("Jumlah data training:", len(training_part))
    print("Jumlah data validation:", len(validation_part))

    return training_part, validation_part

In [70]:

# Load the raw CSV files
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# --- Missing-value report: training set ---
# Count of missing values for every column
na_per_column_train = train_df.isna().sum()
print(na_per_column_train)

# Only the columns that actually contain missing values
print("Kolom pada training set dengan missing values:")
missing_values_train = na_per_column_train[na_per_column_train > 0]
print(missing_values_train)

# --- Missing-value report: test set ---
# Count of missing values for every column
na_per_column_test = test_df.isna().sum()
print(na_per_column_test)

# Only the columns that actually contain missing values
print("Kolom pada test set dengan missing values:")
missing_values_test = na_per_column_test[na_per_column_test > 0]
print(missing_values_test)

# Clean both datasets
cleaned_train_df = clean_data(train_df)
cleaned_test_df = clean_data(test_df)

# Persist the cleaned datasets
for cleaned_frame, csv_name in (
    (cleaned_train_df, 'train_cleaned.csv'),
    (cleaned_test_df, 'test_cleaned.csv'),
):
    cleaned_frame.to_csv(csv_name, index=False)

# Split the cleaned training data into training and validation parts
train_df, val_df = split_train_validation(cleaned_train_df)

# Convert the training, validation and test data to tensors
train_tensor, val_tensor, test_tensor = (
    make_tensor(frame) for frame in (train_df, val_df, cleaned_test_df)
)

Artist Name              0
Track Name               0
Popularity             428
danceability             0
energy                   0
key                   2014
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      4377
liveness                 0
valence                  0
tempo                    0
duration_in min/ms       0
time_signature           0
Class                    0
dtype: int64
Kolom pada training set dengan missing values:
Popularity           428
key                 2014
instrumentalness    4377
dtype: int64
Artist Name              0
Track Name               0
Popularity             227
danceability             0
energy                   0
key                    808
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      1909
liveness                 0
valence                  0
tempo                    0
duration_in min

In [74]:
# Definisi kelas Dataset
class MyDataset(Dataset):
    """Dataset that pairs a feature container `X` with a label container `Y`.

    Sample `i` is the pair (X[i], Y[i]); the dataset length is X.shape[0].
    """

    def __init__(self, X, Y):
        # Store references to the features and the labels/targets as given.
        self.X, self.Y = X, Y

    def __len__(self):
        # The number of samples is the size of X's first dimension.
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # Index features and labels with the same position.
        features = self.X[idx]
        label = self.Y[idx]
        return features, label

# Split every tensor into features (X) and label (Y).
# make_tensor places the float64 columns first and the int64 columns last, so
# the final column is taken as the label.
# NOTE(review): this assumes 'Class' is the last int64 column of every split,
# including the test set - confirm that test.csv actually contains 'Class'.
# Features are cast to float32 (the default dtype of nn.Linear parameters) and
# labels to int64 class ids.
Xtrain = train_tensor[:, :-1].float()
Ytrain = train_tensor[:, -1].long()

Xval = val_tensor[:, :-1].float()
Yval = val_tensor[:, -1].long()

Xtest = test_tensor[:, :-1].float()
Ytest = test_tensor[:, -1].long()

# Xtrain / Ytrain must stay tensors: MyDataset indexes samples with X[idx],
# which selects rows on a tensor but looks up a column on a pandas DataFrame.

# Build the dataset objects
train_set = MyDataset(Xtrain, Ytrain)
val_set = MyDataset(Xval, Yval)
test_set = MyDataset(Xtest, Ytest)

In [75]:
# Wrap each dataset in a DataLoader that yields mini-batches of 32 samples.
# Training and validation batches are shuffled; the test loader is not.
batch_size = 32
train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
valloader = DataLoader(val_set, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size)


In [76]:
 # Pembangunan Model
class MyModel(nn.Module):
    """Fully connected network: input_size -> 64 -> 32 -> 16 -> output_size.

    The three hidden layers use ReLU; the output layer applies a sigmoid, so
    every output unit lies between 0 and 1.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output units.
    """

    def __init__(self, input_size, output_size):
        super(MyModel, self).__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 16)
        # in_features must equal the 16 features produced by layer3
        self.layer4 = nn.Linear(16, output_size)

        # He (Kaiming) initialisation for the ReLU layers 1-3
        nn.init.kaiming_normal_(self.layer1.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.layer2.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.layer3.weight, nonlinearity='relu')

        # Xavier initialisation for the output layer
        nn.init.xavier_normal_(self.layer4.weight)

        # All biases start at zero
        nn.init.zeros_(self.layer1.bias)
        nn.init.zeros_(self.layer2.bias)
        nn.init.zeros_(self.layer3.bias)
        nn.init.zeros_(self.layer4.bias)

    def forward(self, x):
        """Map a (batch, input_size) tensor to a (batch, output_size) tensor."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.relu(self.layer3(x))
        # Sigmoid activation on the output layer
        return torch.sigmoid(self.layer4(x))

# Initialise the model and the optimizer.
# Input width: every column of train_tensor except the last one, which is the
# column used as the label.
n_features = train_tensor.shape[1] - 1
# One output unit per class.
# NOTE(review): assumes the class ids in the label column are 0..K-1 and that
# the largest id occurs in the training split - confirm against the data.
n_classes = int(train_tensor[:, -1].max().item()) + 1
model = MyModel(n_features, n_classes)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [None]:

def _one_hot_targets(labels, like):
    """Return `labels` as a one-hot matrix matching the model output `like`.

    1-D class-id labels are expanded to `like.shape[1]` columns; labels that
    are already 2-D are passed through. The result has the dtype of `like`.
    """
    if labels.dim() == 1:
        labels = F.one_hot(labels.long(), num_classes=like.shape[1])
    return labels.to(like.dtype)


epoch = 10
# Inputs are cast to the dtype of the model parameters before the forward pass.
param_dtype = next(model.parameters()).dtype
n_train = len(train_loader.dataset)
n_val = len(valloader.dataset)

for ei in range(epoch):
    sum_loss = 0.0

    # Training pass: forward, MSE loss against one-hot targets, SGD step
    model.train()
    for x, y in train_loader:
        x = x.to(param_dtype)
        optimizer.zero_grad()
        yt = model(x)
        loss = F.mse_loss(yt, _one_hot_targets(y, yt))
        # mse_loss returns the mean over the batch; weight it by the batch size
        # so that dividing by the sample count below gives a per-sample average.
        sum_loss += loss.item() * x.shape[0]
        loss.backward()
        optimizer.step()

    # Validation pass: accuracy of the arg-max prediction
    model.eval()
    with torch.no_grad():
        ncorrect = 0
        avg_loss = sum_loss / n_train
        for x, y in valloader:
            yt = model(x.to(param_dtype))
            lt = torch.argmax(yt, dim=1)  # predicted class
            ll = torch.argmax(_one_hot_targets(y, yt), dim=1)  # labelled class
            ncorrect += (lt == ll).sum().item()
        avg_val_acc = ncorrect / n_val * 100.0
        print(f'Epoch={ei + 1} Avg Loss={avg_loss} Avg Val Acc={avg_val_acc} \n')
