In [68]:
# Import dependencies & libraries
from google.colab import files
import pandas as pd
from scipy.stats import linregress
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [69]:
# Upload file train.csv & test.csv sebelum menjalankan kode berikutnya
uploaded = files.upload()

In [70]:
#Cleaning starts here
"""
dataset cleaning:
1) isi missing values pada kolom popularity dengan mean
2) drop kolom instrumentalness & key dengan missing values
"""

# Load data
train_df = pd.read_csv('train.csv')
test_df= pd.read_csv('test.csv')

# ----------------!!!-----------------------
# Hitung jumlah missing value per kolom
missing_values_train = train_df.isna().sum()

# Tampilkan jumlah missing value
print(missing_values_train)

# Jika ingin melihat hanya kolom yang memiliki missing value
missing_values_train = missing_values_train[missing_values_train > 0]
print("Kolom pada training set dengan missing values:")
print(missing_values_train)

# ----------------!!!-----------------------
# Hitung jumlah missing value per kolom
missing_values_test = test_df.isna().sum()

# Tampilkan jumlah missing value
print(missing_values_test)

# Jika ingin melihat hanya kolom yang memiliki missing value
missing_values_test = missing_values_test[missing_values_test > 0]
print("Kolom pada test set dengan missing values:")
print(missing_values_test)

# Isi missing value pada kolom Popularity dengan mean
train_df['Popularity'] = train_df['Popularity'].fillna(round(train_df['Popularity'].mean(), 1))

# Hapus kolom dengan missing values
cleaned_train_df = train_df.drop(["instrumentalness", "key"], axis=1)
cleaned_test_df = test_df.drop(["instrumentalness", "key"], axis=1)

# Simpan dataset
cleaned_train_df.to_csv('train_cleaned.csv', index=False)
cleaned_test_df.to_csv('test_cleaned.csv', index=False)

Artist Name              0
Track Name               0
Popularity             428
danceability             0
energy                   0
key                   2014
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      4377
liveness                 0
valence                  0
tempo                    0
duration_in min/ms       0
time_signature           0
Class                    0
dtype: int64
Kolom pada training set dengan missing values:
Popularity           428
key                 2014
instrumentalness    4377
dtype: int64
Artist Name              0
Track Name               0
Popularity             227
danceability             0
energy                   0
key                    808
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      1909
liveness                 0
valence                  0
tempo                    0
duration_in min

In [71]:
# (Optional) download file csv yg sudah dibersihkan
files.download("train_cleaned.csv")
files.download("test_cleaned.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [72]:
# Formating input & output on train set starts here

# Pisahkan kolom berdasarkan tipe datanya
columns_int64 = cleaned_train_df.select_dtypes(include=['int64']).columns  # Semua kolom bertipe int64
columns_float64 = cleaned_train_df.select_dtypes(include=['float64']).columns  # Semua kolom bertipe float64

# Inisialisasi object StandardScaler()
scaler = StandardScaler()

# Normalisasi kolom numerikal
normalized_float_array = scaler.fit_transform(cleaned_train_df[columns_float64])

# Konversi kolom int64 & float64 ke bentuk PyTorch tensor berdasarkan tipe data
int_tensor = torch.tensor(cleaned_train_df[columns_int64].values, dtype=torch.int64)  # Untuk int64
float_tensor = torch.tensor(normalized_float_array, dtype=torch.float64)  # Untuk float64

# Pastikan banyak baris pada int_tensor & float_tensor sama
assert int_tensor.shape[0] == float_tensor.shape[0]

# Gabungkan tensor int64 & float64
combined_tensor_train = torch.cat((float_tensor, int_tensor), dim=1)

# Print hasil gabungan
print("Combined tensor:\n", combined_tensor_train)


Combined tensor:
 tensor([[ 0.8995,  1.8679, -0.4197,  ...,  1.0000,  4.0000,  5.0000],
        [ 0.5511, -0.9709,  0.6425,  ...,  1.0000,  4.0000, 10.0000],
        [-0.5524, -0.6582, -0.2072,  ...,  1.0000,  4.0000,  6.0000],
        ...,
        [-0.6105,  0.0876,  1.3520,  ...,  0.0000,  4.0000,  8.0000],
        [-0.9009, -1.9754,  0.6043,  ...,  0.0000,  4.0000,  8.0000],
        [-0.0878, -0.8627,  0.8082,  ...,  0.0000,  4.0000, 10.0000]],
       dtype=torch.float64)


In [73]:
# Formating input on test set starts here

# Pisahkan kolom berdasarkan tipe datanya
columns_int64 = cleaned_test_df.select_dtypes(include=['int64']).columns  # Semua kolom bertipe int64
columns_float64 = cleaned_test_df.select_dtypes(include=['float64']).columns  # Semua kolom bertipe float64

# Inisialisasi object StandardScaler()
scaler = StandardScaler()

# Normalisasi kolom numerikal
normalized_float_array = scaler.fit_transform(cleaned_test_df[columns_float64])

# Konversi kolom int64 & float64 ke bentuk PyTorch tensor berdasarkan tipe data
int_tensor = torch.tensor(cleaned_test_df[columns_int64].values, dtype=torch.int64)  # Untuk int64
float_tensor = torch.tensor(normalized_float_array, dtype=torch.float64)  # Untuk float64

# Pastikan banyak baris pada int_tensor & float_tensor sama
assert int_tensor.shape[0] == float_tensor.shape[0]

# Gabungkan tensor int64 & float64
combined_tensor_test = torch.cat((float_tensor, int_tensor), dim=1)

# Print hasil gabungan
print("Combined tensor:\n", combined_tensor_test)


Combined tensor:
 tensor([[ 1.6110e+00, -1.4440e+00, -1.1093e+00,  ...,  1.0486e+00,
          1.0000e+00,  4.0000e+00],
        [-6.3033e-01, -2.2505e-01,  1.2416e+00,  ...,  1.8557e-01,
          1.0000e+00,  4.0000e+00],
        [ 1.8983e+00,  9.2113e-01,  1.9392e-01,  ...,  3.1259e-02,
          1.0000e+00,  4.0000e+00],
        ...,
        [-4.0045e-01,  1.5579e+00,  1.0287e+00,  ..., -1.7632e+00,
          0.0000e+00,  4.0000e+00],
        [ 3.4665e-01,  1.5336e+00,  1.1351e+00,  ...,  1.9359e-01,
          1.0000e+00,  4.0000e+00],
        [ 1.8348e-03,  4.9662e-01,  8.6682e-01,  ...,  4.3984e-03,
          0.0000e+00,  4.0000e+00]], dtype=torch.float64)


In [74]:
# Definisi kelas Dataset
class MyDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X  # Fitur
        self.Y = Y  # Label/target

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

# Pisahkan fitur (X) dan label (Y)
Xtrain = combined_tensor_train[:, :-1]
Ytrain = combined_tensor_train[:, -1]

Xtest = combined_tensor_test[:, :-1]
Ytest = combined_tensor_test[:, -1]

Ytrain = train_df['Class']  # Kolom kelas sebagai target
Xtrain = train_df.drop(columns=['Class'])  # Drop kolom kelas dari fitur

# Buat objek dataset
train_dataset = MyDataset(Xtrain, Ytrain)
test_dataset = MyDataset(Xtest, Ytest)

In [75]:
# Gunakan DataLoader untuk batching
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [76]:
 # Pembangunan Model
class MyModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(MyModel, self).__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, output_size)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)  # Output tanpa fungsi aktivasi, cocok untuk regresi