In [1]:
# Import dependencies & libraries
from google.colab import files
import pandas as pd
from scipy.stats import linregress
from sklearn.preprocessing import StandardScaler
import torch


In [2]:
# Upload file train.csv & test.csv sebelum menjalankan kode berikutnya
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [12]:
#Cleaning starts here
"""
dataset cleaned : missing values kolom popularity diisikan mean,
kolom instrumentalness dan key diisi dengan prediksi berdasarkan korelasi
dengan variabel lain menggunakan KNN Imputation.
"""

# Load data
train_df = pd.read_csv('train.csv')
test_df= pd.read_csv('test.csv')

# ----------------!!!-----------------------
# Hitung jumlah missing value per kolom
missing_values_train = train_df.isna().sum()

# Tampilkan jumlah missing value
print(missing_values_train)

# Jika ingin melihat hanya kolom yang memiliki missing value
missing_values_train = missing_values_train[missing_values_train > 0]
print("Kolom pada training set dengan missing values:")
print(missing_values_train)

# ----------------!!!-----------------------
# Hitung jumlah missing value per kolom
missing_values_test = test_df.isna().sum()

# Tampilkan jumlah missing value
print(missing_values_test)

# Jika ingin melihat hanya kolom yang memiliki missing value
missing_values_test = missing_values_test[missing_values_test > 0]
print("Kolom pada test set dengan missing values:")
print(missing_values_test)

# Hapus kolom 'dengan missing values
cleaned_train_df = train_df.drop(["Popularity", "instrumentalness", "key"], axis=1)
cleaned_test_df = test_df.drop(["Popularity", "instrumentalness", "key"], axis=1)

# Simpan dataset
cleaned_train_df.to_csv('train_cleaned.csv', index=False)
cleaned_test_df.to_csv('test_cleaned.csv', index=False)

Artist Name              0
Track Name               0
Popularity             428
danceability             0
energy                   0
key                   2014
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      4377
liveness                 0
valence                  0
tempo                    0
duration_in min/ms       0
time_signature           0
Class                    0
dtype: int64
Kolom pada training set dengan missing values:
Popularity           428
key                 2014
instrumentalness    4377
dtype: int64
Artist Name              0
Track Name               0
Popularity             227
danceability             0
energy                   0
key                    808
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      1909
liveness                 0
valence                  0
tempo                    0
duration_in min

In [11]:
# (Optional) download file csv yg sudah dibersihkan
files.download("train_cleaned.csv")
files.download("test_cleaned.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# print(train_df.head())  # Tampilkan beberapa baris pertama

# print(train_df.iloc[:, 2:].head())  # Tampilkan kolom yang dipilih

# print(train_df.dtypes)  # Periksa tipe data di tiap kolom

In [13]:
# Formating input & output on train set starts here

# Pisahkan kolom berdasarkan tipe datanya
columns_int64 = cleaned_train_df.select_dtypes(include=['int64']).columns  # Semua kolom bertipe int64
columns_float64 = cleaned_train_df.select_dtypes(include=['float64']).columns  # Semua kolom bertipe float64

# Inisialisasi object StandardScaler()
scaler = StandardScaler()

# Normalisasi kolom numerikal
normalized_float_array = scaler.fit_transform(cleaned_train_df[columns_float64])

# Konversi kolom int64 & float64 ke bentuk PyTorch tensor berdasarkan tipe data
int_tensor = torch.tensor(cleaned_train_df[columns_int64].values, dtype=torch.int64)  # Untuk int64
float_tensor = torch.tensor(normalized_float_array, dtype=torch.float64)  # Untuk float64

# Pastikan banyak baris pada int_tensor & float_tensor sama
assert int_tensor.shape[0] == float_tensor.shape[0]

# Gabungkan tensor int64 & float64
combined_tensor = torch.cat((float_tensor, int_tensor), dim=1)

# Print hasil gabungan
print("Combined tensor:\n", combined_tensor)


Combined tensor:
 tensor([[ 1.8679, -0.4197,  0.7277,  ...,  1.0000,  4.0000,  5.0000],
        [-0.9709,  0.6425,  0.1681,  ...,  1.0000,  4.0000, 10.0000],
        [-0.6582, -0.2072, -0.1046,  ...,  1.0000,  4.0000,  6.0000],
        ...,
        [ 0.0876,  1.3520,  0.7971,  ...,  0.0000,  4.0000,  8.0000],
        [-1.9754,  0.6043, -1.1969,  ...,  0.0000,  4.0000,  8.0000],
        [-0.8627,  0.8082,  0.6398,  ...,  0.0000,  4.0000, 10.0000]],
       dtype=torch.float64)


In [14]:
# Formating input on test set starts here

# Pisahkan kolom berdasarkan tipe datanya
columns_int64 = cleaned_test_df.select_dtypes(include=['int64']).columns  # Semua kolom bertipe int64
columns_float64 = cleaned_test_df.select_dtypes(include=['float64']).columns  # Semua kolom bertipe float64

# Inisialisasi object StandardScaler()
scaler = StandardScaler()

# Normalisasi kolom numerikal
normalized_float_array = scaler.fit_transform(cleaned_test_df[columns_float64])

# Konversi kolom int64 & float64 ke bentuk PyTorch tensor berdasarkan tipe data
int_tensor = torch.tensor(cleaned_test_df[columns_int64].values, dtype=torch.int64)  # Untuk int64
float_tensor = torch.tensor(normalized_float_array, dtype=torch.float64)  # Untuk float64

# Pastikan banyak baris pada int_tensor & float_tensor sama
assert int_tensor.shape[0] == float_tensor.shape[0]

# Gabungkan tensor int64 & float64
combined_tensor = torch.cat((float_tensor, int_tensor), dim=1)

# Print hasil gabungan
print("Combined tensor:\n", combined_tensor)


Combined tensor:
 tensor([[-1.4440, -1.1093, -1.4623,  ...,  1.0486,  1.0000,  4.0000],
        [-0.2251,  1.2416,  0.6986,  ...,  0.1856,  1.0000,  4.0000],
        [ 0.9211,  0.1939,  0.7119,  ...,  0.0313,  1.0000,  4.0000],
        ...,
        [ 1.5579,  1.0287,  0.1554,  ..., -1.7632,  0.0000,  4.0000],
        [ 1.5336,  1.1351,  0.1195,  ...,  0.1936,  1.0000,  4.0000],
        [ 0.4966,  0.8668,  0.8659,  ...,  0.0044,  0.0000,  4.0000]],
       dtype=torch.float64)
