In [1]:
import pandas as pd
import numpy as np 
import seaborn as sn 
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import torch 
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [3]:
tracks_df = pd.read_csv("tracks_2000.csv", index_col=[0])

In [4]:
tracks_df.head()

Unnamed: 0,title,artist,top genre,year,bpm,energy,danceability,dB,liveness,valence,duration,acousticness,speechiness,popularity
0,Flowers,Miley Cyrus,pop,2023,118,68,71,-4,3,65,200,6,7,98
1,Cupid - Twin Ver.,FIFTY FIFTY,k-pop girl group,2023,120,59,78,-8,35,73,174,44,3,97
2,BESO,ROSALÍA,pop,2023,95,64,77,-7,17,53,195,74,14,96
3,Boy's a liar Pt. 2,PinkPantheress,bronx drill,2023,133,81,70,-8,25,86,131,25,5,96
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,rap,2022,98,62,72,-6,8,17,222,42,5,96


In [5]:
x = tracks_df.loc[: ,"artist":]
y = tracks_df["popularity"]

In [6]:
x.shape

(2338, 13)

In [7]:
xtrain, xtest, ytrain, ytest=train_test_split(x, y, test_size=0.15)

In [8]:
cat_col = ['artist', 'top genre','year']
num_col = tracks_df.iloc[: , 4:].columns.to_list()

In [9]:
col_transform = ColumnTransformer([("ohe",OneHotEncoder(handle_unknown= 'ignore', sparse=False), cat_col),("normalization", MinMaxScaler(),num_col)], remainder='passthrough')

In [10]:
x_train_transformed=col_transform.fit_transform(xtrain)

In [11]:
x_test_transformed=col_transform.transform(xtest)

In [12]:
x_train_transformed.shape

(1987, 1193)

In [13]:
y.shape

(2338,)

In [14]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(x_train_transformed.shape[1],64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32,1),           
        )


    def forward(self, x):
        return self.layers(x)

net = Net()



In [15]:
for i in range (len(list(net.parameters()))):
    print(list(net.parameters())[i].shape)

torch.Size([64, 1193])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([32, 64])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([1, 32])
torch.Size([1])


In [16]:
class SongsDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype = torch.float32).clone().reshape(y.shape[0],1)

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        features = self.x[idx]
        target = self.y[idx]
        return features, target

In [17]:
train_dataset = SongsDataset(x_train_transformed, ytrain.values)
test_dataset = SongsDataset(x_test_transformed, ytest.values)

In [18]:
batch_size = 20

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [19]:
from torch.utils.tensorboard import SummaryWriter
import sys

In [20]:
writer = SummaryWriter("runs/music")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = torch.optim.Adam(net.parameters(), lr = 0.01)
criterion = nn.MSELoss()

n_total_steps = len(train_loader)

num_epochs =10
for epoch in range(num_epochs):
    running_loss = 0.0
    for features, popularity in train_loader:
        
        features = features.to(device)
        popularity = popularity.to(device)
        
        optimizer.zero_grad()
        
        outputs = net(features)
        loss = criterion(outputs, popularity)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}")
    writer.add_scalar("training_loss", running_loss/100, epoch * n_total_steps + i)

    writer.close()


Epoch 1/10, Loss: 2834.9820
Epoch 2/10, Loss: 77.6596
Epoch 3/10, Loss: 43.2156
Epoch 4/10, Loss: 29.8642
Epoch 5/10, Loss: 24.5116
Epoch 6/10, Loss: 21.1626
Epoch 7/10, Loss: 20.6873
Epoch 8/10, Loss: 18.5959
Epoch 9/10, Loss: 17.3243
Epoch 10/10, Loss: 15.6516


In [21]:
net.eval()

total_loss = 0 
with torch.no_grad():
    for features, popularity in test_loader:

        features = features.to(device)
        popularity = popularity.to(device)
        outputs = net(features)
        loss = criterion(outputs, popularity)
        print(loss)
        total_loss += loss

tensor(13.2642)
tensor(8.8720)
tensor(9.0891)
tensor(10.4043)
tensor(17.4125)
tensor(13.3261)
tensor(12.4669)
tensor(12.8462)
tensor(9.8252)
tensor(10.0000)
tensor(11.9716)
tensor(6.7809)
tensor(8.0062)
tensor(8.2824)
tensor(9.9053)
tensor(11.0431)
tensor(11.1329)
tensor(7.6373)


In [22]:
avg_loss = total_loss/len(test_loader)
avg_loss 

tensor(10.6814)

In [23]:
torch_rmse = torch.sqrt(avg_loss).detach().numpy()
print(torch_rmse)

3.2682483
