## PyTorch Linear Regression
- The data was generated with the official Spotify API and is available at:
https://www.kaggle.com/datasets/yasserh/song-popularity-dataset/

In [37]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import torch
from sklearn.model_selection import train_test_split
from torch import nn

In [38]:
# Load the raw song table from the local CSV file
songPopularity = pd.read_csv(filepath_or_buffer="data/song_data.csv")

In [39]:
# Preview the first ten rows to check the columns and value ranges
songPopularity.head(n=10)

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,Boulevard of Broken Dreams,73,262333,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474
1,In The End,66,216933,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37
2,Seven Nation Army,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324
3,By The Way,74,216933,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.0,10,0.113,-5.065,1,0.0313,172.011,4,0.574
5,Bring Me To Life,80,235893,0.00895,0.316,0.945,2e-06,4,0.396,-3.169,0,0.124,189.931,4,0.32
6,Last Resort,81,199893,0.000504,0.581,0.887,0.00111,4,0.268,-3.659,0,0.0624,90.578,4,0.724
7,Are You Gonna Be My Girl,76,213800,0.00148,0.613,0.953,0.000582,2,0.152,-3.435,1,0.0855,105.046,4,0.537
8,Mr. Brightside,80,222586,0.00108,0.33,0.936,0.0,1,0.0926,-3.66,1,0.0917,148.112,4,0.234
9,Sex on Fire,81,203346,0.00172,0.542,0.905,0.0104,9,0.136,-5.653,1,0.054,153.398,4,0.374


In [40]:
# Columns fed to the model as inputs
features = [
    'song_duration_ms',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'audio_mode',
    'speechiness',
    'tempo',
    'time_signature',
    'audio_valence',
]

# Column the model learns to predict
target = 'song_popularity'

# Separate the inputs (DataFrame) from the label (Series)
songPopularityFeatures = songPopularity.loc[:, features]
songPopularityTarget = songPopularity.loc[:, target]

In [41]:
# Utilize train_test_split from sklearn to split the data into train (80%) and test (20%) sets.
# random_state pins the shuffle so the split, and every loss reported below,
# is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    songPopularityFeatures,
    songPopularityTarget,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Convert DataFrames into PyTorch tensors
def dataframe_to_tensor(df):
    """Convert tabular data to a float32 PyTorch tensor.

    Args:
        df: A pandas DataFrame or Series, a NumPy array, or a torch.Tensor.

    Returns:
        A new ``torch.float32`` tensor holding a copy of the values.
    """
    if isinstance(df, torch.Tensor):
        # The notebook rebinds X_train/X_test/... to the converted tensors, so
        # re-running the conversion cell passes tensors back in. Copy them
        # instead of failing on `.values` (which is a method on tensors).
        return df.detach().clone().to(dtype=torch.float32)
    return torch.tensor(np.asarray(df), dtype=torch.float32)

# Convert all four splits to tensors in one pass, keeping the same variable names
X_train, X_test, y_train, y_test = (
    dataframe_to_tensor(split)
    for split in (X_train, X_test, y_train, y_test)
)

In [43]:
# Inspect the training feature tensor; the repr elides most rows and columns
X_train

tensor([[2.0559e+05, 8.1200e-02, 7.5600e-01,  ..., 1.4693e+02, 4.0000e+00,
         8.1100e-01],
        [1.7540e+05, 3.6300e-02, 3.5100e-01,  ..., 1.9567e+02, 4.0000e+00,
         8.8000e-01],
        [2.3942e+05, 7.9100e-01, 5.4400e-01,  ..., 1.3455e+02, 4.0000e+00,
         2.3600e-01],
        ...,
        [1.3200e+05, 4.0000e-01, 8.5100e-01,  ..., 8.0064e+01, 4.0000e+00,
         5.1200e-01],
        [1.7136e+05, 4.2700e-03, 5.9700e-01,  ..., 1.2634e+02, 4.0000e+00,
         4.4700e-01],
        [1.9224e+05, 1.6800e-01, 7.3400e-01,  ..., 1.0447e+02, 4.0000e+00,
         7.9300e-01]])

In [46]:
# Define a linear regression model as a custom torch.nn.Module subclass

class LinearRegressionModel(nn.Module):
    '''
    Linear regression implemented as a torch.nn.Module.

    Weights and bias are drawn from a standard normal distribution when the
    model is constructed and are fitted by ``trainModel`` using full-batch
    gradient steps and an L1 (mean absolute error) loss.

    Args:
        optimizer: Name of the optimizer used by ``trainModel``;
            one of ``'SGD'`` or ``'Adam'``.
        n_features: Number of input feature columns. Defaults to 13, the
            value that was previously hard-coded.
    '''
    # Optimizer names accepted by trainModel, mapped to their torch classes.
    _OPTIMIZERS = {
        'SGD': torch.optim.SGD,
        'Adam': torch.optim.Adam,
    }

    def __init__(self, optimizer, n_features: int = 13):
        super().__init__()
        self.optimizer = optimizer

        # Loss histories; created here so evaluate() also works before
        # trainModel() has been called. trainModel() resets them.
        self.train_loss_values = []
        self.test_loss_values = []

        # Initialize Weights and Bias
        self.weights = nn.Parameter(
            torch.randn(1, n_features, dtype=torch.float),
            requires_grad=True)

        # One bias term per feature. forward() sums over the feature axis, so
        # these act together as a single intercept equal to their sum.
        self.bias = nn.Parameter(
            torch.randn(1, n_features, dtype=torch.float),
            requires_grad=True
            )

    # Goal is to optimize the weights using the optimizer (backpropagation)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        '''
        Computes one prediction per row of ``x``.

        Args:
            x: Tensor whose last dimension matches the number of features.

        Returns:
            Tensor with the feature dimension (dim 1) summed out.
        '''
        return (self.weights * x + self.bias).sum(dim=1)

    # Create trainModel Method to perform backpropagation
    # and weight adjustment for optimization
    def trainModel(
            self,
            epochs: int,
            X_train: torch.Tensor,
            X_test: torch.Tensor,
            y_train: torch.Tensor,
            y_test: torch.Tensor,
            lr: float
            ):
        '''
        Trains linear model using pytorch.
        Evaluates the model against test set for every epoch.

        Args:
            epochs: Number of full-batch optimisation steps.
            X_train: Training features.
            X_test: Test features.
            y_train: Training targets.
            y_test: Test targets.
            lr: Learning rate passed to the optimizer.

        Raises:
            ValueError: If the optimizer name given at construction is not
                one of the supported names.
        '''
        # Fail fast with a clear message; previously an unknown name left
        # `optimizer` unbound and surfaced as UnboundLocalError inside the loop.
        if self.optimizer not in self._OPTIMIZERS:
            raise ValueError(
                f"Unsupported optimizer {self.optimizer!r}; "
                f"expected one of {sorted(self._OPTIMIZERS)}"
            )

        # NOTE: the parameters were already sampled in __init__, so this seed
        # does not make the initial weights reproducible; it only resets the
        # global RNG for whatever is sampled after this call.
        torch.manual_seed(42)
        # Create empty loss lists to track values
        self.train_loss_values = []
        self.test_loss_values = []

        loss_fn = nn.L1Loss()

        optimizer = self._OPTIMIZERS[self.optimizer](
            params=self.parameters(),
            lr=lr
            )

        for epoch in range(epochs):
            self.train()
            y_pred = self(X_train)
            loss = loss_fn(y_pred, y_train)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Set the model in evaluation mode
            self.eval()
            with torch.inference_mode():
                self.evaluate(X_test, y_test, epoch, loss_fn, loss)

    def evaluate(self, X_test, y_test, epoch_nb, loss_fn, train_loss):
        '''
        Evaluates current epoch performance on the test set.

        Every 10th epoch the train and test losses are appended to
        ``train_loss_values`` / ``test_loss_values`` and printed.

        Args:
            X_test: Test features.
            y_test: Test targets.
            epoch_nb: Index of the current epoch.
            loss_fn: Loss callable taking (prediction, target).
            train_loss: Training loss tensor for this epoch, computed before
                the optimizer step.
        '''
        test_pred = self(X_test)
        test_loss = loss_fn(test_pred, y_test.type(torch.float))
        if epoch_nb % 10 == 0:
            # .cpu() is a no-op for CPU tensors and keeps .numpy() valid
            # when the tensors live on another device.
            self.train_loss_values.append(train_loss.detach().cpu().numpy())
            self.test_loss_values.append(test_loss.detach().cpu().numpy())
            print(f"Epoch: {epoch_nb} - MAE Train Loss: {train_loss} - MAE Test Loss: {test_loss} ")

In [50]:
# Fit the model with the Adam optimizer: 500 epochs at a learning rate of 0.001
adam_model = LinearRegressionModel(optimizer='Adam')
adam_model.trainModel(
    epochs=500,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    lr=0.001,
)

Epoch: 0 - MAE Train Loss: 73396.3671875 - MAE Test Loss: 73653.859375 
Epoch: 10 - MAE Train Loss: 71215.578125 - MAE Test Loss: 71458.890625 
Epoch: 20 - MAE Train Loss: 69034.78125 - MAE Test Loss: 69263.9375 
Epoch: 30 - MAE Train Loss: 66854.0 - MAE Test Loss: 67068.9765625 
Epoch: 40 - MAE Train Loss: 64673.19921875 - MAE Test Loss: 64874.0078125 
Epoch: 50 - MAE Train Loss: 62492.40234375 - MAE Test Loss: 62679.0546875 
Epoch: 60 - MAE Train Loss: 60311.6171875 - MAE Test Loss: 60484.08984375 
Epoch: 70 - MAE Train Loss: 58130.82421875 - MAE Test Loss: 58289.1328125 
Epoch: 80 - MAE Train Loss: 55950.03515625 - MAE Test Loss: 56094.1640625 
Epoch: 90 - MAE Train Loss: 53769.234375 - MAE Test Loss: 53899.1875 
Epoch: 100 - MAE Train Loss: 51588.40625 - MAE Test Loss: 51704.19921875 
Epoch: 110 - MAE Train Loss: 49407.58984375 - MAE Test Loss: 49509.19921875 
Epoch: 120 - MAE Train Loss: 47226.75390625 - MAE Test Loss: 47314.203125 
Epoch: 130 - MAE Train Loss: 45045.9296875 - MAE

In [49]:
# Repeat the run with the Stochastic Gradient Descent (SGD) optimizer at the same 0.001 learning rate.
# Notice that the MAE does not improve: SGD is very sensitive to feature scale,
# so scaling the features first should help it.
sgd_model = LinearRegressionModel(optimizer='SGD')
sgd_model.trainModel(
    epochs=500,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    lr=0.001,
)

Epoch: 0 - MAE Train Loss: 73396.3671875 - MAE Test Loss: 47727616.0 
Epoch: 10 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 20 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 30 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 40 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 50 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 60 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 70 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 80 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 90 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 100 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 110 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 120 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 130 - MAE Train Loss: 73396.640625 - MAE Test Loss: 47727616.0 
Epoch: 140 - MAE