<a href="https://colab.research.google.com/github/vineel-panyala/NewTunes/blob/main/NewTunes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NewTunes



[Data Set 1](https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs)

[Data Set 2](https://www.kaggle.com/datasets/ambaliyagati/spotify-dataset-for-playing-around-with-sql)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

Loading Data Sets

In [None]:
# Read the two Kaggle exports from Google Drive.
# Assumes Drive is already mounted at /content/drive — TODO confirm; no mount call is visible here.
# NOTE(review): the variable names look swapped relative to the file contents. Going by the
# column listing printed in the next cell, "spotify_tracks (1).csv" holds track metadata
# (genre, popularity) while "tracks_features.csv" holds the audio features. The names are
# kept unchanged because later cells reference them.
spotify_tracks_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/NewTunes Data/tracks_features.csv"
)
audio_features_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/NewTunes Data/spotify_tracks (1).csv"
)

In [None]:
# List the columns of each raw frame (metadata frame first, then the audio-feature frame)
# so the shared join key and the overlapping column names are visible before merging.
for raw_frame in (audio_features_df, spotify_tracks_df):
    print(raw_frame.columns)

Index(['id', 'name', 'genre', 'artists', 'album', 'popularity', 'duration_ms',
       'explicit'],
      dtype='object')
Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')


Merging DataFrames and Keeping Relevant Columns

In [None]:
# Columns kept after the join. Both sources carry `name`, `album` and `duration_ms`,
# so pandas suffixes them; the `_x` variants come from the left frame (audio_features_df).
kept_columns = [
    'id', 'name_x', 'album_x', 'popularity', 'duration_ms_x',
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature',
]
# Strip the merge suffix so downstream cells can use the plain column names.
renamed_columns = {'name_x': 'name', 'album_x': 'album', 'duration_ms_x': 'duration_ms'}

# Inner join on the track id: only tracks present in both sources survive.
# Built as one chained expression that returns a new frame; this avoids
# `rename(..., inplace=True)` on a column subset, which can trigger
# SettingWithCopyWarning and makes the cell harder to reason about.
final_df = (
    pd.merge(audio_features_df, spotify_tracks_df, on='id', how='inner', suffixes=('_x', '_y'))
    .loc[:, kept_columns]
    .rename(columns=renamed_columns)
)


# Data Preparation

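To prepare the data for training:
1. Drop the `id`, `name`, and `album` columns (identifiers and free text, not model inputs).
2. Drop rows with missing values.
3. Mean-fill any values that are still missing. Because step 2 already removes every incomplete row, this is only a safeguard and has no effect on the result.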

In [None]:
# Keep only the numeric columns used for modelling: the identifier and free-text
# columns (id, name, album) are dropped, then every row with a missing value is removed.
cleaned_df = final_df.drop(columns=['id', 'name', 'album']).dropna()

# No mean-imputation follows: once dropna() has run there is nothing left to fill,
# so a subsequent fillna(mean) could never change the frame. The invariant is
# asserted instead so a future change to the cleaning step cannot silently break it.
assert not cleaned_df.isna().any().any(), "cleaned_df still contains missing values"

print(cleaned_df.columns)

Index(['popularity', 'duration_ms', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature'],
      dtype='object')


Scale every variable to the same [0, 1] range

In [None]:
# Min-max scale every column, including the popularity target in the first column.
# After this cell `cleaned_df` is rebound from a DataFrame to the scaler's array output,
# which is why the next cell indexes it positionally.
# NOTE(review): the scaler is fitted on the full data set before the train/test split,
# so test-set minima/maxima leak into training. Fixing this means fitting on the training
# rows only, which requires restructuring the later split cell as well.
scaler = MinMaxScaler()
scaler.fit(cleaned_df)
cleaned_df = scaler.transform(cleaned_df)

# PyTorch ML

Convert Data to PyTorch Tensors

In [None]:
# Column 0 of the scaled array is the target (popularity is the first remaining column
# of the cleaned frame); all other columns are the input features.
y = cleaned_df[:, 0]
X = cleaned_df[:, 1:]

X_tensor = torch.tensor(X, dtype=torch.float32)

# Add a trailing dimension so the target is a column vector of shape (n_samples, 1),
# matching the single-output layer of the model.
y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

In [None]:
# Fixed seed so the train/test membership and the shuffling order are reproducible.
# Without it, re-running this cell after training draws a different split and
# silently moves rows the model was trained on into the "held-out" test set.
SPLIT_SEED = 42

gen_dataset = TensorDataset(X_tensor, y_tensor)

# 75 % of the samples for training, the remainder for testing.
train_size = int(0.75 * len(gen_dataset))
test_size = len(gen_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    gen_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader shuffles; it gets its own seeded generator so the
# mini-batch order is repeatable as well. The test loader keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Neural Network Model

In [None]:
class popularity_predictor(nn.Module):
    """Feed-forward regressor with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        output_size: number of values predicted per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in forward order: fc1, then the activation, then fc2.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of feature rows to raw (unbounded) predictions."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed PyTorch's global RNG before building the network so the initial weights,
# and therefore the training run, are reproducible across kernel restarts.
MODEL_SEED = 42
torch.manual_seed(MODEL_SEED)

input_size = X.shape[1]   # one input unit per feature column
hidden_size = 64          # width of the single hidden layer
output_size = 1           # a single regression output per sample

model = popularity_predictor(input_size, hidden_size, output_size)

In [None]:
# Adam updates every trainable parameter of the model with a step size of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Regression objective: mean squared error between prediction and target.
criterion = nn.MSELoss()

In [None]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0   # sum of squared errors accumulated over the epoch
    elements_seen = 0    # number of target values contributing to running_loss

    for features, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # `criterion` (nn.MSELoss with its default reduction) yields the mean over the
        # batch. Re-weight it by the batch's element count so a short final batch
        # does not count as much as a full one in the epoch average.
        batch_elements = labels.numel()
        running_loss += loss.item() * batch_elements
        elements_seen += batch_elements

    # True per-sample MSE for the epoch, not the mean of per-batch means.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/elements_seen}")

Epoch 1/100, Loss: 0.06751731112599373
Epoch 2/100, Loss: 0.05494100116193294
Epoch 3/100, Loss: 0.0485832504928112
Epoch 4/100, Loss: 0.045079075172543524
Epoch 5/100, Loss: 0.04121720604598522
Epoch 6/100, Loss: 0.04082063399255276
Epoch 7/100, Loss: 0.039750476367771626
Epoch 8/100, Loss: 0.03865131679922342
Epoch 9/100, Loss: 0.03808351252228022
Epoch 10/100, Loss: 0.03780564405024052
Epoch 11/100, Loss: 0.03697923719882965
Epoch 12/100, Loss: 0.037028530798852446
Epoch 13/100, Loss: 0.036993841640651226
Epoch 14/100, Loss: 0.0364395834505558
Epoch 15/100, Loss: 0.0358468996360898
Epoch 16/100, Loss: 0.03526377575471997
Epoch 17/100, Loss: 0.03655579388141632
Epoch 18/100, Loss: 0.03620578609406948
Epoch 19/100, Loss: 0.035971056856215
Epoch 20/100, Loss: 0.03546560760587454
Epoch 21/100, Loss: 0.0351419672369957
Epoch 22/100, Loss: 0.03474135212600231
Epoch 23/100, Loss: 0.03493841253221035
Epoch 24/100, Loss: 0.034402723610401156
Epoch 25/100, Loss: 0.03480595014989376
Epoch 26/1

In [None]:
# Mean squared error on the held-out test split.
model.eval()
with torch.no_grad():
    total_loss = 0.0     # sum of squared errors over the whole test split
    elements_seen = 0    # number of target values contributing to total_loss
    for features, labels in test_loader:
        outputs = model(features)
        loss = criterion(outputs, labels)
        # The criterion returns a per-batch mean; weight it by the batch's element
        # count. Averaging the batch means directly would give a tiny final batch
        # the same influence as a full batch and distort the reported loss.
        batch_elements = labels.numel()
        total_loss += loss.item() * batch_elements
        elements_seen += batch_elements

print(f"Test Loss: {total_loss/elements_seen}")

Test Loss: 0.03319378802552819


In [None]:
# Quick sanity check: predictions for the first five rows of the full feature tensor.
# These rows come from the unsplit data, so some may have been seen during training.
model.eval()
sample_data = X_tensor[:5]
with torch.no_grad():
    predictions = model(sample_data)
print(predictions)

tensor([[0.4549],
        [0.6147],
        [0.4927],
        [0.5277],
        [0.5236]])


In [None]:

# Evaluation on the test split: regression metrics plus a thresholded "accuracy".
model.eval()
batch_preds = []
batch_labels = []
with torch.no_grad():
    for features, labels in test_loader:
        batch_preds.append(model(features))
        batch_labels.append(labels)

# Concatenate the per-batch results into single (n_test, 1) tensors.
all_preds = torch.cat(batch_preds)
all_labels = torch.cat(batch_labels)

# Compute the MSE once over all test samples. Averaging per-batch means instead
# over-weights a short final batch and makes the reported MSE disagree with RMSE**2.
total_loss = criterion(all_preds, all_labels).item()
print(f"Test Loss (MSE): {total_loss:.4f}")

# Threshold for turning the regression into a binary "above / below" decision.
# Predictions and labels are both on the min-max-scaled popularity scale, so 0.5
# is the midpoint of that scaled range.
threshold = 0.5
y_pred = (all_preds > threshold).float()   # Binary predictions
y_true = (all_labels > threshold).float()  # Binary actuals

# Share of test samples whose prediction falls on the same side of the threshold
# as the true value.
accuracy = accuracy_score(y_true.numpy(), y_pred.numpy())
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Additional regression metrics. RMSE is derived from the MSE above on a plain
# Python float, rather than applying a NumPy ufunc to a torch tensor.
mae = mean_absolute_error(all_labels.numpy(), all_preds.numpy())
rmse = float(np.sqrt(total_loss))
r2 = r2_score(all_labels.numpy(), all_preds.numpy())

# Print out the results
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-Squared: {r2:.4f}")

Test Loss (MSE): 0.0332
Test Accuracy: 61.76%
Mean Absolute Error (MAE): 0.1565
Root Mean Squared Error (RMSE): 0.1922
R-Squared: 0.2383
