# Spotify Recommendation System

### Step 1: Load and Combine Data
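We'll start by loading the Spotify audio-features JSON files for liked songs (`dataset/good.json`) and disliked songs (`dataset/dislike.json`), labelling each row as liked (1) or disliked (0), and combining them into a single DataFrame.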

In [6]:
import pandas as pd
import json

from IPython.display import display


def _load_audio_features(path):
    """Return the non-null audio-feature records stored in a JSON file.

    Parameters
    ----------
    path : str
        Path to a JSON document whose top-level ``'audio_features'`` key
        holds a list of per-track feature dictionaries.

    Returns
    -------
    list of dict
        The feature dictionaries, in file order, with ``null`` entries removed.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    KeyError
        If the document has no ``'audio_features'`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        records = json.load(f)['audio_features']
    # The Spotify "several audio features" endpoint is documented to return
    # null for tracks it cannot resolve; a None in the list would make
    # pd.DataFrame() fail, so such placeholders are dropped here.
    return [record for record in records if record is not None]


# Load liked and disliked songs data through the same code path
liked_data = _load_audio_features('dataset/good.json')
disliked_data = _load_audio_features('dataset/dislike.json')

# Convert to DataFrames and add the target column: liked (1) / disliked (0)
liked_df = pd.DataFrame(liked_data).assign(liked=1)
disliked_df = pd.DataFrame(disliked_data).assign(liked=0)

# Combine the DataFrames; ignore_index gives one contiguous 0..n-1 index
data = pd.concat([liked_df, disliked_df], ignore_index=True)

# Display the combined DataFrame
display(data.head())


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,liked
0,0.749,0.839,6,-4.847,1,0.297,0.0867,0.0,0.204,0.804,172.068,audio_features,55mcupbf7cIsuCEVAuTJVk,spotify:track:55mcupbf7cIsuCEVAuTJVk,https://api.spotify.com/v1/tracks/55mcupbf7cIs...,https://api.spotify.com/v1/audio-analysis/55mc...,111000,4,1
1,0.573,0.581,10,-9.026,0,0.339,0.753,1e-06,0.13,0.351,76.506,audio_features,57RtLWT7IpugV0yi5bsxJk,spotify:track:57RtLWT7IpugV0yi5bsxJk,https://api.spotify.com/v1/tracks/57RtLWT7Ipug...,https://api.spotify.com/v1/audio-analysis/57Rt...,169347,4,1
2,0.8,0.719,7,-6.262,1,0.234,0.109,0.0,0.058,0.815,143.975,audio_features,5VyfAfp2Yt3qaeuvq55ll3,spotify:track:5VyfAfp2Yt3qaeuvq55ll3,https://api.spotify.com/v1/tracks/5VyfAfp2Yt3q...,https://api.spotify.com/v1/audio-analysis/5Vyf...,230854,4,1
3,0.778,0.632,8,-6.415,1,0.125,0.0404,0.0,0.0912,0.827,140.951,audio_features,3eWHY75nDgte70hh5yf4UW,spotify:track:3eWHY75nDgte70hh5yf4UW,https://api.spotify.com/v1/tracks/3eWHY75nDgte...,https://api.spotify.com/v1/audio-analysis/3eWH...,224029,4,1
4,0.797,0.852,8,-5.202,1,0.241,0.0555,2.4e-05,0.0536,0.48,136.035,audio_features,2UwrB6Ge6mPfUV8yGvAfX7,spotify:track:2UwrB6Ge6mPfUV8yGvAfX7,https://api.spotify.com/v1/tracks/2UwrB6Ge6mPf...,https://api.spotify.com/v1/audio-analysis/2Uwr...,102353,4,1


### Step 2: Preprocess Data

In [7]:
from sklearn.preprocessing import StandardScaler

# Remove the string-valued identifier/URL/type columns so only numeric
# audio features and the label remain.
data_numeric = data.drop(columns=['id', 'uri', 'track_href', 'analysis_url', 'type'])

# Report how many nulls each remaining column contains.
missing_values = data_numeric.isnull().sum()
print("Missing values:\n", missing_values)

# Impute any gaps with the column mean (rebinding instead of mutating in place).
data_numeric = data_numeric.fillna(data_numeric.mean())

# Standardise every feature column; the label is excluded from scaling.
# Note: the scaler is fit on the full dataset, i.e. before the train/test
# split performed in the training step.
features = data_numeric.drop(columns=['liked'])
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Rebuild a labelled frame: scaled feature columns plus the original target.
scaled_data = pd.DataFrame(scaled_features, columns=features.columns).assign(
    liked=data_numeric['liked'].values
)

# Show the first preprocessed rows.
display(scaled_data.head())


Missing values:
 danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
liked               0
dtype: int64


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,liked
0,0.51997,0.773118,0.147533,0.712107,0.92582,1.232615,-0.726324,-0.556955,0.525481,1.162394,1.819952,-1.422996,0.193658,1
1,-0.294627,-0.221377,1.32178,0.070007,-1.080123,1.58231,1.356137,-0.556951,-0.174597,-0.534186,-1.591426,-0.612251,0.193658,1
2,0.756018,0.310562,0.441095,0.494694,0.92582,0.708071,-0.656627,-0.556955,-0.855755,1.203591,0.817087,0.242403,0.193658,1
3,0.654193,-0.024791,0.734657,0.471185,0.92582,-0.199472,-0.87103,-0.556955,-0.541666,1.248534,0.709136,0.147568,0.193658,1
4,0.742133,0.823228,0.734657,0.657562,0.92582,0.766354,-0.823836,-0.556884,-0.897381,-0.051054,0.533644,-1.543148,0.193658,1


### Step 3: Train a Model

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Separate the target from the standardised feature matrix.
y = scaled_data['liked']
X = scaled_data.drop(columns=['liked'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (seeded) on the training portion.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print overall accuracy followed by the per-class breakdown.
print(
    f"Accuracy: {accuracy}",
    f"Classification Report:\n{report}",
    sep="\n",
)


Accuracy: 0.9743589743589743
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.94      0.97        18

    accuracy                           0.97        39
   macro avg       0.98      0.97      0.97        39
weighted avg       0.98      0.97      0.97        39



In [10]:
import joblib

# Persist the fitted scaler as well: predictions are made on features
# transformed by this exact scaler, so the classifier file alone is not
# enough to reproduce inference after the kernel is restarted.
joblib.dump(scaler, 'spotify_recommendation_scaler.pkl')

# Save the trained model (kept as the last expression so the cell still
# displays the list of written model files).
joblib.dump(model, 'spotify_recommendation_model.pkl')


['spotify_recommendation_model.pkl']

In [11]:
# Load the saved model. joblib files are pickles: only load files that this
# notebook (or another trusted source) produced.
model = joblib.load('spotify_recommendation_model.pkl')

# Example new data
new_data = [
    {
        "danceability": 0.5,
        "energy": 0.7,
        "key": 5,
        "loudness": -5.0,
        "mode": 1,
        "speechiness": 0.05,
        "acousticness": 0.2,
        "instrumentalness": 0.0,
        "liveness": 0.1,
        "valence": 0.5,
        "tempo": 120.0,
        "duration_ms": 200000,
        "time_signature": 4
    }
]

# Convert new data to DataFrame
new_df = pd.DataFrame(new_data)

# Put the columns in the order the scaler was fitted on, so the result does
# not depend on the key order of the input dictionaries. Falls back to the
# incoming order when the scaler recorded no feature names.
feature_order = list(getattr(scaler, 'feature_names_in_', new_df.columns))
new_df = new_df[feature_order]
scaled_new_data = scaler.transform(new_df)

# The classifier was trained on a DataFrame with named columns; give it one
# again so scikit-learn can validate the names instead of warning that the
# input "does not have valid feature names".
scaled_new_df = pd.DataFrame(scaled_new_data, columns=feature_order, index=new_df.index)

# Make predictions
predictions = model.predict(scaled_new_df)

# Interpret predictions: class 1 means liked, anything else disliked
for i, prediction in enumerate(predictions):
    verdict = "liked" if prediction == 1 else "disliked"
    print(f"Song {i+1} is {verdict}.")


Song 1 is liked.




In [13]:
def predict_song_likes(new_songs, *, model_path='spotify_recommendation_model.pkl',
                       fitted_scaler=None):
    """Predict, for each song, whether it would be liked or disliked.

    Parameters
    ----------
    new_songs : list of dict
        One dictionary of audio features per song. Keys may appear in any
        order; keys that the scaler was not fitted on are ignored.
    model_path : str, optional
        Path of the joblib file containing the trained classifier.
    fitted_scaler : object, optional
        Fitted scaler exposing ``transform`` (and, ideally,
        ``feature_names_in_``). Defaults to the notebook-level ``scaler``
        created in the preprocessing step.

    Returns
    -------
    list of str
        One message per input song, ``"Song <n> is liked."`` or
        ``"Song <n> is disliked."``, numbered from 1 in input order.
        An empty input yields an empty list.

    Raises
    ------
    ValueError
        If a feature the scaler was fitted on is missing from ``new_songs``.
    """
    # Convert new songs to DataFrame
    new_df = pd.DataFrame(new_songs)
    if new_df.empty:
        # Nothing to score; avoid handing an empty frame to the scaler.
        return []

    active_scaler = scaler if fitted_scaler is None else fitted_scaler

    # Align columns with the order used at fit time so the outcome does not
    # depend on dictionary key order; fall back to the incoming order when
    # the scaler recorded no feature names.
    feature_order = list(getattr(active_scaler, 'feature_names_in_', new_df.columns))
    missing = [name for name in feature_order if name not in new_df.columns]
    if missing:
        raise ValueError(f"new_songs is missing required feature(s): {missing}")
    new_df = new_df[feature_order]

    # Load the saved model. joblib files are pickles: only load trusted files.
    model = joblib.load(model_path)

    # Keep the column names on the scaled data: the classifier was trained on
    # a named DataFrame and scikit-learn warns when given a bare array.
    scaled_new_data = pd.DataFrame(
        active_scaler.transform(new_df),
        columns=feature_order,
        index=new_df.index,
    )

    # Make predictions
    predictions = model.predict(scaled_new_data)

    # Interpret predictions: class 1 means liked, anything else disliked
    return [
        f"Song {i+1} is liked." if prediction == 1 else f"Song {i+1} is disliked."
        for i, prediction in enumerate(predictions)
    ]

# Two hand-written example tracks, each described by the audio features
# that predict_song_likes passes to the scaler.
new_songs = [
    dict(
        danceability=0.5,
        energy=0.7,
        key=5,
        loudness=-5.0,
        mode=1,
        speechiness=0.05,
        acousticness=0.2,
        instrumentalness=0.0,
        liveness=0.1,
        valence=0.5,
        tempo=120.0,
        duration_ms=200000,
        time_signature=4,
    ),
    dict(
        danceability=0.8,
        energy=0.9,
        key=10,
        loudness=-3.0,
        mode=1,
        speechiness=0.1,
        acousticness=0.3,
        instrumentalness=0.2,
        liveness=0.3,
        valence=0.7,
        tempo=130.0,
        duration_ms=220000,
        time_signature=4,
    ),
]

# Score the examples and print one message per song.
results = predict_song_likes(new_songs)
for result in results:
    print(result)


Song 1 is liked.
Song 2 is disliked.


