Load Data

In [1]:
import pandas as pd
df = pd.read_csv('Spotify-2000.csv')
print(df.head)

<bound method NDFrame.head of       Index                   Title                    Artist  \
0         1                 Sunrise               Norah Jones   
1         2             Black Night               Deep Purple   
2         3          Clint Eastwood                  Gorillaz   
3         4           The Pretender              Foo Fighters   
4         5  Waitin' On A Sunny Day         Bruce Springsteen   
...     ...                     ...                       ...   
1989   1990        Heartbreak Hotel             Elvis Presley   
1990   1991               Hound Dog             Elvis Presley   
1991   1992         Johnny B. Goode               Chuck Berry   
1992   1993               Take Five  The Dave Brubeck Quartet   
1993   1994          Blueberry Hill               Fats Domino   

                Top Genre  Year  Beats Per Minute (BPM)  Energy  Danceability  \
0         adult standards  2004                     157      30            53   
1              album rock  

Inspect Data

In [2]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Identify numerical features
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
print("Numerical features in the dataset:")
print(numerical_features)

Missing values in each column:
Index                     0
Title                     0
Artist                    0
Top Genre                 0
Year                      0
Beats Per Minute (BPM)    0
Energy                    0
Danceability              0
Loudness (dB)             0
Liveness                  0
Valence                   0
Length (Duration)         0
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64
Numerical features in the dataset:
Index(['Index', 'Year', 'Beats Per Minute (BPM)', 'Energy', 'Danceability',
       'Loudness (dB)', 'Liveness', 'Valence', 'Acousticness', 'Speechiness',
       'Popularity'],
      dtype='object')


Preprocess Data

In [3]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Identify numerical features
numerical_features = ['Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)',
                      'Liveness', 'Valence', 'Acousticness', 'Speechiness', 'Popularity']

# Normalize numerical features
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Identify and encode categorical features
categorical_features = ['Artist', 'Top Genre', 'Year']
label_encoder = LabelEncoder()
for feature in categorical_features:
    df[feature + '_encoded'] = label_encoder.fit_transform(df[feature])

# Display the processed DataFrame
print("Processed DataFrame:")
print(df.head())

Processed DataFrame:
   Index                   Title             Artist            Top Genre  \
0      1                 Sunrise        Norah Jones      adult standards   
1      2             Black Night        Deep Purple           album rock   
2      3          Clint Eastwood           Gorillaz  alternative hip hop   
3      4           The Pretender       Foo Fighters    alternative metal   
4      5  Waitin' On A Sunny Day  Bruce Springsteen         classic rock   

   Year  Beats Per Minute (BPM)    Energy  Danceability  Loudness (dB)  \
0  2004                0.710059  0.278351      0.500000           0.52   
1  2000                0.579882  0.783505      0.465116           0.64   
2  2001                0.775148  0.680412      0.651163           0.72   
3  2007                0.804734  0.958763      0.383721           0.92   
4  2002                0.408284  0.814433      0.558140           0.88   

   Liveness   Valence Length (Duration)  Acousticness  Speechiness  \
0  0.09

Recommender Function

In [4]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

# Additional feature integration (if available)
# For example, lyrics or release date
# Assuming additional features have been added to the dataframe

# Create a feature matrix for content-based filtering
content_features = ['Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)',
                    'Liveness', 'Valence', 'Acousticness', 'Speechiness', 'Popularity',
                    'Artist_encoded', 'Top Genre_encoded', 'Year_encoded']
content_feature_matrix = df[content_features]

# Compute similarity matrix using different measures
content_similarity_matrix_cosine = cosine_similarity(content_feature_matrix)
content_similarity_matrix_euclidean = 1 / (1 + euclidean_distances(content_feature_matrix))  # Inverted to make it a similarity measure

# Combine similarity measures (weighted average or other methods)
combined_similarity_matrix = (content_similarity_matrix_cosine + content_similarity_matrix_euclidean) / 2

# Recommend similar songs
def recommend_songs(song_title, similarity_matrix, df, n_recommendations=5, similarity_threshold=0.5):
    try:
        song_index = df[df['Title'] == song_title].index[0]  # Ensure 'Title' is the correct column name
        similarity_scores = list(enumerate(similarity_matrix[song_index]))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        recommended_indices = [i[0] for i in similarity_scores[1:] if i[1] >= similarity_threshold][:n_recommendations]
        return df['Title'].iloc[recommended_indices]
    except IndexError:
        return "Song title not found in the dataset."

# Example recommendation using a valid song title
valid_song_title = "Empire State Of Mind"  # Replace with an actual song title from the dataset
recommendations = recommend_songs(valid_song_title, combined_similarity_matrix, df)
print("Recommendations for '{}':".format(valid_song_title))
print(recommendations)


Recommendations for 'Empire State Of Mind':
465           Ni**as In Paris
106             Numb / Encore
122    Voltooid Verleden Tijd
244             So Incredible
194           Tumble and Fall
Name: Title, dtype: object


Evaluation of Model

In [5]:
from sklearn.metrics import precision_score, recall_score, mean_absolute_error

# Simulate true and predicted recommendations
# In a real scenario, these should come from a ground truth dataset

# Example true recommendations (you should replace this with actual data)
true_recommendations = {
    "Empire State Of Mind": ["Ni**as In Paris", "Numb / Encore", "Love You Like I Love Myself", "Still Believe", "Annabel"],
    # Add more entries for other songs
}

# Generate predicted recommendations using the model
def get_predictions(true_recommendations, model, df):
    predictions = {}
    for song in true_recommendations:
        predicted = model(song, combined_similarity_matrix, df, n_recommendations=5)
        predictions[song] = list(predicted)
    return predictions

predicted_recommendations = get_predictions(true_recommendations, recommend_songs, df)

# Flatten lists for evaluation
true_labels = []
predicted_labels = []

for song in true_recommendations:
    true_labels.extend([1 if rec in true_recommendations[song] else 0 for rec in df['Title']])
    predicted_labels.extend([1 if rec in predicted_recommendations[song] else 0 for rec in df['Title']])

# Compute Precision, Recall, and MAE
precision = precision_score(true_labels, predicted_labels, average='micro')
recall = recall_score(true_labels, predicted_labels, average='micro')
mae = mean_absolute_error(true_labels, predicted_labels)

print(f'Precision: {precision}, Recall: {recall}, MAE: {mae}')


Precision: 0.9969909729187563, Recall: 0.9969909729187563, MAE: 0.003009027081243731


Save Files

In [7]:
# Save the similarity matrices and DataFrame
np.save('/content/combined_similarity_matrix.npy', combined_similarity_matrix)
df.to_csv('/content/processed_dataset.csv', index=False)