In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import matplotlib.pyplot as plt

In [2]:
track_path = 'data/tracks.csv'
playlist_path = 'data/playlists.csv'

## Data Pre-Processing

* Read `track.csv` file
* drop unused columns
* drop duplicate tracks based on `Track_ID` column

In [3]:
tracks = pd.read_csv(track_path)
tracks = tracks.drop(['Unnamed: 0'], axis=1)
tracks = tracks.drop_duplicates(subset=['Track_ID'])

* Convert `Release Date` from string to datetime data type
* Convert `Explicit` from boolean to integer data type

In [5]:
tracks['Release Date'] = pd.to_datetime(tracks['Release Date'], format='%Y-%m-%d', errors='coerce')
tracks['Explicit'] = tracks['Explicit'].astype(int)

In [6]:
tracks.head()

Unnamed: 0,Track_ID,Popularity,Release Date,Explicit,External URLs,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo
0,44uuZDQFAtfag94mDPIsEu,0,2017-06-14,1,https://open.spotify.com/track/44uuZDQFAtfag94...,0.571,0.578,8,-9.696,0,0.27,0.254,2e-06,0.113,0.338,133.731
1,0Vwfd6fxFrL3kCnZSJ9vid,0,2013-01-01,1,https://open.spotify.com/track/0Vwfd6fxFrL3kCn...,0.74,0.716,6,-5.796,0,0.0609,0.207,4e-06,0.0926,0.804,117.977
2,2Ze0YvSXz8CnC81hw5rXNo,17,2015-10-16,0,https://open.spotify.com/track/2Ze0YvSXz8CnC81...,0.61,0.766,2,-5.663,1,0.0267,0.0233,0.0148,0.107,0.582,107.483
3,34hMOtKwf5nm8tjvkGV0Dk,46,2016-05-06,0,https://open.spotify.com/track/34hMOtKwf5nm8tj...,0.706,0.324,6,-14.048,0,0.0305,0.167,0.104,0.116,0.114,119.992
4,6EpRaXYhGOB3fj4V2uDkMJ,0,2017-05-18,0,https://open.spotify.com/track/6EpRaXYhGOB3fj4...,0.869,0.485,6,-5.595,1,0.0545,0.246,0.0,0.0765,0.527,106.028


* Read `playlists.csv` file
* drop unused columns
* drop duplicate tracks in a playlists based on `pid` and `track_id` column

In [4]:
playlists = pd.read_csv(playlist_path)
playlists = playlists.drop(['Unnamed: 0', 'track_uri', 'album_uri', 'artist_uri'], axis=1)
playlists = playlists.drop_duplicates(subset=['pid', 'track_id'])

In [7]:
playlists.head()

Unnamed: 0,pid,name,artist_name,track_name,duration_ms,album_name,track_id
0,549003,August,C-Trox,Passionate,206001,Passionate,44uuZDQFAtfag94mDPIsEu
1,549003,August,Watsky,Sloppy Seconds,245933,Cardboard Castles,0Vwfd6fxFrL3kCnZSJ9vid
2,549003,August,Maritime,Roaming Empire,234960,Magnetic Bodies/Maps of Bones,2Ze0YvSXz8CnC81hw5rXNo
3,549003,August,RY X,Howling,309590,Dawn,34hMOtKwf5nm8tjvkGV0Dk
4,549003,August,Liam Payne,Strip That Down,204502,Strip That Down,6EpRaXYhGOB3fj4V2uDkMJ


## Data Modeling

Splits each playlist into train and test tracks. This simulates a scenario where we know some subset of a user's playlist tracks (train) and want to predict the missing ones (test).

In [9]:
def train_test_split_playlists(playlists, test_ratio=0.2, random_state=42):
    """
    Splits each playlist into train and test tracks. This simulates a scenario
    where we know some subset of a user's playlist tracks (train) and want to 
    predict the missing ones (test).
    """
    np.random.seed(random_state)
    train_data = []
    test_data = []
    for pid, group in playlists.groupby('pid'):
        track_list = group['track_id'].tolist()
        np.random.shuffle(track_list)
        split_index = int(len(track_list)*(1-test_ratio))
        train_tracks = track_list[:split_index]
        test_tracks = track_list[split_index:]
        
        for t in train_tracks:
            train_data.append({'pid': pid, 'track_id': t})
        for t in test_tracks:
            test_data.append({'pid': pid, 'track_id': t})
            
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)
    return train_df, test_df

train_df, test_df = train_test_split_playlists(playlists, test_ratio=0.2)


In [10]:
numeric_features = ['Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 
                    'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Explicit']

In [11]:
train_track_ids = train_df['track_id'].unique()
train_tracks_df = tracks[tracks['Track_ID'].isin(train_track_ids)]

In [12]:
scaler = MinMaxScaler()
train_tracks_df.loc[:, numeric_features] = scaler.fit_transform(train_tracks_df[numeric_features].astype(float))


  train_tracks_df.loc[:, numeric_features] = scaler.fit_transform(train_tracks_df[numeric_features].astype(float))


In [13]:
all_scaled_values = scaler.transform(tracks[numeric_features])
tracks[numeric_features] = all_scaled_values

In [14]:
def get_user_playlist_tracks(playlist_df, pid):
    """
    Given a specific playlist id (pid), return that user's training tracks.
    """
    user_playlist = playlist_df[playlist_df['pid'] == pid]
    user_track_ids = user_playlist['track_id'].unique()
    return user_playlist, user_track_ids

In [15]:
def build_user_profile(tracks_df, user_track_ids, numeric_features):
    """
    Build a user profile from the user's known (training) tracks by averaging the feature vectors.
    """
    user_tracks = tracks_df[tracks_df['Track_ID'].isin(user_track_ids)]
    if user_tracks.empty:
        return None
    user_profile = user_tracks[numeric_features].mean(axis=0).values.reshape(1, -1)
    return user_profile


In [16]:
def compute_content_scores(tracks_df, user_profile, user_track_ids, numeric_features):
    """
    Compute content-based similarity scores for candidate tracks 
    (tracks not in the user's training set).
    """
    candidate_tracks = tracks_df[~tracks_df['Track_ID'].isin(user_track_ids)].copy()
    if candidate_tracks.empty or user_profile is None:
        return pd.DataFrame(columns=['Track_ID', 'content_score'])
    
    candidate_features = candidate_tracks[numeric_features].values
    similarities = cosine_similarity(candidate_features, user_profile)
    candidate_tracks['content_score'] = similarities[:, 0]
    return candidate_tracks[['Track_ID', 'content_score']]

In [17]:
def build_cooccurrence_df(playlist_df):
    """
    Build a co-occurrence dataframe from the playlist data (here, only train_df).
    This prevents the model from "seeing" test co-occurrences.
    """
    grouped = playlist_df.groupby('pid')['track_id'].apply(list)
    records = []
    for track_list in grouped:
        for i in range(len(track_list)):
            for j in range(i+1, len(track_list)):
                t1, t2 = track_list[i], track_list[j]
                records.append((t1, t2, 1))
                records.append((t2, t1, 1))
                
    cooccurrence_df = pd.DataFrame(records, columns=['track_id_1', 'track_id_2', 'count'])
    cooccurrence_df = cooccurrence_df.groupby(['track_id_1', 'track_id_2'], as_index=False)['count'].sum()
    return cooccurrence_df

In [18]:
cooccurrence = build_cooccurrence_df(train_df)

In [19]:
def compute_collaborative_scores_df(user_track_ids, cooccurrence_df, all_track_ids):
    """
    Compute collaborative filtering scores (from co-occurrence) for candidate tracks.
    Uses only training co-occurrences.
    """
    user_cooccurrences = cooccurrence_df[cooccurrence_df['track_id_1'].isin(user_track_ids)]
    user_cooccurrences = user_cooccurrences[~user_cooccurrences['track_id_2'].isin(user_track_ids)]
    
    collab_df = user_cooccurrences.groupby('track_id_2', as_index=False)['count'].sum()
    collab_df.rename(columns={'track_id_2': 'Track_ID', 'count': 'collab_score'}, inplace=True)
    
    if not collab_df.empty:
        collab_df['collab_score'] = collab_df['collab_score'] / collab_df['collab_score'].max()
    else:
        collab_df['collab_score'] = 0
    
    collab_df = collab_df[collab_df['Track_ID'].isin(all_track_ids)]
    return collab_df

In [None]:
def hybrid_recommendations(tracks_df, user_profile, user_track_ids, numeric_features, cooccurrence_df, top_n, alpha=0.5):
    """
    Generate hybrid recommendations (content + collaborative).
    """
    content_scores = compute_content_scores(tracks_df, user_profile, user_track_ids, numeric_features)
    all_track_ids = set(tracks_df['Track_ID'].unique())
    collab_scores = compute_collaborative_scores_df(user_track_ids, cooccurrence_df, all_track_ids)
    
    combined = pd.merge(content_scores, collab_scores, on='Track_ID', how='outer').fillna(0)
    combined['hybrid_score'] = alpha * combined['content_score'] + (1 - alpha) * combined['collab_score']
    combined = combined.sort_values('hybrid_score', ascending=False)
    return combined.head(top_n)['Track_ID'].tolist()

In [None]:
def evaluate_model(tracks_df, train_df, test_df, numeric_features, cooccurrence, alpha=0.5):
    """
    Evaluate the model by comparing the recommended tracks to the test tracks.
    We only use co-occurrence and user profiles built from train_df.
    """
    test_dict = test_df.groupby('pid')['track_id'].apply(set).to_dict()
    train_dict = train_df.groupby('pid')['track_id'].apply(set).to_dict()
    
    precisions, recalls = [], []
    results = []
    for pid in train_df['pid'].unique():
        user_train_tracks = train_dict.get(pid, set())
        user_test_tracks = test_dict.get(pid, set())
        
        if len(user_test_tracks) == 0:
            continue
        total_playlist_size = len(user_train_tracks) + len(user_test_tracks)
        
        # Build user profile from train tracks only
        user_profile = build_user_profile(tracks_df, list(user_train_tracks), numeric_features)
        if user_profile is None:
            continue
        
        recommendations = hybrid_recommendations(
            tracks_df,
            user_profile,
            list(user_train_tracks),
            numeric_features,
            cooccurrence,
            len(user_test_tracks),
            alpha=alpha
        )
        
        recommended_set = set(recommendations)
        hit_count = len(recommended_set.intersection(user_train_tracks.union(user_test_tracks)))
        
        #print(f"Length of test: {len(user_test_tracks)} | Length of recommendations: {len(recommended_set)}")
        #print(f"Hit count: {hit_count}")
        
        precision = hit_count / top_n
        recall = hit_count / len(user_test_tracks)
        
        precisions.append(precision)
        recalls.append(recall)
        
        results.append({
            'pid': pid,
            'hit_rate': recall,
            'playlist_length': total_playlist_size
        })
        
    avg_precision = np.mean(precisions) if precisions else 0
    avg_recall = np.mean(recalls) if recalls else 0
    return pd.DataFrame(results)

In [None]:
alpha = []
mean_hit_rate = []

for param in np.arange(0.1, 0.6, 0.1):
    alpha.append(param)
    print(param)
    result_df = evaluate_model(tracks, train_df, test_df, numeric_features, cooccurrence, alpha=param, top_n=10)
    hit_rate = result_df['hit_rate'].mean()
    mean_hit_rate.append(hit_rate)
    print(mean_hit_rate)

df = pd.DataFrame({
    'alpha': alpha,
    'mean_hit_rate': mean_hit_rate
})

print(df)


0.1
[0.046178869047619045]
0.2


In [51]:
results_df = evaluate_model(tracks, train_df, test_df, numeric_features, cooccurrence, alpha=0.5, top_n=10)


Length of test: 8 | Length of recommendations: 8
Hit count: 0
Length of test: 2 | Length of recommendations: 2
Hit count: 0
Length of test: 5 | Length of recommendations: 5
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 7 | Length of recommendations: 7
Hit count: 1
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 7 | Length of recommendations: 7
Hit count: 0
Length of test: 4 | Length of recommendations: 4
Hit count: 0
Length of test: 8 | Length of recommendations: 8
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 5 | Length of recommendations: 5
Hit count: 1
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length o

ValueError: Bin labels must be one fewer than the number of bin edges

In [56]:
overall_hit_rate = results_df['hit_rate'].mean()
print(f"Overall Hit Rate: {overall_hit_rate}")

Overall Hit Rate: 0.04644464285714285


In [54]:
bin_edges = [0, 5, 10, 15, 20, 25, 30, 35, 40]
bin_labels = ['<5', '5-10', '11-15', '16-20', '21-25', '26-30', '31-35', '36-40']
results_df['length_bin'] = pd.cut(results_df['playlist_length'], bins=bin_edges, labels=bin_labels, right=True)
hit_rate_by_bin = results_df.groupby('length_bin', observed=False)['hit_rate'].mean().reset_index()

bin_counts = results_df['length_bin'].value_counts().reset_index()
bin_counts.columns = ['length_bin', 'playlist_count']

print(hit_rate_by_bin)
print(bin_counts)


  length_bin  hit_rate
0         <5  0.021277
1       5-10  0.029343
2      11-15  0.033014
3      16-20  0.040047
4      21-25  0.051994
5      26-30  0.049817
6      31-35  0.054314
7      36-40  0.069815
  length_bin  playlist_count
0      16-20             643
1      26-30             639
2      21-25             627
3      11-15             626
4      31-35             505
5      36-40             487
6       5-10             426
7         <5              47


In [57]:
content_results_df = evaluate_model(tracks, train_df, test_df, numeric_features, cooccurrence, alpha=1, top_n=10)

Length of test: 8 | Length of recommendations: 8
Hit count: 0
Length of test: 2 | Length of recommendations: 2
Hit count: 0
Length of test: 5 | Length of recommendations: 5
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 7 | Length of recommendations: 7
Hit count: 0
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 7 | Length of recommendations: 7
Hit count: 0
Length of test: 4 | Length of recommendations: 4
Hit count: 0
Length of test: 8 | Length of recommendations: 8
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 5 | Length of recommendations: 5
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length o

In [58]:
overall_content_hit_rate = content_results_df['hit_rate'].mean()
print(f"Overall Hit Rate: {overall_content_hit_rate}")

Overall Hit Rate: 0.0011160714285714285


In [59]:
colab_results_df = evaluate_model(tracks, train_df, test_df, numeric_features, cooccurrence, alpha=0, top_n=10)

Length of test: 8 | Length of recommendations: 8
Hit count: 0
Length of test: 2 | Length of recommendations: 2
Hit count: 0
Length of test: 5 | Length of recommendations: 5
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 7 | Length of recommendations: 7
Hit count: 1
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 7 | Length of recommendations: 7
Hit count: 0
Length of test: 4 | Length of recommendations: 4
Hit count: 0
Length of test: 8 | Length of recommendations: 8
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 6 | Length of recommendations: 6
Hit count: 0
Length of test: 5 | Length of recommendations: 5
Hit count: 1
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length of test: 3 | Length of recommendations: 3
Hit count: 0
Length o

In [61]:
overall_colab_hit_rate = colab_results_df['hit_rate'].mean()
print(f"Overall Hit Rate: {overall_colab_hit_rate}")

Overall Hit Rate: 0.044925297619047615
