In [3]:
import pandas as pd
import os

In [4]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

In [5]:
# Relative directory that holds the exported CSV files.
data_dir_base = "../data/output"
# The file names keep their leading slash because they are appended to
# data_dir_base by plain string concatenation.
song_feats = "/audio_features.csv"
track_info = "/tracks.csv"

In [6]:
# Read both tables from data_dir_base; the file-name constants already start
# with '/', so directory and file name are simply placed side by side.
audio_feats = pd.read_csv(f"{data_dir_base}{song_feats}")
tracks = pd.read_csv(f"{data_dir_base}{track_info}")

In [35]:
# Keep only the first row seen for each distinct track_name, then renumber the rows from 0.
# NOTE(review): de-duplicating on the name alone also drops different songs that
# happen to share a title -- confirm this is intended.
tracks = tracks.drop_duplicates(subset=['track_name'], keep='first').reset_index(drop=True)

In [38]:
# Keep only the audio-feature rows whose af_track_id appears in tracks.track_id,
# then renumber the rows from 0.
has_known_track = audio_feats['af_track_id'].isin(tracks['track_id'])
audio_feats = audio_feats.loc[has_known_track].reset_index(drop=True)

In [45]:
# Show the filtered audio-feature table as this cell's output.
audio_feats


Unnamed: 0,af_track_id,af_danceability,af_energy,af_loudness,af_mode,af_acousticness,af_instrumentalness,af_liveness,af_valence,af_tempo,af_key,af_speechiness,af_time_signature,af_duration_ms
0,6f807x0ima9a1j3VPbc7VN,0.748,0.916,-2.634,1,0.102000,0.000000,0.0653,0.5180,122.036,6,0.0583,4,194754
1,0r7CVbZTWZgbTCYdfa2P31,0.726,0.815,-4.969,1,0.072400,0.004210,0.3570,0.6930,99.972,11,0.0373,4,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,0.675,0.931,-3.432,0,0.079400,0.000023,0.1100,0.6130,124.008,1,0.0742,4,176616
3,75FpbthrwQmzHlBJLuGdC7,0.718,0.930,-3.777,1,0.028700,0.000008,0.2040,0.2800,121.956,7,0.1010,4,169093
4,1e8PAfcKUYoKkxPhrHqw4x,0.650,0.833,-4.672,1,0.080300,0.000000,0.0833,0.7250,123.976,1,0.0359,4,189052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23475,7bxnKAamR3snQ1VGLuVfC1,0.429,0.922,-1.814,1,0.076600,0.000000,0.0668,0.2100,128.168,2,0.0935,4,204375
23476,5Aevni09Em4575077nkWHz,0.522,0.786,-4.462,1,0.001710,0.004270,0.3750,0.4000,128.041,0,0.0420,4,353120
23477,7ImMqPP3Q1yfUHvsdn7wEo,0.529,0.821,-4.899,0,0.108000,0.000001,0.1500,0.4360,127.989,6,0.0481,4,210112
23478,2m69mhnfQ1Oq6lGtXuYhgX,0.626,0.888,-3.361,1,0.007920,0.127000,0.3430,0.3080,128.008,2,0.1090,4,367432


In [41]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(features):
    """Return the pairwise cosine similarity between the rows of ``features``.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity`` called
    with ``features`` as its only argument.
    """
    similarity_matrix = cosine_similarity(features)
    return similarity_matrix

def calculate_correlation_similarity(features):
    """Return the Pearson correlation matrix of the columns of ``features``.

    The result comes from ``features.corr``, so it is indexed by column name
    on both axes (column-by-column, not row-by-row).
    """
    correlation_matrix = features.corr(method='pearson')
    return correlation_matrix


In [46]:
from tqdm import tqdm 
def get_top_recommendations(df, top_n=10):
    """Return, for every track in ``df``, the ids of its most cosine-similar tracks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per track. Must contain ``af_track_id`` and ``af_duration_ms``;
        both are excluded from the feature matrix, as is any ``Unnamed: *``
        column (the usual leftover of a CSV round-trip that saved the index).
        Every remaining column is used as a feature.
    top_n : int, default 10
        Maximum number of recommendations per track.

    Returns
    -------
    dict
        Maps each ``af_track_id`` to a list of at most ``top_n`` other track
        ids, ordered from most to least similar. If ``af_track_id`` values
        repeat, the last occurrence wins.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    import numpy as np

    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if len(df) == 0:
        # Nothing to compare; the similarity helper cannot handle zero rows.
        return {}

    features = df.drop(['af_track_id', 'af_duration_ms'], axis=1)
    # A saved row index is not an audio feature and, being large, would
    # dominate the cosine similarity if it were left in.
    leftover_index_cols = [col for col in features.columns if str(col).startswith('Unnamed')]
    features = features.drop(columns=leftover_index_cols)
    # NOTE(review): features are used unscaled, so large-magnitude columns
    # (e.g. af_tempo) weigh far more than the 0-1 ones -- confirm this is intended.

    # Work on the raw matrix: wrapping an N x N array in a DataFrame only adds
    # memory and makes row access depend on the index labels of ``df``.
    similarity = np.asarray(calculate_cosine_similarity(features))

    track_ids = df['af_track_id'].tolist()
    recommendations = {}

    # Iterate by position so the result is correct for any index on ``df``.
    for position, track_id in enumerate(tqdm(track_ids)):
        # Descending order of similarity to the current track.
        ranked = similarity[position].argsort()[::-1]
        # Remove the track itself explicitly: with ties or duplicate feature
        # vectors it is not guaranteed to sit at rank 0.
        ranked = ranked[ranked != position][:top_n]
        recommendations[track_id] = [track_ids[i] for i in ranked]

    return recommendations


In [47]:
import warnings

# Install a global 'ignore' filter so no warnings are shown during the long run.
warnings.filterwarnings('ignore')

# Build the track -> recommended-tracks mapping for the whole audio-feature table.
all_recommendations = get_top_recommendations(audio_feats)

# One row per track: its id in the first column, its list of recommended ids in the second.
recommendation_rows = list(all_recommendations.items())
recommendations_df = pd.DataFrame(recommendation_rows, columns=['Track ID', 'Recommended Track IDs'])

# Preview the first rows as plain text.
print(recommendations_df.head())




  0%|                                                                                   | 30/23480 [00:00<02:33, 152.77it/s]

0


  4%|███▌                                                                             | 1039/23480 [00:04<01:18, 287.11it/s]

1000


  9%|██████▉                                                                          | 2019/23480 [00:07<01:28, 241.38it/s]

2000


 13%|██████████▍                                                                      | 3036/23480 [00:11<01:12, 281.56it/s]

3000


 17%|█████████████▉                                                                   | 4032/23480 [00:15<01:14, 261.28it/s]

4000


 21%|█████████████████▎                                                               | 5036/23480 [00:19<01:10, 262.22it/s]

5000


 26%|████████████████████▊                                                            | 6041/23480 [00:23<01:17, 224.52it/s]

6000


 30%|████████████████████████▎                                                        | 7038/23480 [00:27<00:59, 278.11it/s]

7000


 34%|███████████████████████████▊                                                     | 8050/23480 [00:32<01:22, 186.21it/s]

8000


 38%|███████████████████████████████▏                                                 | 9031/23480 [00:36<00:52, 274.85it/s]

9000


 43%|██████████████████████████████████▏                                             | 10030/23480 [00:41<00:49, 269.60it/s]

10000


 47%|█████████████████████████████████████▌                                          | 11027/23480 [00:44<00:57, 216.89it/s]

11000


 51%|████████████████████████████████████████▉                                       | 12029/23480 [00:50<00:41, 279.23it/s]

12000


 56%|████████████████████████████████████████████▍                                   | 13039/23480 [00:54<00:40, 259.73it/s]

13000


 60%|███████████████████████████████████████████████▉                                | 14053/23480 [00:58<00:36, 254.87it/s]

14000


 64%|███████████████████████████████████████████████████▏                            | 15030/23480 [01:01<00:34, 245.55it/s]

15000


 68%|██████████████████████████████████████████████████████▌                         | 16032/23480 [01:05<00:27, 268.94it/s]

16000


 73%|██████████████████████████████████████████████████████████                      | 17052/23480 [01:10<00:23, 268.64it/s]

17000


 77%|█████████████████████████████████████████████████████████████▍                  | 18035/23480 [01:15<00:22, 244.92it/s]

18000


 81%|████████████████████████████████████████████████████████████████▉               | 19053/23480 [01:18<00:16, 268.97it/s]

19000


 85%|████████████████████████████████████████████████████████████████████▏           | 20028/23480 [01:22<00:14, 243.49it/s]

20000


 90%|███████████████████████████████████████████████████████████████████████▋        | 21046/23480 [01:26<00:09, 269.82it/s]

21000


 94%|███████████████████████████████████████████████████████████████████████████     | 22029/23480 [01:30<00:05, 266.60it/s]

22000


 98%|██████████████████████████████████████████████████████████████████████████████▍ | 23032/23480 [01:34<00:01, 287.48it/s]

23000


100%|████████████████████████████████████████████████████████████████████████████████| 23480/23480 [01:35<00:00, 245.56it/s]


                 Track ID                              Recommended Track IDs
0  6f807x0ima9a1j3VPbc7VN  [7uYgVgRSgqluZclLS7wpp4, 72LFgoXC3a1sOKZM7jUeT...
1  0r7CVbZTWZgbTCYdfa2P31  [660zp5cgJrTXahkfq0zYMQ, 4fxOCTCTQtsrbK6y1K9JH...
2  1z1Hg7Vb0AhHDiEmnDE79l  [1GfsNRRMT5Dg4DjEYpRa1h, 0GuLonchqLiGK1zzcrp2y...
3  75FpbthrwQmzHlBJLuGdC7  [3BT5oU2l8UDLFgyOTUdKe4, 2jcYSJGmvhpYn3D6IVzS0...
4  1e8PAfcKUYoKkxPhrHqw4x  [6AKRht2NaMsg7z13psKjHk, 0385HqYimoTQaHZP17KwG...


In [57]:
# Spread each track's recommendation list over its own columns (one column per
# rank) and place those columns next to the 'Track ID' column.
track_id_only = recommendations_df.drop(columns='Recommended Track IDs')
expanded_recommendations = pd.DataFrame(recommendations_df['Recommended Track IDs'].tolist())
recommendations_df_final = pd.concat([track_id_only, expanded_recommendations], axis=1)

In [64]:
# Name the expanded columns from the frame's actual width: the first column is
# the track id and every following one is a recommendation rank. Deriving the
# names (instead of listing exactly ten) keeps this cell working when the number
# of recommendations per track is not 10.
n_recommendation_cols = recommendations_df_final.shape[1] - 1
recommendations_df_final.columns = ['Track ID'] + [
    f'recommended_track_{rank}' for rank in range(1, n_recommendation_cols + 1)
]

In [66]:
# Write the wide recommendation table to recommendations.csv in the working
# directory. The row index is written as well (to_csv default), so reading the
# file back without index_col yields an extra unnamed first column.
recommendations_df_final.to_csv('recommendations.csv')