In [1]:
import pickle

import pandas as pd
import numpy as np

In [2]:
# Directory holding the input CSV and pickle files. The trailing slash is
# required because file paths in this notebook are built by plain string
# concatenation (basedir + "name").
basedir = "./pkls/"

## Decide whether each track should be used for recommendations

In [3]:
# Load the per-track popularity metadata; the first CSV column becomes the index.
metas = pd.read_csv(basedir + "metas_csv.csv", index_col=0)

In [4]:
metas.head()

Unnamed: 0,track_id,song_hottt,artist_hottt,artist_fam
0,TRBGPHG12903CE6CC3,0.252446,0.35747,0.469672
1,TRBGPYK128F42796E1,,0.305101,0.38272
2,TRBGPJP128E078ED20,,0.610731,0.872537
3,TRBGPXH128F428C912,,0.28805,0.404823
4,TRBGPSV12903CA9C25,,0.4127,0.595409


In [5]:
# Arbitrary popularity bounds used by recommend():
upperb = 1.0  # don't recommend people who are too popular (Kanye West, Daft Punk, etc have > 1 artist_hottt)
lowerb = 0.5  # don't recommend people who have no following
def recommend(row):
    """Return 1 if the track in ``row`` should be recommended, otherwise 0.

    ``row`` must support lookup by the keys "song_hottt", "artist_hottt" and
    "artist_fam". A track is rejected when any of those values exceeds
    ``upperb``; otherwise it is accepted when at least one of them exceeds
    ``lowerb``. Missing values (NaN) never satisfy either comparison.
    """
    metrics = ("song_hottt", "artist_hottt", "artist_fam")
    # Any single metric above the upper bound disqualifies the track.
    if any(row[name] > upperb for name in metrics):
        return 0
    # Otherwise one metric above the lower bound is enough to qualify.
    return int(any(row[name] > lowerb for name in metrics))

In [6]:
# Score every row with recommend() and attach the result as a "recommend" column.
metas = metas.assign(recommend=metas.apply(recommend, axis=1))

In [7]:
metas.head()

Unnamed: 0,track_id,song_hottt,artist_hottt,artist_fam,recommend
0,TRBGPHG12903CE6CC3,0.252446,0.35747,0.469672,0
1,TRBGPYK128F42796E1,,0.305101,0.38272,0
2,TRBGPJP128E078ED20,,0.610731,0.872537,1
3,TRBGPXH128F428C912,,0.28805,0.404823,0
4,TRBGPSV12903CA9C25,,0.4127,0.595409,1


In [8]:
# Count the tracks that were flagged as not recommendable.
# The print() call form runs on both Python 2 and Python 3; the bare print
# statement is a syntax error on Python 3.
print(len(metas[metas["recommend"] == 0].index))

3273


In [9]:
metas[metas["recommend"] ==0].tail(6)

Unnamed: 0,track_id,song_hottt,artist_hottt,artist_fam,recommend
9979,TRAQUIR128F429B067,,0.313866,0.379735,0
9981,TRAQUVZ128F932B2B7,0.0,0.298435,0.428062,0
9982,TRAQUBY128F42671D0,0.265861,0.349981,0.449689,0
9985,TRAQUZJ128F92F7059,0.0,0.0,0.359678,0
9986,TRAQUCX12903CD43F9,,0.258505,0.360078,0
9994,TRAQQLM128F147CBFE,0.440187,0.331373,0.49267,0


In [10]:
# Keep only the track id and its recommend flag, then report the row count.
recs = metas[["track_id", "recommend"]]
# print() call form works on both Python 2 and Python 3.
print(len(recs.index))

10000


In [11]:
recs.head()

Unnamed: 0,track_id,recommend
0,TRBGPHG12903CE6CC3,0
1,TRBGPYK128F42796E1,0
2,TRBGPJP128E078ED20,1
3,TRBGPXH128F428C912,0
4,TRBGPSV12903CA9C25,1


## Get cluster prediction per track

In [12]:
def unpickle(filename):
    """Load and return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode because pickle data is a byte stream:
    text mode ('r') fails on Python 3 and can corrupt the stream on platforms
    that translate newlines.

    NOTE: pickle.load can execute arbitrary code while loading, so only call
    this on files that come from a trusted source.
    """
    with open(filename, 'rb') as picklefile:
        return pickle.load(picklefile)

In [13]:
# Load the per-track feature table; the first CSV column becomes the index.
tracks = pd.read_csv(basedir + "zscore_df_csv.csv", index_col=0)

In [14]:
tracks.head(2)

Unnamed: 0,track_id,duration,fade_in,tempo,energy,danceability,loudness,mode,key_0,key_1,...,timbre_mode3,timbre_mode4,timbre_mode5,timbre_mode6,timbre_mode7,timbre_mode8,timbre_mode9,timbre_mode10,timbre_mode11,timbre_mode12
0,TRBGPHG12903CE6CC3,-0.668187,-0.319412,-0.194199,0,0,-1.007373,1,0,0,...,1.259628,1.189218,0.437165,0.945188,-0.284978,0.281554,0.185953,-0.112983,-1.104735,0.363289
1,TRBGPYK128F42796E1,0.926188,4.084562,0.881785,0,0,0.092725,1,0,0,...,0.536204,0.386584,0.661103,-0.21227,-1.06644,0.317366,-0.484924,0.358257,0.436883,0.239837


In [15]:
# Load the previously saved clustering model from disk.
# NOTE(review): presumably a fitted 40-cluster KMeans model, going by the file name — confirm.
kmeans = unpickle(basedir + "kmeans40.pkl")

In [16]:
def predict(row):
    """Return the cluster label the loaded ``kmeans`` model assigns to one track.

    ``row`` is a single row of the feature table. Its first entry (the track
    id) is skipped and every remaining value is used as the feature vector.
    """
    features = row.iloc[1:].tolist()
    # Estimators that follow the scikit-learn API expect a 2-D
    # (n_samples, n_features) input, so the single sample is wrapped in a
    # list; a bare 1-D vector is rejected by current scikit-learn releases.
    return kmeans.predict([features])[0]

In [17]:
# Predict a cluster for every track. Any existing "cluster" column is excluded
# first, so re-running this cell does not feed earlier predictions back into
# the model as an extra feature (predict() uses every column after the first).
tracks["cluster"] = tracks[[col for col in tracks.columns if col != "cluster"]].apply(predict, axis=1)

In [18]:
tracks.head(2)

Unnamed: 0,track_id,duration,fade_in,tempo,energy,danceability,loudness,mode,key_0,key_1,...,timbre_mode4,timbre_mode5,timbre_mode6,timbre_mode7,timbre_mode8,timbre_mode9,timbre_mode10,timbre_mode11,timbre_mode12,cluster
0,TRBGPHG12903CE6CC3,-0.668187,-0.319412,-0.194199,0,0,-1.007373,1,0,0,...,1.189218,0.437165,0.945188,-0.284978,0.281554,0.185953,-0.112983,-1.104735,0.363289,23
1,TRBGPYK128F42796E1,0.926188,4.084562,0.881785,0,0,0.092725,1,0,0,...,0.386584,0.661103,-0.21227,-1.06644,0.317366,-0.484924,0.358257,0.436883,0.239837,37


In [19]:
# Keep only the track id and its predicted cluster, then report the row count.
preds = tracks[["track_id", "cluster"]]
# print() call form works on both Python 2 and Python 3.
print(len(preds.index))

10000


In [20]:
preds.head()

Unnamed: 0,track_id,cluster
0,TRBGPHG12903CE6CC3,23
1,TRBGPYK128F42796E1,37
2,TRBGPJP128E078ED20,4
3,TRBGPXH128F428C912,13
4,TRBGPSV12903CA9C25,38


## Combine dfs

In [21]:
# Join the recommend flags with the cluster predictions on track_id,
# keeping track ids that appear on either side.
trackclusterrecs = recs.merge(preds, how='outer', on="track_id")

In [22]:
trackclusterrecs.head()

Unnamed: 0,track_id,recommend,cluster
0,TRBGPHG12903CE6CC3,0,23
1,TRBGPYK128F42796E1,0,37
2,TRBGPJP128E078ED20,1,4
3,TRBGPXH128F428C912,0,13
4,TRBGPSV12903CA9C25,1,38


## Bring in track title and artist names

In [23]:
# Load track titles and artist names; the first CSV column becomes the index.
# NOTE(review): this file holds non-ASCII text (accented titles/artists) and no
# encoding is specified when opening it — confirm the intended encoding.
with open(basedir + "titles_csv.csv", 'r') as tf:
    titles = pd.read_csv(tf, index_col=0)

In [24]:
titles.head()

Unnamed: 0,track_id,title,artist_id,artist,song_id
0,TRBGPHG12903CE6CC3,The Law Gonna Step On You (1931),ARFXRHR1187B98FF09,Bo Carter,SORUUEV12A58A7B9FA
1,TRBGPYK128F42796E1,Des Vôtres,ARQDTOS12086C11443,Fredericks_ Goldman_ Jones,SOEZDOH12A8AE4787A
2,TRBGPJP128E078ED20,Crazy,AR12F2S1187FB56EEF,Aerosmith,SOOOWIC12A6701C7E5
3,TRBGPXH128F428C912,Chiove,AR7LIU31187B98EF11,Rita Chiarelli,SOLSWBA12A8C141B9A
4,TRBGPSV12903CA9C25,Rude Bwoy Love (feat. Dj Fly_ Dj Traxx_ T.Will...,ARPGCHN1187B9A2831,Nicky B_ Naëlle,SOHNJQL12AB018CC5C


In [25]:
# Strip commas and apostrophes from the track titles, then preview the result.
titles["title"] = (
    titles["title"]
    .str.replace(",", "")
    .str.replace("'", "")
)
titles["title"].head(10)

0                     The Law Gonna Step On You (1931)
1                                           Des Vôtres
2                                                Crazy
3                                               Chiove
4    Rude Bwoy Love (feat. Dj Fly_ Dj Traxx_ T.Will...
5                                     Down South Blues
6                                               Apagon
7                        It Came Upon A Midnight Clear
8                                              Doin It
9                                 Looking In Your Eyes
Name: title, dtype: object

In [26]:
# Strip commas and apostrophes from the artist names, then preview the result.
titles["artist"] = (
    titles["artist"]
    .str.replace(",", "")
    .str.replace("'", "")
)
titles["artist"].head()

0                     Bo Carter
1    Fredericks_ Goldman_ Jones
2                     Aerosmith
3                Rita Chiarelli
4               Nicky B_ Naëlle
Name: artist, dtype: object

In [27]:
# Keep only the identifying columns needed for the final table.
names = titles.loc[:, ["track_id", "title", "artist"]]

In [28]:
names.head()

Unnamed: 0,track_id,title,artist
0,TRBGPHG12903CE6CC3,The Law Gonna Step On You (1931),Bo Carter
1,TRBGPYK128F42796E1,Des Vôtres,Fredericks_ Goldman_ Jones
2,TRBGPJP128E078ED20,Crazy,Aerosmith
3,TRBGPXH128F428C912,Chiove,Rita Chiarelli
4,TRBGPSV12903CA9C25,Rude Bwoy Love (feat. Dj Fly_ Dj Traxx_ T.Will...,Nicky B_ Naëlle


## Make final df of track info for recommendations

In [29]:
# Attach title and artist to each track's recommend flag and cluster,
# keeping track ids that appear on either side.
trackinfo = names.merge(trackclusterrecs, how='outer', on="track_id")

In [30]:
trackinfo.head()

Unnamed: 0,track_id,title,artist,recommend,cluster
0,TRBGPHG12903CE6CC3,The Law Gonna Step On You (1931),Bo Carter,0,23
1,TRBGPYK128F42796E1,Des Vôtres,Fredericks_ Goldman_ Jones,0,37
2,TRBGPJP128E078ED20,Crazy,Aerosmith,1,4
3,TRBGPXH128F428C912,Chiove,Rita Chiarelli,0,13
4,TRBGPSV12903CA9C25,Rude Bwoy Love (feat. Dj Fly_ Dj Traxx_ T.Will...,Nicky B_ Naëlle,1,38


In [31]:
# Write the combined table straight to disk. Handing the path to to_csv lets
# pandas open and write the file itself, instead of first building the whole
# CSV as one in-memory string and pushing it through a text-mode handle, where
# platform newline translation can alter the line endings.
trackinfo.to_csv(basedir + "trackinfo.csv")