# Importing the libraries

In [2]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from skimage import io
from sklearn.preprocessing import OneHotEncoder

In [3]:
stream= open("spotify/spotify.yaml")
spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

FileNotFoundError: [Errno 2] No such file or directory: 'spotify/spotify.yaml'

# Importing the dataset

In [3]:
dtypes = {'track_uri': 'object', 'artist_uri': 'object', 'album_uri': 'object', 'danceability': 'float16', 'energy': 'float16', 'key': 'float16',
               'loudness': 'float16', 'mode': 'float16', 'speechiness': 'float16', 'acousticness': 'float16', 'instrumentalness': 'float16',
               'liveness': 'float16', 'valence': 'float16', 'tempo': 'float16', 'duration_ms': 'float32', 'time_signature': 'float16',
               'Track_release_date': 'int8', 'Track_pop': 'int8', 'Artist_pop': 'int8', 'Artist_genres': 'object'}
try:
    df=pd.read_csv('Data/1M_unique_processed_data_grow.csv',dtype=dtypes)
except:
    print('Failed to load grow')
    df=pd.read_csv('Data/1M_unique_processed_data.csv',dtype=dtypes)


# Test

Extract playlist tracks and artist uri

In [4]:
def get_IDs (user, playlist_id):
 track_ids = []
 artist_id = []
 playlist=sp.user_playlist (user, playlist_id)
 for item in playlist['tracks']['items']:
  track=item['track']
  track_ids.append(track['id'])
  artist=item['track']['artists']
  artist_id.append(artist[0]['id'])
 return track_ids,artist_id


track_ids,artist_id = get_IDs ('Ruby', 'spotify:playlist:37i9dQZF1DX8FwnYE6PRvL') 
print (len(track_ids))
print (len(artist_id))

80
80


getting the unique URI and repeating the extraction features and preprocessing steps for the user's playlist (input)

In [5]:
artist_id_uni=list(set(artist_id))
track_ids_uni=list(set(track_ids))

In [6]:
audio_features=pd.DataFrame()
for i in tqdm(range(0,len(track_ids_uni),25)):
    try:
     track_feature = sp.audio_features(track_ids_uni[i:i+25])
     track_df = pd.DataFrame(track_feature)
     audio_features=pd.concat([audio_features,track_df],axis=0)
    except Exception as e:
        print(e)
        continue

100%|██████████| 4/4 [00:00<00:00, 12.20it/s]


In [7]:
track_=pd.DataFrame()
for i in tqdm(range(0,len(track_ids_uni),25)):
    try:
        track_features = sp.tracks(track_ids_uni[i:i+25])
        for x in range(25):
            track_pop=pd.DataFrame([track_ids_uni[i+x]],columns=['Track_uri'])
            track_pop['Track_release_date']=track_features['tracks'][x]['album']['release_date']
            track_pop['Track_pop'] = track_features['tracks'][x]["popularity"]
            track_pop['Artist_uri']=track_features['tracks'][x]['artists'][0]['id']
            track_pop['Album_uri']=track_features['tracks'][x]['album']['id']
            track_=pd.concat([track_,track_pop],axis=0)
    except Exception as e:
        print(e)
        continue

100%|██████████| 4/4 [00:00<00:00,  4.37it/s]

list index out of range





In [8]:
artist_=pd.DataFrame()
for i in tqdm(range(0,len(artist_id_uni),25)):
    try:
        artist_features = sp.artists(artist_id_uni[i:i+25])
        for x in range(25):
            artist_df=pd.DataFrame([artist_id_uni[i+x]],columns=['Artist_uri'])
            artist_pop = artist_features['artists'][x]["popularity"]
            artist_genres = artist_features['artists'][x]["genres"]
            artist_df["Artist_pop"] = artist_pop
            if artist_genres: 
                artist_df["genres"] = " ".join([re.sub(' ','_',i) for i in artist_genres])
            else:
              artist_df["genres"] = "unknown"
            artist_=pd.concat([artist_,artist_df],axis=0)
    except Exception as e:
        print(e)
        continue

100%|██████████| 3/3 [00:00<00:00,  9.76it/s]

list index out of range





In [9]:
test=pd.DataFrame(track_,columns=['Track_uri','Artist_uri','Album_uri'])

In [10]:
test.rename(columns = {'Track_uri':'track_uri','Artist_uri':'artist_uri','Album_uri':'album_uri'}, inplace = True)

In [11]:
audio_features.drop(columns=['type','uri','track_href','analysis_url'],axis=1,inplace=True)

In [12]:
test = pd.merge(test,audio_features, left_on = "track_uri", right_on= "id",how = 'outer')
test = pd.merge(test,track_, left_on = "track_uri", right_on= "Track_uri",how = 'outer')
test = pd.merge(test,artist_, left_on = "artist_uri", right_on= "Artist_uri",how = 'outer')

In [13]:
del audio_features,track_,artist_

In [14]:
test.rename(columns = {'genres':'Artist_genres'}, inplace = True)

In [15]:
test.drop(columns=['Track_uri','Artist_uri_x','Artist_uri_y','Album_uri','id'],axis=1,inplace=True)

In [16]:
test.dropna(axis=0,inplace=True)

In [17]:
test['Track_pop'] = test['Track_pop'].apply(lambda x: int(x/5))
test['Artist_pop'] = test['Artist_pop'].apply(lambda x: int(x/5))
test['Track_release_date'] = test['Track_release_date'].apply(lambda x: x.split('-')[0])
test['Track_release_date']=test['Track_release_date'].astype('int16')
test['Track_release_date'] = test['Track_release_date'].apply(lambda x: int(x/50))

In [18]:
test[['danceability', 'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo', 'time_signature']]=test[['danceability', 'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo','time_signature']].astype('float16')
test[['duration_ms']]=test[['duration_ms']].astype('float32')
test[['Track_release_date', 'Track_pop', 'Artist_pop']]=test[['Track_release_date', 'Track_pop', 'Artist_pop']].astype('int8')

In [19]:
currentdf=len(df)

In [20]:
df=pd.concat([df,test],axis=0)

In [21]:
df.drop_duplicates(subset=['track_uri'],inplace=True,keep='last') ## keep last to keep the dataset updated 

In [22]:
df.dropna(axis=0,inplace=True)

In [23]:
print('{} New Tracks Found'.format(len(df)-currentdf))

36 New Tracks Found


In [24]:
#saving the tracks if they weren't found in the dataset
if len(df)>currentdf:   
    df.to_csv('data/1M_unique_processed_data_grow.csv',index=False)
    print('{} New Found'.format(len(df)-currentdf))
    streamlit=df[df.Track_pop >0]             # dropped track with 0 popularity score to save space and ram for the final model
    streamlit.to_csv('data/streamlit.csv',index=False)
    del streamlit

36 New Found


In [25]:
df = df[~df['track_uri'].isin(test['track_uri'].values)]

In [26]:
test['Artist_genres'] = test['Artist_genres'].apply(lambda x: x.split(" "))
tfidf = TfidfVectorizer(max_features=3) #max_features=5 
tfidf_matrix = tfidf.fit_transform(test['Artist_genres'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]

In [27]:
genre_df=genre_df.astype('float16')
test.drop(columns=['Artist_genres'],axis=1,inplace=True)

In [28]:
test = pd.concat([test.reset_index(drop=True), genre_df.reset_index(drop=True)],axis = 1)


In [29]:
test.isna().sum().sum()

0

# df

In [30]:
df['Artist_genres'] = df['Artist_genres'].apply(lambda x: x.split(" "))
tfidf_matrix = tfidf.transform(df['Artist_genres'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]

In [31]:
genre_df=genre_df.astype('float16')
df.drop(columns=['Artist_genres'],axis=1,inplace=True)

In [32]:
df = pd.concat([df.reset_index(drop=True), genre_df.reset_index(drop=True)],axis = 1)

# pred

In [33]:
try:
    df.drop(columns=['genre|unknown'],axis=1,inplace=True)
    test.drop(columns=['genre|unknown'],axis=1,inplace=True)
except:
    print('genre|unknown not found')

genre|unknown not found


In [34]:
test.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'Track_release_date', 'Track_pop', 'Artist_pop', 'genre|modern_rock',
       'genre|permanent_wave', 'genre|rock'],
      dtype='object')

In [35]:
df.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'Track_release_date', 'Track_pop', 'Artist_pop', 'genre|modern_rock',
       'genre|permanent_wave', 'genre|rock'],
      dtype='object')

I was first using OneHotEncoder for "Track_release_date", "Track_pop", and "Artist_pop," but I found no difference in the final result other than high memory usage.

In [36]:
""" ohe = OneHotEncoder(handle_unknown='ignore')
dummies = pd.DataFrame(ohe.fit_transform(test[['Track_release_date', 'Track_pop', 'Artist_pop']]).toarray(), index=test.index,dtype=int)
column_name = ohe.get_feature_names_out(['Track_release_date', 'Track_pop', 'Artist_pop'])
dummies.columns=column_name
test = pd.concat([test.drop(['Track_release_date', 'Track_pop', 'Artist_pop'], axis=1), dummies], axis=1) """

" ohe = OneHotEncoder(handle_unknown='ignore')\ndummies = pd.DataFrame(ohe.fit_transform(test[['Track_release_date', 'Track_pop', 'Artist_pop']]).toarray(), index=test.index,dtype=int)\ncolumn_name = ohe.get_feature_names_out(['Track_release_date', 'Track_pop', 'Artist_pop'])\ndummies.columns=column_name\ntest = pd.concat([test.drop(['Track_release_date', 'Track_pop', 'Artist_pop'], axis=1), dummies], axis=1) "

In [37]:
""" ohe2 = OneHotEncoder(categories=ohe.categories_,handle_unknown='ignore')
dummies = pd.DataFrame(ohe2.fit_transform(df[['Track_release_date', 'Track_pop', 'Artist_pop']]).toarray(), index=df.index, dtype=int)
column_name = ohe2.get_feature_names_out(['Track_release_date', 'Track_pop', 'Artist_pop'])
dummies.columns=column_name
df=pd.concat([df.drop(['Track_release_date', 'Track_pop', 'Artist_pop'], axis=1), dummies], axis=1)
 """

" ohe2 = OneHotEncoder(categories=ohe.categories_,handle_unknown='ignore')\ndummies = pd.DataFrame(ohe2.fit_transform(df[['Track_release_date', 'Track_pop', 'Artist_pop']]).toarray(), index=df.index, dtype=int)\ncolumn_name = ohe2.get_feature_names_out(['Track_release_date', 'Track_pop', 'Artist_pop'])\ndummies.columns=column_name\ndf=pd.concat([df.drop(['Track_release_date', 'Track_pop', 'Artist_pop'], axis=1), dummies], axis=1)\n "

In [38]:
#df.info(memory_usage = "deep")

In [39]:
#test.loc[:,test.columns.str.startswith('genre')]=test.loc[:,test.columns.str.startswith('genre')].astype('bool')
#df.loc[:,df.columns.str.startswith('genre')]=df.loc[:,df.columns.str.startswith('genre')].astype('bool')


In [40]:
sc=MinMaxScaler()
df.iloc[:,3:19]=sc.fit_transform(df.iloc[:,3:19])
pickle.dump(sc, open('data/sc.sav', 'wb'))

In [41]:
test.iloc[:,3:19]=sc.transform(test.iloc[:,3:19])

In [42]:
playvec=pd.DataFrame(test.sum(axis=0)).T
playvec

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,genre|modern_rock,genre|permanent_wave,genre|rock
0,2AT8iROs4FQueDv2c8q2KE69uxyAqqPIsUyTO8txoP2M37...,7Ln80lUS6He07XvHI8qqHH4gzpq5DPGxSnKTe4SA8HAU58...,78bpIziExqiI9qztvNFlQu3cfAM8b8KqJRoIzt3zLKqw0k...,44.138238,66.62793,36.363637,66.575184,57.0,5.761925,4.38462,...,51.553955,38.882252,2.999022,64.000001,78.950001,67.777777,56.650001,16.453125,17.1875,43.09375


In [43]:
df['sim']=cosine_similarity(df.drop(['track_uri', 'artist_uri', 'album_uri'], axis = 1),playvec.drop(['track_uri', 'artist_uri', 'album_uri'], axis = 1))
df['sim2']=cosine_similarity(df.iloc[:,16:-1],playvec.iloc[:,16:])
df['sim3']=cosine_similarity(df.iloc[:,19:-2],playvec.iloc[:,19:])
df = df.sort_values(['sim3','sim2','sim'],ascending = False,kind='stable')
qq=df.groupby('artist_uri').head(5).track_uri.head(50)     #to limit recmmendation by same artist
aa=sp.tracks(qq[0:50])
Fresult=pd.DataFrame()
for i in range(50):
    result=pd.DataFrame([i])
    result['track_name']=aa['tracks'][i]['name']
    result['artist_name']=aa['tracks'][i]['artists'][0]['name']
    #result['url']=aa['tracks'][i]['external_urls']['spotify']
    #result['image']=aa['tracks'][i]['album']['images'][1]['url']
    Fresult=pd.concat([Fresult,result],axis=0)
Fresult

Unnamed: 0,0,track_name,artist_name
0,0,Don't Stop Me Now - Remastered 2011,Queen
0,1,Another One Bites The Dust - Remastered 2011,Queen
0,2,It's My Life,Bon Jovi
0,3,Can You Feel My Heart,Bring Me The Horizon
0,4,Sweet Child O' Mine,Guns N' Roses
0,5,Welcome To The Jungle,Guns N' Roses
0,6,Highway to Hell,AC/DC
0,7,Can't Help Falling in Love,Elvis Presley
0,8,Have You Ever Seen The Rain,Creedence Clearwater Revival
0,9,Fortunate Son,Creedence Clearwater Revival


In [44]:
df['sim']=cosine_similarity(df.iloc[:,3:16],playvec.iloc[:,3:16])
df['sim2']=cosine_similarity(df.loc[:, df.columns.str.startswith('T')|df.columns.str.startswith('A')],playvec.loc[:, playvec.columns.str.startswith('T')|playvec.columns.str.startswith('A')])
df['sim3']=cosine_similarity(df.loc[:, df.columns.str.startswith('genre')],playvec.loc[:, playvec.columns.str.startswith('genre')])
df['sim4']=(df['sim']+df['sim2']+df['sim3'])/3
df = df.sort_values(['sim4'],ascending = False,kind='stable')
# genra>audio>pop
qq=df.groupby('artist_uri').head(5).track_uri.head(50)
aa=sp.tracks(qq[0:50])
Fresult=pd.DataFrame()
for i in range(50):
    result=pd.DataFrame([i])
    result['track_name']=aa['tracks'][i]['name']
    result['artist_name']=aa['tracks'][i]['artists'][0]['name']
    #result['url']=aa['tracks'][i]['external_urls']['spotify']
    #result['image']=aa['tracks'][i]['album']['images'][1]['url']
    Fresult=pd.concat([Fresult,result],axis=0)
Fresult

Unnamed: 0,0,track_name,artist_name
0,0,Go Your Own Way - 2004 Remaster,Fleetwood Mac
0,1,We Didn't Start the Fire,Billy Joel
0,2,Gimme All Your Lovin',ZZ Top
0,3,Thorn in My Side - Remastered,Eurythmics
0,4,Any Way You Want It,Journey
0,5,I Don't Wanna Stop,Ozzy Osbourne
0,6,Semi-Charmed Life,Third Eye Blind
0,7,Sweet Child O' Mine,Guns N' Roses
0,8,Old Time Rock & Roll,Bob Seger
0,9,Lump,The Presidents Of The United States Of America


In [45]:
Spotifyresult=pd.DataFrame()
for i in range(len(test)-1):
    if len(Spotifyresult)>=50:
        break
    ff=sp.recommendations(seed_tracks=list(test.track_uri[1+i:5+i]),limit=2)
    for z in range(2):
        result=pd.DataFrame([z+(2*i)+1])
        result['track_name']=ff['tracks'][z]['name']
        result['artist_name']=ff['tracks'][z]['artists'][0]['name']
        #result['uri']=ff['tracks'][z]['id']
        #result['url']=ff['tracks'][z]['external_urls']['spotify']
        #result['image']=ff['tracks'][z]['album']['images'][1]['url']
        Spotifyresult=pd.concat([Spotifyresult,result],axis=0)
Spotifyresult

Unnamed: 0,0,track_name,artist_name
0,1,R.O.C.K. In The U.S.A. (A Salute To 60's Rock),John Mellencamp
0,2,Blood On Blood,Bon Jovi
0,3,T.N.T.,AC/DC
0,4,Warning,Green Day
0,5,Turn The Page,Metallica
0,6,Forever Now,Green Day
0,7,Duality,Slipknot
0,8,Lit Up,Buckcherry
0,9,Plug in Baby,Muse
0,10,Wheel in the Sky,Journey


sorry ur playlist must have atleast 5 tracks for this method to work