# Importing the libraries

In [1]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
# Spotify API credentials
client_id = '69f9da2fa5d741779e2cf9d276b09dc0'
client_secret = '85fd8b5e00434f3fbec9f67bb08a4187'

auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [8]:
df=pd.read_csv('data/1m.csv')
artist_features=pd.read_csv('data/artist_features.csv')
audio_features=pd.read_csv('data/audio_features.csv')
track_features=pd.read_csv('data/track_features.csv')

In [16]:
track_features

Unnamed: 0,31kTKEPu5PPTTODydIYai0,2016-04-29,20
0,54i6YAQkZImwWjmzPrQeqd,2016-06-24,0
1,1UqhkbzB1kuFwt2iy4h29Q,2011,72
2,03f7xZmt2wHCIDJBFPK8G4,2015-08-14,74
3,3ASt4jBSatnN4RGMlym74h,2014-10-27,13
4,5bcTCxgc7xVfSaMV3RuVke,2017-06-30,77
...,...,...,...
64781,2PBTwMH2mzfLigdMyPzOcp,1991-09-03,44
64782,5JugcqxQihVYdvCSPzmP1H,2016-09-09,0
64783,1cKRBp7hrBVD4eP3W9x2AI,1998,0
64784,38griAVM808crjbFp9gcPD,2003,15


# Merging all dataframes

In [15]:
df = pd.merge(df,audio_features, left_on = "track_uri", right_on= "id",how = 'outer')

MergeError: Must pass "right_on" OR "right_index".

In [None]:
df = pd.merge(df,track_features, left_on = "track_uri", right_on= "Track_uri",how = 'outer')

In [None]:
df = pd.merge(df,artist_features, left_on = "artist_uri", right_on= "Artist_uri",how = 'outer')

# Handling missing data 

In [None]:
df.isna().sum()

track_uri               0
artist_uri              0
album_uri               0
danceability          101
energy                101
key                   101
loudness              101
mode                  101
speechiness           101
acousticness          101
instrumentalness      101
liveness              101
valence               101
tempo                 101
type                  101
id                    101
uri                   101
track_href            101
analysis_url          101
duration_ms           101
time_signature        101
Track_uri             576
Track_release_date    576
Track_pop             576
Artist_uri              0
Artist_pop              0
Artist_genres           0
dtype: int64

## Handling audio_features missing From extraction

In [None]:
missing_t_uri=df.track_uri[df.id.isna()]
missing_t_uri=missing_t_uri.unique()
random.shuffle(missing_t_uri)

In [None]:
f = open('data/audio_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
     track_feature = sp.audio_features(missing_t_uri[i:i+1])
     track_df = pd.DataFrame(track_feature)
     csv_data = track_df.to_csv(header=False,index=False)
     f.write(csv_data)
    except Exception as e:
        r = open("extract_log0.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

## Handling track_features missing From extraction

In [None]:
missing_t_uri=df.track_uri[df.Track_uri.isna()]
missing_t_uri=missing_t_uri.unique()
random.shuffle(missing_t_uri)

In [None]:
f = open('data/track_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
        track_features = sp.tracks(missing_t_uri[i:i+1])
        for x in range(1):
            track_pop=pd.DataFrame([missing_t_uri[i+x]])
            track_pop['release_date']=track_features['tracks'][x]['album']['release_date']
            track_pop['pop'] = track_features['tracks'][x]["popularity"]
            csv_data = track_pop.to_csv(header=False,index=False)
            f.write(csv_data)
    except Exception as e:
        r = open("extract_log.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

# Droping Unwanted Columns Save Space

There were still 101 from audio_features and 576 from track_features extraction that were missing from the soptify api, so I had to drop them.

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
df.isna().sum().sum()

0

In [None]:
df.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href',
       'analysis_url', 'duration_ms', 'time_signature', 'Track_uri',
       'Track_release_date', 'Track_pop', 'Artist_uri', 'Artist_pop',
       'Artist_genres'],
      dtype='object')

In [None]:
df.drop(columns=['Track_uri','Artist_uri','type','id','uri','track_href','analysis_url'],axis=1,inplace=True)

In [None]:
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,Artist_genres
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K,0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,226864.0,4.0,2005-07-04,67.0,71,dance_pop hip_hop hip_pop pop_rap r&b rap urba...


## Data Preprocessing

Create five point buckets for track and artist popularity .

and 50 point buckets for the track release date.

In [None]:
df['Track_pop'] = df['Track_pop'].apply(lambda x: int(x/5))
df['Artist_pop'] = df['Artist_pop'].apply(lambda x: int(x/5))

In [None]:
df['Track_release_date'] = df['Track_release_date'].apply(lambda x: x.split('-')[0])
df['Track_release_date']=df['Track_release_date'].astype('int16')
df['Track_release_date'] = df['Track_release_date'].apply(lambda x: int(x/50))

In [None]:
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,Artist_genres
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K,0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,226864.0,4.0,40,13,14,dance_pop hip_hop hip_pop pop_rap r&b rap urba...


In [None]:
df.to_csv('data/1M_unique_processed_data.csv',index=False)