# Importing the libraries

In [392]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [393]:
# Spotify API credentials
client_id = 'd4eec2244fb2416ebce8ec488b1f9587'
client_secret = '0cf3c26eca6244e796a10a847dc11f17'

auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [394]:
df=pd.read_csv('../Data/unique_tracks.csv')
artist_features=pd.read_csv('../Data/artist_features.csv')
audio_features=pd.read_csv('../Data/audio_features.csv')
track_features=pd.read_csv('../Data/track_features.csv')

In [395]:
track_features.columns

Index(['track_uri', 'release_date', 'popularity'], dtype='object')

In [396]:
audio_features.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature'],
      dtype='object')

In [397]:
artist_features.columns

Index(['0', 'artist_pop', 'genres'], dtype='object')

In [398]:
df.columns

Index(['name', 'num_holdouts', 'pid', 'num_tracks', 'num_samples', '0', 'pos',
       'artist_name', 'track_uri', 'artist_uri', 'track_name', 'album_uri',
       'duration_ms', 'album_name'],
      dtype='object')

In [399]:
# Remove 'spotify:track:' from 'track_uri' in df
df['track_uri'] = df['track_uri'].str.replace('spotify:track:', '')

df['artist_uri'] = df['artist_uri'].str.replace('spotify:artist:', '')

# Rename the column '0' to 'artist_uri' in artist_features DataFrame
artist_features.rename(columns={'0': 'artist_uri'}, inplace=True)

# Merge df with audio_features using the modified 'track_uri' and 'id'
final_df = pd.merge(df, audio_features, left_on='track_uri', right_on='id', how='outer')


In [400]:
final_df.isna().sum()

name                 15254
num_holdouts             0
pid                      0
num_tracks               0
num_samples              0
0                   287205
pos                   1000
artist_name           1000
track_uri             1000
artist_uri            1000
track_name            1001
album_uri             1000
duration_ms_x         1000
album_name            1000
danceability         74117
energy               74117
key                  74117
loudness             74117
mode                 74117
speechiness          74117
acousticness         74117
instrumentalness     74117
liveness             74117
valence              74117
tempo                74117
type                 74117
id                   74117
uri                  74117
track_href           74117
analysis_url         74117
duration_ms_y        74117
time_signature       74117
dtype: int64

In [401]:
# Merging df with track_features based on 'track_uri'
final_df = pd.merge(final_df, track_features, on='track_uri', how='outer')

# Now, let's merge artist_features using the column 'track_uri' from final_df and '0' from artist_features
final_df = pd.merge(final_df, artist_features, on='artist_uri', how='outer')

# Drop duplicate columns (if any)
final_df = final_df.loc[:, ~final_df.columns.duplicated()]

# Display the first few rows of the final DataFrame
final_df.head()


Unnamed: 0,name,num_holdouts,pid,num_tracks,num_samples,0,pos,artist_name,track_uri,artist_uri,...,id,uri,track_href,analysis_url,duration_ms_y,time_signature,release_date,popularity,artist_pop,genres
0,spanish playlist,11,1000002,11,0,,,,,,...,,,,,,,,,,
1,Groovin,48,1000003,48,0,,,,,,...,,,,,,,,,,
2,uplift,40,1000004,40,0,,,,,,...,,,,,,,,,,
3,WUBZ,27,1000006,27,0,,,,,,...,,,,,,,,,,
4,new,41,1000007,41,0,,,,,,...,,,,,,,,,,


In [402]:
# Rename columns for clarity
final_df.rename(columns={
 
    'duration_ms_x': 'track_duration_ms',
    'duration_ms_y': 'audio_duration_ms'
    # ... other columns need renaming as per your preference
}, inplace=True)

# Drop duplicate columns or unwanted columns
final_df = final_df.drop(['id', 'uri', 'track_href', 'analysis_url'], axis=1)

# Display the modified DataFrame
final_df.head()


Unnamed: 0,name,num_holdouts,pid,num_tracks,num_samples,0,pos,artist_name,track_uri,artist_uri,...,liveness,valence,tempo,type,audio_duration_ms,time_signature,release_date,popularity,artist_pop,genres
0,spanish playlist,11,1000002,11,0,,,,,,...,,,,,,,,,,
1,Groovin,48,1000003,48,0,,,,,,...,,,,,,,,,,
2,uplift,40,1000004,40,0,,,,,,...,,,,,,,,,,
3,WUBZ,27,1000006,27,0,,,,,,...,,,,,,,,,,
4,new,41,1000007,41,0,,,,,,...,,,,,,,,,,


In [403]:
final_df.isna().sum()

name                  15254
num_holdouts              0
pid                       0
num_tracks                0
num_samples               0
0                    287205
pos                    1000
artist_name            1000
track_uri              1000
artist_uri             1000
track_name             1001
album_uri              1000
track_duration_ms      1000
album_name             1000
danceability          74117
energy                74117
key                   74117
loudness              74117
mode                  74117
speechiness           74117
acousticness          74117
instrumentalness      74117
liveness              74117
valence               74117
tempo                 74117
type                  74117
audio_duration_ms     74117
time_signature        74117
release_date           2967
popularity             2967
artist_pop            17702
genres                17702
dtype: int64

# Droping Unwanted Columns Save Space

There were still 101 from audio_features and 576 from track_features extraction that were missing from the soptify api, so I had to drop them.

In [404]:
final_df.shape

(287205, 32)

In [405]:
#final_df.dropna(axis=0,inplace=True)
final_df.shape

(287205, 32)

In [406]:
missing_t_uri=final_df.track_uri.isna()
missing_t_uri=missing_t_uri.unique()
random.shuffle(missing_t_uri)


In [407]:
f = open('track_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
        track_features = sp.tracks(missing_t_uri[i:i+1])
        for x in range(1):
            track_pop=pd.DataFrame([missing_t_uri[i+x]])
            track_pop['release_date']=track_features['tracks'][x]['album']['release_date']
            track_pop['pop'] = track_features['tracks'][x]["popularity"]
            csv_data = track_pop.to_csv(header=False,index=False)
            f.write(csv_data)
    except Exception as e:
        r = open("extract_log.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

100%|██████████| 2/2 [00:02<00:00,  1.01s/it]


In [408]:
f = open('audio_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
     track_feature = sp.audio_features(missing_t_uri[i:i+1])
     track_df = pd.DataFrame(track_feature)
     csv_data = track_df.to_csv(header=False,index=False)
     f.write(csv_data)
    except Exception as e:
        r = open("extract_log0.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

100%|██████████| 2/2 [00:02<00:00,  1.01s/it]


In [409]:
final_df.columns

Index(['name', 'num_holdouts', 'pid', 'num_tracks', 'num_samples', '0', 'pos',
       'artist_name', 'track_uri', 'artist_uri', 'track_name', 'album_uri',
       'track_duration_ms', 'album_name', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'type', 'audio_duration_ms',
       'time_signature', 'release_date', 'popularity', 'artist_pop', 'genres'],
      dtype='object')

In [410]:
final_df.dropna(subset='track_uri',inplace=True)

In [411]:
final_df.head()

Unnamed: 0,name,num_holdouts,pid,num_tracks,num_samples,0,pos,artist_name,track_uri,artist_uri,...,liveness,valence,tempo,type,audio_duration_ms,time_signature,release_date,popularity,artist_pop,genres
1000,Party,70,1000000,75,5,,0.0,AronChupa,66U0ASk1VHZsqIkpMjKX3B,5vCOdeiQt9LyzdI87kt5Sh,...,,,,,,,,,,
1001,Party 1,54,1039848,154,100,,130.0,AronChupa,66U0ASk1VHZsqIkpMjKX3B,5vCOdeiQt9LyzdI87kt5Sh,...,,,,,,,,,,
1002,Party,70,1000000,75,5,,1.0,AronChupa,5MhsZlmKJG6X5kTHkdwC4B,5vCOdeiQt9LyzdI87kt5Sh,...,,,,,,,,,,
1003,AC,39,1009575,49,10,,2.0,AronChupa,5MhsZlmKJG6X5kTHkdwC4B,5vCOdeiQt9LyzdI87kt5Sh,...,,,,,,,,,,
1004,Party,76,1007292,101,25,,32.0,AronChupa,5MhsZlmKJG6X5kTHkdwC4B,5vCOdeiQt9LyzdI87kt5Sh,...,,,,,,,,,,


In [412]:
final_df.shape

(286205, 32)

In [413]:
final_df.drop('0',axis=1,inplace=True)

In [414]:
final_df.isna().sum()

name                 15254
num_holdouts             0
pid                      0
num_tracks               0
num_samples              0
pos                      0
artist_name              0
track_uri                0
artist_uri               0
track_name               1
album_uri                0
track_duration_ms        0
album_name               0
danceability         73117
energy               73117
key                  73117
loudness             73117
mode                 73117
speechiness          73117
acousticness         73117
instrumentalness     73117
liveness             73117
valence              73117
tempo                73117
type                 73117
audio_duration_ms    73117
time_signature       73117
release_date          1967
popularity            1967
artist_pop           16702
genres               16702
dtype: int64

In [416]:
final_df.to_csv('../Data/1M_unique_processed_data.csv',index=False)