<a href="https://colab.research.google.com/github/viperyzen/SpotifyRooms/blob/master/Spotify_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spotify Recommendation System

PREPROCESSING AND TRANSFORMATION OF ARTISTS DATA

In [None]:
# Using the dataset collected by Yamac Eren Ay using Spotify Web API
# Dataset Link : https://www.kaggle.com/datasets/yamaerenay/spotify-dataset-19212020-600k-tracks


In [3]:
# To read and handle data files
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# For handling arrays and vectors
import numpy as np

# For string manipulation
import string

# For displaying progress
from tqdm.auto import tqdm
tqdm.pandas()

# Supress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read data file and display first 5 records
artists = pd.read_csv('artists.csv', engine='python', encoding='utf-8', error_bad_lines=False)
artists.head()

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0


In [None]:
artists.describe()

Unnamed: 0,followers,popularity
count,1162084.0,1162095.0
mean,10220.7,8.795961
std,254399.5,13.55777
min,0.0,0.0
25%,10.0,0.0
50%,57.0,2.0
75%,417.0,13.0
max,78900230.0,100.0


In [None]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1162095 entries, 0 to 1162094
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   id          1162095 non-null  object 
 1   followers   1162084 non-null  float64
 2   genres      1162095 non-null  object 
 3   name        1162095 non-null  object 
 4   popularity  1162095 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 44.3+ MB


In [None]:
# Display final 5 records in artists DataFrame
artists.tail()

Unnamed: 0,id,followers,genres,name,popularity
139505,5voNQzKKIBqYGeHm4Abw3v,87515.0,"['sertanejo', 'sertanejo tradicional', 'sertan...",Pedro Bento & Zé Da Estrada,44
139506,2DHROEfQ05NhnciQxReWk6,21574.0,"['malaysian pop', 'rock kapak']",Spring,44
139507,7gCZfkXLOfuZozSItq6tCG,1786.0,['british choir'],London Symphony Chorus,44
139508,5r4KrJXWxLsbym4vjcRv1i,59628.0,"['french hip hop', 'old school rap francais', ...",Tandem,44
139509,2PjgZkwAEk7UTin4jP6HLP,47366.0,"['bebop', 'big band', 'cool jazz', 'jazz', 'st...",Lionel Hampton,44


In [None]:
# Sample record from artists data
pd.DataFrame(artists.iloc[144][list(artists.columns)]).transpose()

Unnamed: 0,id,followers,genres,name,popularity
144,4sTTEheJxmjwv9TmrHOaPz,288.0,"[""children's story""]",Honor Blackman,15


In [None]:
# Function to preprocess and transform artists data
def transform_artists(df):
    print('Transforming artists DataFrame...')

    # Remove punctuations from "genre" field for each record
    print('\nRemoving punctuations from "genre" field...')
    df['genres'] = df.progress_apply(lambda x: x['genres'].lower().translate(str.maketrans('', '', string.punctuation)), axis=1)
    
    # Reorder columns
    df_transformed = df.reindex(columns=['id', 'name', 'genres', 'followers', 'popularity'])

    # Return preprocessed DataFrame
    print("\nartists DataFrame transformed successfully!\n")
    return df_transformed

In [None]:
# Preprocess and transform artists DataFrame and display first 5 records
artists_transformed = transform_artists(artists)
artists_transformed.head()

Transforming artists DataFrame...

Removing punctuations from "genre" field...


  0%|          | 0/139510 [00:00<?, ?it/s]


artists DataFrame transformed successfully!



Unnamed: 0,id,name,genres,followers,popularity
0,0DheY5irMjBUeLybbCUEZ2,Armid & Amir Zare Pashai feat. Sara Rouzbehani,,0.0,0
1,0DlhY15l3wsrnlfGio2bjU,ปูนา ภาวิณี,,5.0,0
2,0DmRESX2JknGPQyO15yxg7,Sadaa,,0.0,0
3,0DmhnbHjm1qw6NCYPeZNgJ,Tra'gruda,,0.0,0
4,0Dn11fWM7vHQ3rinvWEl4E,Ioannis Panoutsopoulos,,2.0,0


In [None]:
# Display final 5 records in artists DataFrame
artists_transformed.tail()

Unnamed: 0,id,name,genres,followers,popularity
139505,5voNQzKKIBqYGeHm4Abw3v,Pedro Bento & Zé Da Estrada,sertanejo sertanejo tradicional sertanejo univ...,87515.0,44
139506,2DHROEfQ05NhnciQxReWk6,Spring,malaysian pop rock kapak,21574.0,44
139507,7gCZfkXLOfuZozSItq6tCG,London Symphony Chorus,british choir,1786.0,44
139508,5r4KrJXWxLsbym4vjcRv1i,Tandem,french hip hop old school rap francais rap inde,59628.0,44
139509,2PjgZkwAEk7UTin4jP6HLP,Lionel Hampton,bebop big band cool jazz jazz stride swing voc...,47366.0,44


In [None]:
# Sample record from transformed artists data
pd.DataFrame(artists_transformed.iloc[144][list(artists_transformed.columns)]).transpose()

Unnamed: 0,id,name,genres,followers,popularity
144,4sTTEheJxmjwv9TmrHOaPz,Honor Blackman,childrens story,288.0,15


In [None]:
artists.to_csv('artists_transformed.csv', index=None)

PREPROCESSING AND TRANSFORMATION OF TRACKS DATA

In [4]:
# Read data file and display first 5 records
tracks = pd.read_csv('tracks.csv')
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0.0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0.0,-13.338,1.0,0.451,0.674,0.744,0.151,0.127,104.851,3.0
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0.0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0.0,-22.136,1.0,0.957,0.797,0.0,0.148,0.655,102.009,1.0
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0.0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1.0,-21.18,1.0,0.0512,0.994,0.0218,0.212,0.457,130.418,5.0
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0.0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7.0,-27.961,1.0,0.0504,0.995,0.918,0.104,0.397,169.98,3.0
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0.0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3.0,-16.9,0.0,0.039,0.989,0.13,0.311,0.196,103.22,4.0


In [5]:
tracks.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,45718.0,45718.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0,45717.0
mean,14.748764,210423.5,0.001728,0.513833,0.318997,5.0997,-14.130028,0.71901,0.136598,0.789039,0.249217,0.20873,0.538731,111.72041,3.776844
std,17.689579,146440.4,0.041534,0.167853,0.202392,3.453748,5.3691,0.449487,0.241472,0.260435,0.362865,0.162735,0.262736,30.901983,0.628517
min,0.0,3344.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,153016.5,0.0,0.393,0.169,2.0,-17.35,0.0,0.0368,0.695,2e-06,0.107,0.329,87.176,4.0
50%,6.0,181253.0,0.0,0.525,0.277,5.0,-13.315,1.0,0.047,0.905,0.00236,0.146,0.559,109.576,4.0
75%,28.0,213107.0,0.0,0.645,0.432,8.0,-10.293,1.0,0.0817,0.982,0.579,0.262,0.757,129.6,4.0
max,83.0,3577800.0,1.0,0.957,1.0,11.0,4.584,1.0,0.97,0.996,0.999,0.999,1.0,238.895,5.0


In [6]:
tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45718 entries, 0 to 45717
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                45718 non-null  object 
 1   name              45718 non-null  object 
 2   popularity        45718 non-null  int64  
 3   duration_ms       45718 non-null  int64  
 4   explicit          45717 non-null  float64
 5   artists           45717 non-null  object 
 6   id_artists        45717 non-null  object 
 7   release_date      45717 non-null  object 
 8   danceability      45717 non-null  float64
 9   energy            45717 non-null  float64
 10  key               45717 non-null  float64
 11  loudness          45717 non-null  float64
 12  mode              45717 non-null  float64
 13  speechiness       45717 non-null  float64
 14  acousticness      45717 non-null  float64
 15  instrumentalness  45717 non-null  float64
 16  liveness          45717 non-null  float6

In [None]:
# Display final 5 records in tracks DataFrame
tracks.tail()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
39972,7KhqTPA2HROMOeOlAyPxZo,Coconut Grove - 2003 Remaster,43,161307,0,"[""The Lovin' Spoonful""]",['7CCn4PFRRRZF127jtCBAUe'],1966-11,0.448,0.0729,11,-20.502,0,0.0505,0.965,0.000484,0.0981,0.341,106.583,3.0
39973,4lVZxG2q3fomnIQ5kZ3ifQ,"Hold On, I'm Comin'",4,155707,0,['Sam & Dave'],['2BVYdY4PyfCF9z4NrkhEB2'],2020-03-13,0.803,0.338,6,-14.029,1,0.0349,0.196,0.038,0.097,0.912,106.841,4.0
39974,0rcaoSGw38qRItmDwwpDrw,L'estasi dell'oro,42,202850,0,['Ennio Morricone'],['1nIUhcKHnK6iyumRyoV68C'],1966,0.136,0.484,0,-12.945,1,0.0505,0.715,0.523,0.0615,0.102,99.566,4.0
39975,2RvDHSvKghCX03v6w80gBb,Good Lovin',3,200760,0,['Mary Wells'],['1cjZk1xXn3YCToNg3uJpA7'],2020-03-13,0.553,0.57,5,-9.297,1,0.0618,0.356,0.0,0.412,0.699,153.547,4.0
39976,45pvbsb7O8aWz0KwMxfxE4,The Trio - Il Triello - Extended Version,42,434387,0,['Ennio Morricone'],['1nIUhcKHnK6iyumRyoV68C'],1966,0.194,0.633,2,-3.078,0,0.0338,0.542,0.766,0.0,,,


In [None]:
# Sample record from tracks data
pd.DataFrame(tracks.iloc[17591][list(tracks.columns)]).transpose()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
17591,1S337ZMUIKJnxoRD3UbiMc,"Der Freischütz, Op. 77, J. 277 Arr. By Hans Ma...",0,273907,0,"['Carl Maria von Weber', 'Robert Heger']","['1p6wR69pnH9LBWZvwliuz2', '1PcmDPoDizKeZDOjdW...",1943-12-31,0.29,0.193,7,-18.009,1,0.0541,0.934,0.00268,0.219,0.147,79.494,5.0


**Normalization:** It invloves adjusting values measured on different scales to a common scale .It permits to adjust values refered to different columns to a common scale



**Min Max:** Similar to Single Feature scaling, min max converts every values of a column into a number between 0 and 1. The new value is calculated as the difference between the current value and the min value , divided by the range of the column values

In [9]:
tracks['popularity']=(tracks['popularity']-tracks['popularity'].min())/(tracks['popularity'].max()-tracks['popularity'].min())
tracks

In [10]:
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,0.072289,126903,0.0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0.0,-13.338,1.0,0.451,0.674,0.744,0.151,0.127,104.851,3.0
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0.0,98200,0.0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0.0,-22.136,1.0,0.957,0.797,0.0,0.148,0.655,102.009,1.0
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0.0,181640,0.0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1.0,-21.18,1.0,0.0512,0.994,0.0218,0.212,0.457,130.418,5.0
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0.0,176907,0.0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7.0,-27.961,1.0,0.0504,0.995,0.918,0.104,0.397,169.98,3.0
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0.0,163080,0.0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3.0,-16.9,0.0,0.039,0.989,0.13,0.311,0.196,103.22,4.0


In [None]:
# Function to preprocess/transform tracks data
def transform_tracks(df):
    print('Transforming tracks DataFrame...')

    # Remove unwanted characters from "artists" field for each record
    print('\nRemoving unwanted characters from "artists" field...')
    df['artists'] = df.progress_apply(lambda x: x['artists'][1:-1].translate(str.maketrans('', '', "\'")), axis=1)

    # Remove "feat" from "artists" field for each record (unwanted)
    print('\nRemoving word "feat" from "artists" field...')
    df['artists'] = df.progress_apply(lambda x: x['artists'].replace('feat', ''), axis=1)

    # Remove unwanted characters from "id_artists" fied for each record and split to form a list of artist IDs
    print('\nTransforming "id_artists" field into list of artist IDs...')
    df['id_artists'] = df.progress_apply(lambda x: x['id_artists'][1:-1].split(','), axis=1)

    # Remove punctuations from artist IDs for each record
    print('\nRemoving punctuations from "id_artists" field...')
    df['id_artists'] = df.progress_apply(lambda x: [x['id_artists'][i].translate(str.maketrans('', '', string.punctuation)).strip() for i in range(len(x['id_artists']))], axis=1)

    # Create new column "release_year" to store year of release of song/track
    print('\nCreating new field "release_year"...')
    df['release_year'] = df.progress_apply(lambda x: x['release_date'][0:4], axis=1)

    # Transform "loudness" to have positive values
    print('\nTransforming "loudness" field to have only positive values...')
    df['loudness'] = df.progress_apply(lambda x: x['loudness']+60.0, axis=1)

    # Convert duration from milliseconds to seconds
    print('\nCreating "duration_s" field using "duration_ms" field...')
    df['duration_s'] = df.progress_apply(lambda x: x['duration_ms']*0.001, axis=1)

    # Create "genres" column using artists data
    print('\nCreating "genres" field using artists data...')
    df['genres'] = df.progress_apply(lambda x: str(artists.loc[artists['id'].isin(x['id_artists'])]['genres'].tolist()).translate(str.maketrans('', '', string.punctuation)), axis=1)

    # Drop "release_date", "duration_ms", "explicit" and "time_signature" columns
    df.drop(['release_date', 'duration_ms', 'explicit', 'time_signature'], axis=1, inplace=True)

    # Reorder columns
    df_transformed = df.reindex(columns=['id', 'name', 'artists', 'id_artists', 'genres', 'release_year', 'duration_s',
                                         'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                                         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'])

    # Return preprocessed DataFrame
    print("\ntracks DataFrame transformed successfully!\n")
    return df_transformed

In [None]:
# Preprocess and transform tracks DataFrame and display first 5 records
tracks_transformed = transform_tracks(tracks)
tracks_transformed.head()

Transforming tracks DataFrame...

Removing unwanted characters from "artists" field...


  0%|          | 0/39977 [00:00<?, ?it/s]


Removing word "feat" from "artists" field...


  0%|          | 0/39977 [00:00<?, ?it/s]


Transforming "id_artists" field into list of artist IDs...


  0%|          | 0/39977 [00:00<?, ?it/s]


Removing punctuations from "id_artists" field...


  0%|          | 0/39977 [00:00<?, ?it/s]


Creating new field "release_year"...


  0%|          | 0/39977 [00:00<?, ?it/s]


Transforming "loudness" field to have only positive values...


  0%|          | 0/39977 [00:00<?, ?it/s]


Creating "duration_s" field using "duration_ms" field...


  0%|          | 0/39977 [00:00<?, ?it/s]


Creating "genres" field using artists data...


  0%|          | 0/39977 [00:00<?, ?it/s]


tracks DataFrame transformed successfully!



Unnamed: 0,id,name,artists,id_artists,genres,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,35iwgR4jXetI318WEWsa1Q,Carve,Uli,[45tIt06XoI0Iio4LBEVpls],,1922,126.903,6,0.645,0.445,0,46.662,1,0.451,0.674,0.744,0.151,0.127,104.851
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,Fernando Pessoa,[14jtPCOoNZwquk5wd9DxrY],,1922,98.2,0,0.695,0.263,0,37.864,1,0.957,0.797,0.0,0.148,0.655,102.009
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini,[5LiOoJbxVSAMkBS2fUm3X2],tango vintage tango,1922,181.64,0,0.434,0.177,1,38.82,1,0.0512,0.994,0.0218,0.212,0.457,130.418
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,Ignacio Corsini,[5LiOoJbxVSAMkBS2fUm3X2],tango vintage tango,1922,176.907,0,0.321,0.0946,7,32.039,1,0.0504,0.995,0.918,0.104,0.397,169.98
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,Dick Haymes,[3BiJGZsyX9sJchTqcSA7Su],adult standards big band easy listening lounge...,1922,163.08,0,0.402,0.158,3,43.1,0,0.039,0.989,0.13,0.311,0.196,103.22


In [None]:
# Display final 5 records in tracks DataFrame
tracks_transformed.tail()

Unnamed: 0,id,name,artists,id_artists,genres,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
39972,7KhqTPA2HROMOeOlAyPxZo,Coconut Grove - 2003 Remaster,"""The Lovin Spoonful""",[7CCn4PFRRRZF127jtCBAUe],,1966,161.307,43,0.448,0.0729,11,39.498,0,0.0505,0.965,0.000484,0.0981,0.341,106.583
39973,4lVZxG2q3fomnIQ5kZ3ifQ,"Hold On, I'm Comin'",Sam & Dave,[2BVYdY4PyfCF9z4NrkhEB2],,2020,155.707,4,0.803,0.338,6,45.971,1,0.0349,0.196,0.038,0.097,0.912,106.841
39974,0rcaoSGw38qRItmDwwpDrw,L'estasi dell'oro,Ennio Morricone,[1nIUhcKHnK6iyumRyoV68C],,1966,202.85,42,0.136,0.484,0,47.055,1,0.0505,0.715,0.523,0.0615,0.102,99.566
39975,2RvDHSvKghCX03v6w80gBb,Good Lovin',Mary Wells,[1cjZk1xXn3YCToNg3uJpA7],,2020,200.76,3,0.553,0.57,5,50.703,1,0.0618,0.356,0.0,0.412,0.699,153.547
39976,45pvbsb7O8aWz0KwMxfxE4,The Trio - Il Triello - Extended Version,Ennio Morricone,[1nIUhcKHnK6iyumRyoV68C],,1966,434.387,42,0.194,0.633,2,56.922,0,0.0338,0.542,0.766,0.0,,


In [None]:
# Sample record from transformed tracks data
pd.DataFrame(tracks_transformed.iloc[17591][list(tracks_transformed.columns)]).transpose()

Unnamed: 0,id,name,artists,id_artists,genres,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
17591,1S337ZMUIKJnxoRD3UbiMc,"Der Freischütz, Op. 77, J. 277 Arr. By Hans Ma...","Carl Maria von Weber, Robert Heger","[1p6wR69pnH9LBWZvwliuz2, 1PcmDPoDizKeZDOjdWfuTD]",classical classical era early music early rom...,1943,273.907,0,0.29,0.193,7,41.991,1,0.0541,0.934,0.00268,0.219,0.147,79.494


In [None]:
tracks_transformed.to_csv('tracks_transformed.csv', index=None)