In [1]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials 
import requests
import os
import time
import numpy as np
import re
from tqdm import tqdm
from configparser import ConfigParser
 

In [2]:
sleep_min = 1
sleep_max = 3
configur = ConfigParser()
configur.read('config.ini')
client_id = configur['main']['client_id']
client_secret = configur['main']['client_secret']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) # sp is the instance of the spotipy api

In [115]:
data = pd.read_csv('/Users/ytkd/Desktop/LY_Artist_lyrics_genre_data_from_big5_mft_users_likes_final.csv')
data = data[data['lang_detect_spacy']=='en']
sample_dataset = data[:200]

In [117]:
artists = list(sample_dataset['Artist'])
titles = list(sample_dataset['title'])

</h2> Data Cleaning


In [118]:
def pre_process_text(text):
    text = text.lower()
    text = re.sub(r"[^\x00-\x7F]+'",' ', text)
    return text


In [119]:
'''
check how many artists and titles have special characters
'''
c = 0
for title in titles:
    if str(title).isascii():
        c+=1
print('The number of non ascii characters in our sample dataset titles are:- ', 200-c)

The number of non ascii characters in our sample dataset titles are:-  28


In [120]:
artists_clean = []
titles_clean = []

for i in tqdm(range(len(artists))):
    artists_clean.append(pre_process_text(artists[i]))
    titles_clean.append(pre_process_text(titles[i]))

100%|██████████| 200/200 [00:00<00:00, 143665.15it/s]


In [121]:
request_count = 0
start_time = time.time()
spotify_artists = []
check_dat = []
for i in tqdm(range(len(artists))):
    query = sp.search(f'artist:{artists_clean[i]} track:{titles_clean[i]}')
    if query['tracks']['items']:
        spotify_artists.append(query['tracks']['items'][0]['artists'][0]['name'])
        check_dat.append(query['tracks']['items'][0]['artists'][0]['name'])
    else:
        spotify_artists.append(artists[i])
        
    request_count+=1
    if request_count % 5 == 0:
        # print(str(request_count) + " requests sent")
        time.sleep(np.random.uniform(sleep_min, sleep_max))
        # print('Elapsed Time: {} seconds'.format(time.time() - start_time))
print('Completed...')

  7%|▋         | 14/200 [00:07<01:35,  1.94it/s]


KeyboardInterrupt: 

In [122]:
print(f'Out of the 200 sample data records, data for {len(check_dat)} records is available')

Out of the 200 sample data records, data for 14 records is available


In [123]:
df1 = pd.DataFrame(spotify_artists)
df2 = pd.DataFrame(artists)
df3 = pd.concat([df2,df1], axis=1, join='outer')
df3.to_csv('check.csv')

Checking if the artisrts that we extracted from spotify are same as in our dataset

In [124]:
count = 0
for i in tqdm(range(len(artists))):
    if pre_process_text(artists[i]) in pre_process_text(spotify_artists[i]):
        count+=1
count

  8%|▊         | 15/200 [00:00<00:00, 57879.08it/s]


IndexError: list index out of range

Extracting Data

In [125]:
'''
In track_preview_available function I am checking that the query returned by spotify's api is not empty and if its not I check if the preview url is available.

In is_same_artist function I check whether the artist the we found in our query is the same one from our dataset.
'''
def is_same_artist_and_title(query, artist, title, index):
    is_same_artist = pre_process_text(artist[index]) in pre_process_text(query['tracks']['items'][0]['artists'][0]['name'])
    is_same_title = pre_process_text(title[index]) in pre_process_text(query['tracks']['items'][0]['name'])
    return is_same_artist and is_same_title

    # if pre_process_text(artist[index]) in pre_process_text(query['tracks']['items'][0]['artists'][0]['name']):
    #     if pre_process_text(title[index]) in pre_process_text(query['tracks']['items'][0]['name']):
    #         return True
    #     else:
    #         return False
    # else:
    #     return False

def track_preview_available(query):
    if query['tracks']['items']:
        if query['tracks']['items'][0]['preview_url']:
            return True
        return True
        
def artist_available(query):
    if query['tracks']['items']:
        return True
    else:
        return False

def get_audio_features(track_id, chunk_size):
    audio_features_list = []

    for i in range(0, len(track_id), chunk_size):    
        track_id_list = track_id[i:i+chunk_size]
        results = sp.audio_features(track_id_list)
        results = [v for v in results if str(v) != 'None']
        audio_features_list.extend(results)

    return audio_features_list

In [126]:
sleep_min = 1
sleep_max = 3
request_count = 0
start_time = time.time()

preview_url = []
track_id = []
artist_id = []
track_popularity = []
artists_df = []
titles_df = []

for index in range(len(artists_clean)):
    # artist_name = spotify_artists[index]
    artist_name = artists_clean[index]
    song_title = titles_clean[index]
    search = f'artist:{artist_name} track:{song_title}'
    query = sp.search(search, type='track')


    # in the below line of code I am checking that the query returned by spotify's api is not empty and if its not I check if the preview url is available, and
    # even further I check a third condition whether the artist the we found in our query is the same from our dataset.


    # if query['tracks']['items'] and query['tracks']['items'][0]['preview_url'] and query['tracks']['items'][0]['artists'][0]['name'] in alpha_artists_titles[index]:
    if track_preview_available(query) and is_same_artist_and_title(query,artists,titles,index):
        preview_url.append(query['tracks']['items'][0]['preview_url'])
        track_id.append(query['tracks']['items'][0]['id'])
        artist_id.append(query['tracks']['items'][0]['artists'][0]['id'])
        track_popularity.append(query['tracks']['items'][0]['popularity'])
        artists_df.append(query['tracks']['items'][0]['artists'][0]['name'])
        titles_df.append(query['tracks']['items'][0]['name'])
        
         
    else:
        preview_url.append(None)
        track_id.append(None)
        artist_id.append(None)
        track_popularity.append(None)
        # artists_df.append(artists_clean[index])
        # titles_df.append(titles_clean[index])
        artists_df.append(query['tracks']['items'][0]['artists'][0]['name'])
        titles_df.append(query['tracks']['items'][0]['name'])

    request_count+=1
    if request_count % 5 == 0:
        print(str(request_count) + " artists uris fetched")
        time.sleep(np.random.uniform(sleep_min, sleep_max))
        print('Loop #: {}'.format(request_count))
        print('Elapsed Time: {} seconds'.format(time.time() - start_time))

5 artists uris fetched
Loop #: 5
Elapsed Time: 2.441851854324341 seconds
10 artists uris fetched
Loop #: 10
Elapsed Time: 5.0771238803863525 seconds


IndexError: list index out of range

In [None]:
track_data = pd.DataFrame(artists_df, columns=['artists'])
track_data['tracks'] = titles_df
track_data['artist_id'] = artist_id
track_data['track_id'] = track_id
track_data['track_popularity'] = track_popularity
track_data['track_url'] = preview_url

In [None]:
track_data

Unnamed: 0,artists,tracks,artist_id,track_id,track_popularity,track_url
0,*NSYNC,Bye Bye Bye,6Ff53KvcvAj5U7Z1vojB5o,62bOmKYxYg7dhrC6gH9vFn,71.0,https://p.scdn.co/mp3-preview/612fd571c0216fa3...
1,*NSYNC,It's Gonna Be Me,,,,
2,*NSYNC,Tearin' up My Heart - Radio Edit,,,,
3,*NSYNC,Gone,6Ff53KvcvAj5U7Z1vojB5o,4CCUjYJPbSXLL23BFeBVbI,52.0,https://p.scdn.co/mp3-preview/eea62070ce877996...
4,*NSYNC,"Merry Christmas, Happy Holidays",6Ff53KvcvAj5U7Z1vojB5o,4v9WbaxW8HdjqfUiWYWsII,39.0,https://p.scdn.co/mp3-preview/158f2eafaa4f03f5...
5,*NSYNC,It Makes Me Ill,6Ff53KvcvAj5U7Z1vojB5o,0gbysjaH16DW29QpUnowcx,46.0,https://p.scdn.co/mp3-preview/c00dc4528ddf44eb...
6,*NSYNC,This I Promise You,6Ff53KvcvAj5U7Z1vojB5o,46n2EGFnPC3tzWCN1Aqe26,66.0,https://p.scdn.co/mp3-preview/0b9a69ffaf5fbb52...
7,*NSYNC,Pop,6Ff53KvcvAj5U7Z1vojB5o,0Jc8qF1mUPo1A96HE9QxZz,57.0,https://p.scdn.co/mp3-preview/edcadba3c2b60c2f...
8,*NSYNC,I Want You Back - Radio Edit,6Ff53KvcvAj5U7Z1vojB5o,221LRlPHPuevgE1tuUlof9,61.0,https://p.scdn.co/mp3-preview/3cbf84331a097fd1...
9,*shels,Butterflies (on Luci's Way),,,,


Extracting Features

In [21]:
# def get_audio_features(track_id, chunk_size):
#     audio_features_list = []

#     for i in range(0, len(track_id), chunk_size):    
#         track_id_list = track_id[i:i+chunk_size]
#         if track_id[i] is not None:
#             results = sp.audio_features(track_id_list)
#             results = [v for v in results if str(v) != 'None']
#             audio_features_list.extend(results)
#         else:
#             audio_features_list.extend(None)


#     return audio_features_list

In [55]:
def get_audio_features(track_id):
    request_count = 0
    start_time = time.time()
    features_dict = {}
    features_dict['acousticness'] = []
    features_dict['danceability'] = []
    features_dict['energy'] = []
    features_dict['instrumentalness'] = []
    features_dict['liveness'] = []
    features_dict['loudness'] = []
    features_dict['speechiness'] = []
    features_dict['tempo'] = []
    features_dict['valence'] = []

    for i in range(len(track_id)):    
        if track_id[i] is not None:
            features = sp.audio_features(track_id[i])
            features_dict['acousticness'].append(features[0]['acousticness'])
            features_dict['danceability'].append(features[0]['danceability'])
            features_dict['energy'].append(features[0]['energy'])
            features_dict['instrumentalness'].append(features[0]['instrumentalness'])
            features_dict['liveness'].append(features[0]['liveness'])
            features_dict['loudness'].append(features[0]['loudness'])
            features_dict['speechiness'].append(features[0]['speechiness'])
            features_dict['tempo'].append(features[0]['tempo'])
            features_dict['valence'].append(features[0]['valence'])
        else:
            features_dict['acousticness'].append(None)
            features_dict['danceability'].append(None)
            features_dict['energy'].append(None)
            features_dict['instrumentalness'].append(None)
            features_dict['liveness'].append(None)
            features_dict['loudness'].append(None)
            features_dict['speechiness'].append(None)
            features_dict['tempo'].append(None)
            features_dict['valence'].append(None)

        request_count+=1  
        if request_count % 5 == 0:
            print(str(request_count) + " artists's features extracted")
            time.sleep(np.random.uniform(sleep_min, sleep_max))
            print('Loop #: {}'.format(request_count))
            print('Elapsed Time: {} seconds'.format(time.time() - start_time))


    return features_dict

In [56]:
features_dict = get_audio_features(track_id)

5 artists's features extracted
Loop #: 5
Elapsed Time: 2.9754562377929688 seconds
10 artists's features extracted
Loop #: 10
Elapsed Time: 5.924633026123047 seconds
15 artists's features extracted
Loop #: 15
Elapsed Time: 7.816481828689575 seconds
20 artists's features extracted
Loop #: 20
Elapsed Time: 9.872878074645996 seconds
25 artists's features extracted
Loop #: 25
Elapsed Time: 12.524617195129395 seconds
30 artists's features extracted
Loop #: 30
Elapsed Time: 14.685796976089478 seconds
35 artists's features extracted
Loop #: 35
Elapsed Time: 18.231149911880493 seconds
40 artists's features extracted
Loop #: 40
Elapsed Time: 21.643114805221558 seconds
45 artists's features extracted
Loop #: 45
Elapsed Time: 24.709187984466553 seconds
50 artists's features extracted
Loop #: 50
Elapsed Time: 27.042639017105103 seconds
55 artists's features extracted
Loop #: 55
Elapsed Time: 30.16559410095215 seconds
60 artists's features extracted
Loop #: 60
Elapsed Time: 32.64086413383484 seconds

In [57]:
temp = list(zip(features_dict['acousticness'],features_dict['danceability'],features_dict['energy'],features_dict['instrumentalness'],features_dict['liveness'],features_dict['loudness'],features_dict['speechiness'],features_dict['tempo'],features_dict['valence']))
data_features = pd.DataFrame(temp, columns=['acousticness','danceability','energy','instrumentalness','liveness','loudness','speechiness','tempo','valence'])
data_features

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,0.03100,0.610,0.926,0.00120,0.0821,-4.843,0.0479,172.638,0.861
1,,,,,,,,,
2,,,,,,,,,
3,0.43000,0.704,0.409,0.00000,0.1090,-8.581,0.0597,113.863,0.495
4,0.10400,0.643,0.939,0.00000,0.8810,-3.967,0.0463,104.999,0.756
...,...,...,...,...,...,...,...,...,...
195,0.02890,0.193,0.485,0.84700,0.0999,-7.956,0.0437,90.001,0.127
196,,,,,,,,,
197,0.10900,0.789,0.881,0.00000,0.3550,-5.758,0.0331,133.299,0.647
198,0.00103,0.895,0.412,0.00179,0.1280,-7.845,0.0645,122.223,0.183


In [61]:
final_df = pd.concat([track_data,data_features], axis=1, join='outer')
final_df

Unnamed: 0,artists,tracks,artist_id,track_id,track_popularity,track_url,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,*NSYNC,bye bye bye,6Ff53KvcvAj5U7Z1vojB5o,62bOmKYxYg7dhrC6gH9vFn,71.0,https://p.scdn.co/mp3-preview/612fd571c0216fa3...,0.03100,0.610,0.926,0.00120,0.0821,-4.843,0.0479,172.638,0.861
1,*NSYNC,it s gonna be me,,,,,,,,,,,,,
2,*NSYNC,tearin up my heart,,,,,,,,,,,,,
3,*NSYNC,gone,6Ff53KvcvAj5U7Z1vojB5o,4CCUjYJPbSXLL23BFeBVbI,52.0,https://p.scdn.co/mp3-preview/eea62070ce877996...,0.43000,0.704,0.409,0.00000,0.1090,-8.581,0.0597,113.863,0.495
4,*NSYNC,"merry christmas, happy holidays",6Ff53KvcvAj5U7Z1vojB5o,4v9WbaxW8HdjqfUiWYWsII,39.0,https://p.scdn.co/mp3-preview/158f2eafaa4f03f5...,0.10400,0.643,0.939,0.00000,0.8810,-3.967,0.0463,104.999,0.756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,65daysofstatic,heat death infinity splitter,6DVVsQAnpHdJjb1nYuOQ6g,3PibgbUWMGaYZObGpvGhmJ,26.0,https://p.scdn.co/mp3-preview/2799b9f6b7e4fe22...,0.02890,0.193,0.485,0.84700,0.0999,-7.956,0.0437,90.001,0.127
196,Three 6 Mafia,sippin on some syrup,,,,,,,,,,,,,
197,Three 6 Mafia,stay fly,26s8LSolLfCIY88ysQbIuT,5MYFw4T2gy52pOGBN4EYHS,69.0,https://p.scdn.co/mp3-preview/56db43014efeb814...,0.10900,0.789,0.881,0.00000,0.3550,-5.758,0.0331,133.299,0.647
198,Three 6 Mafia,late nite tip,26s8LSolLfCIY88ysQbIuT,75RK78POyFmg3u6O1cpBdr,51.0,https://p.scdn.co/mp3-preview/3e7e1d015ddb7feb...,0.00103,0.895,0.412,0.00179,0.1280,-7.845,0.0645,122.223,0.183


In [73]:
# Downloading these songs according to track id
final_df.set_index('track_id',inplace=True)
final_df.to_csv('track_data.csv')

In [74]:
final_df

Unnamed: 0_level_0,artists,tracks,artist_id,track_popularity,track_url,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
62bOmKYxYg7dhrC6gH9vFn,*NSYNC,bye bye bye,6Ff53KvcvAj5U7Z1vojB5o,71.0,https://p.scdn.co/mp3-preview/612fd571c0216fa3...,0.03100,0.610,0.926,0.00120,0.0821,-4.843,0.0479,172.638,0.861
,*NSYNC,it s gonna be me,,,,,,,,,,,,
,*NSYNC,tearin up my heart,,,,,,,,,,,,
4CCUjYJPbSXLL23BFeBVbI,*NSYNC,gone,6Ff53KvcvAj5U7Z1vojB5o,52.0,https://p.scdn.co/mp3-preview/eea62070ce877996...,0.43000,0.704,0.409,0.00000,0.1090,-8.581,0.0597,113.863,0.495
4v9WbaxW8HdjqfUiWYWsII,*NSYNC,"merry christmas, happy holidays",6Ff53KvcvAj5U7Z1vojB5o,39.0,https://p.scdn.co/mp3-preview/158f2eafaa4f03f5...,0.10400,0.643,0.939,0.00000,0.8810,-3.967,0.0463,104.999,0.756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3PibgbUWMGaYZObGpvGhmJ,65daysofstatic,heat death infinity splitter,6DVVsQAnpHdJjb1nYuOQ6g,26.0,https://p.scdn.co/mp3-preview/2799b9f6b7e4fe22...,0.02890,0.193,0.485,0.84700,0.0999,-7.956,0.0437,90.001,0.127
,Three 6 Mafia,sippin on some syrup,,,,,,,,,,,,
5MYFw4T2gy52pOGBN4EYHS,Three 6 Mafia,stay fly,26s8LSolLfCIY88ysQbIuT,69.0,https://p.scdn.co/mp3-preview/56db43014efeb814...,0.10900,0.789,0.881,0.00000,0.3550,-5.758,0.0331,133.299,0.647
75RK78POyFmg3u6O1cpBdr,Three 6 Mafia,late nite tip,26s8LSolLfCIY88ysQbIuT,51.0,https://p.scdn.co/mp3-preview/3e7e1d015ddb7feb...,0.00103,0.895,0.412,0.00179,0.1280,-7.845,0.0645,122.223,0.183


In [71]:
if os.path.exists('/Users/ytkd/Desktop/downloaded_songs') is False:
    os.mkdir('/Users/ytkd/Desktop/downloaded_songs')

audio_path = '/Users/ytkd/Desktop/downloaded_songs'
for i,url in enumerate(preview_url):
    if url is not None:
        response = requests.get(url, verify=False)
        if os.path.exists(f'{audio_path}/{track_id[i][:1]}') is False:
            os.mkdir(f'{audio_path}/{track_id[i][:1]}')
        open(f"{os.path.join(audio_path,track_id[i][:1],track_id[i]+'.mp3')}", 'wb').write(response.content)  






In [72]:
count = 0
for id in preview_url:
    if id is None:
        count = count + 1
count
print(f'Out of 200 sample records the preview url is available for only {int(200-count)} songs')

Out of 200 sample records the preview url is available for only 91 songs


In [204]:
x = sp.search('artist: 5 Seconds of Summer track: she looks so perfect',type='track')
x['tracks']['items']

[{'album': {'album_type': 'album',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/5Rl15oVamLq7FbSb0NNBNy'},
     'href': 'https://api.spotify.com/v1/artists/5Rl15oVamLq7FbSb0NNBNy',
     'id': '5Rl15oVamLq7FbSb0NNBNy',
     'name': '5 Seconds of Summer',
     'type': 'artist',
     'uri': 'spotify:artist:5Rl15oVamLq7FbSb0NNBNy'}],
   'available_markets': ['CA', 'MX', 'US'],
   'external_urls': {'spotify': 'https://open.spotify.com/album/2LkWHNNHgD6BRNeZI2SL1L'},
   'href': 'https://api.spotify.com/v1/albums/2LkWHNNHgD6BRNeZI2SL1L',
   'id': '2LkWHNNHgD6BRNeZI2SL1L',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/ab67616d0000b27393432e914046a003229378da',
     'width': 640},
    {'height': 300,
     'url': 'https://i.scdn.co/image/ab67616d00001e0293432e914046a003229378da',
     'width': 300},
    {'height': 64,
     'url': 'https://i.scdn.co/image/ab67616d0000485193432e914046a003229378da',
     'width': 64}],
   'name': '5 Seconds Of Su

In [231]:
test = sp.search('artist:Coldplay track:Fix You', type='track')
test['tracks']['href']

'https://api.spotify.com/v1/search?query=artist%3AColdplay+track%3AFix+You&type=track&offset=0&limit=10'

In [224]:
import json
with open('test.json', 'w') as convert_file:
     convert_file.write(json.dumps(test))

In [None]:
# create a histigram and check file names 