In [23]:
import json
import os
import re
import time
from configparser import ConfigParser

import numpy as np
import pandas as pd
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm
 

In [24]:
# Lower/upper bound, in seconds, of the random pause taken between batches of API calls.
sleep_min, sleep_max = 1, 3

# Read the Spotify credentials from the [main] section of config.ini.
configur = ConfigParser()
configur.read('config.ini')
client_id, client_secret = (
    configur['main'][key] for key in ('client_id', 'client_secret')
)

# sp is the instance of the spotipy api.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [25]:
# Load the export and keep only the rows whose detected language is English.
data = pd.read_csv(
    '/Users/ytkd/Desktop/LY_Artist_lyrics_genre_data_from_big5_mft_users_likes_final.csv'
).loc[lambda frame: frame['lang_detect_spacy'] == 'en']

# Work on the first 200 remaining rows.
sample_dataset = data.iloc[:200]

In [26]:
# Plain Python lists of the artist and title columns, in sample order.
artists, titles = (list(sample_dataset[column]) for column in ('Artist', 'title'))

<h2>Data Cleaning</h2>


In [27]:
def pre_process_text(text):
    """Return `text` lower-cased, with each run of non-ASCII characters replaced by one space."""
    lowered = text.lower()
    return re.sub(r'[^\x00-\x7F]+', ' ', lowered)


In [28]:
# Check how many titles contain special (non-ASCII) characters.
# The count is taken directly rather than as `200 - ascii_count`, so it stays
# correct when the sample holds fewer than 200 rows.
# str() guards against non-string entries in the title column.
non_ascii_titles = sum(not str(title).isascii() for title in titles)
print('The number of non ascii characters in our sample dataset titles are:- ', non_ascii_titles)

The number of non ascii characters in our sample dataset titles are:-  28


In [29]:
# Run every artist/title pair through pre_process_text, keeping sample order.
artists_clean, titles_clean = [], []

for raw_artist, raw_title in tqdm(zip(artists, titles), total=len(artists)):
    artists_clean.append(pre_process_text(raw_artist))
    titles_clean.append(pre_process_text(raw_title))

100%|██████████| 200/200 [00:00<00:00, 114739.54it/s]


In [30]:
request_count = 0
start_time = time.time()
spotify_artists = []  # one name per sample row: Spotify's spelling, or the dataset's when nothing is found
check_dat = []        # only the names that actually came back from Spotify

for position, dataset_artist in enumerate(tqdm(artists)):
    query = sp.search(f'artist:{artists_clean[position]} track:{titles_clean[position]}')
    hits = query['tracks']['items']
    if hits:
        # Take the primary artist of the first hit.
        found_name = hits[0]['artists'][0]['name']
        spotify_artists.append(found_name)
        check_dat.append(found_name)
    else:
        spotify_artists.append(dataset_artist)

    # Pause for a random interval after every fifth request.
    request_count += 1
    if request_count % 5 == 0:
        time.sleep(np.random.uniform(sleep_min, sleep_max))
print('Completed...')

100%|██████████| 200/200 [02:05<00:00,  1.59it/s]

Completed...





In [31]:
# Report against the actual sample size instead of a hard-coded 200.
print(f'Out of the {len(artists)} sample data records, data for {len(check_dat)} records is available')

Out of the 200 sample data records, data for 185 records is available


In [32]:
# Write dataset artists (first column) next to Spotify artists (second column) to check.csv.
artist_comparison = pd.concat(
    [pd.DataFrame(artists), pd.DataFrame(spotify_artists)],
    axis=1,
    join='outer',
)
artist_comparison.to_csv('check.csv')

Checking whether the artists that we extracted from Spotify are the same as the ones in our dataset

In [33]:
# Number of rows whose cleaned dataset artist is a substring of the cleaned Spotify artist.
count = sum(
    pre_process_text(artists[row]) in pre_process_text(spotify_artists[row])
    for row in tqdm(range(len(artists)))
)
count

100%|██████████| 200/200 [00:00<00:00, 154914.28it/s]


199

'Fix You'

Extracting Data

In [34]:
'''
The track_preview_available function checks that the query returned by Spotify's API is not empty and, if it is not, whether the preview URL is available.

The is_same_artist_and_title function checks whether the artist and title found by the query are the same ones as in our dataset.
'''
def is_same_artist_and_title(query, artist, title, index):
    """Tell whether the first search hit matches the dataset's artist and title.

    Both comparisons are substring checks on text normalised with
    pre_process_text.

    Args:
        query: Search response shaped like ``{'tracks': {'items': [...]}}``.
        artist: Sequence of dataset artist names.
        title: Sequence of dataset track titles.
        index: Position of the row to compare in `artist` and `title`.

    Returns:
        True when the dataset artist is contained in the first hit's primary
        artist name and the dataset title is contained in the first hit's
        track name; False otherwise, including when there are no hits.
    """
    items = query['tracks']['items']
    if not items:
        # Nothing to compare against; previously this raised IndexError.
        return False

    top_hit = items[0]
    # `and` keeps the original short-circuit: the title is only examined
    # once the artist has matched.
    return (
        pre_process_text(artist[index]) in pre_process_text(top_hit['artists'][0]['name'])
        and pre_process_text(title[index]) in pre_process_text(top_hit['name'])
    )

def track_preview_available(query):
    """Tell whether the first search hit carries a preview URL.

    Args:
        query: Search response shaped like ``{'tracks': {'items': [...]}}``.

    Returns:
        True only when there is at least one hit and its 'preview_url' is
        non-empty; False otherwise.
    """
    items = query['tracks']['items']
    # The earlier version fell through to an unconditional `return True`, so
    # hits without a preview URL were reported as available.
    return bool(items and items[0].get('preview_url'))
        
def artist_available(query):
    """Tell whether the search returned at least one track.

    Args:
        query: Search response shaped like ``{'tracks': {'items': [...]}}``.

    Returns:
        True when the item list is non-empty, False otherwise (previously an
        implicit None).
    """
    return bool(query['tracks']['items'])

def is_same_track(query, data , index):
    # Unfinished placeholder: none of the arguments are inspected and the
    # function unconditionally returns True. No call to it appears in the
    # visible notebook cells.
        return True
    # track name
    # NOTE(review): the comment above is presumably a reminder to compare track
    # names here - TODO confirm intent or remove the stub.

In [236]:
# Ad-hoc lookup: run a free-text track search and display the first hit's raw record.
# Note that this rebinds the global `query`.
query = sp.search('coldplay fix you', type='track')
query['tracks']['items'][0]

{'album': {'album_type': 'album',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4gzpq5DPGxSnKTe4SA8HAU'},
    'href': 'https://api.spotify.com/v1/artists/4gzpq5DPGxSnKTe4SA8HAU',
    'id': '4gzpq5DPGxSnKTe4SA8HAU',
    'name': 'Coldplay',
    'type': 'artist',
    'uri': 'spotify:artist:4gzpq5DPGxSnKTe4SA8HAU'}],
  'available_markets': ['AD',
   'AE',
   'AG',
   'AL',
   'AM',
   'AO',
   'AR',
   'AT',
   'AU',
   'AZ',
   'BA',
   'BB',
   'BD',
   'BE',
   'BF',
   'BG',
   'BH',
   'BI',
   'BJ',
   'BN',
   'BO',
   'BR',
   'BS',
   'BT',
   'BW',
   'BY',
   'BZ',
   'CA',
   'CD',
   'CG',
   'CH',
   'CI',
   'CL',
   'CM',
   'CO',
   'CR',
   'CV',
   'CW',
   'CY',
   'CZ',
   'DE',
   'DJ',
   'DK',
   'DM',
   'DO',
   'DZ',
   'EC',
   'EE',
   'EG',
   'ES',
   'FI',
   'FJ',
   'FM',
   'FR',
   'GA',
   'GB',
   'GD',
   'GE',
   'GH',
   'GM',
   'GN',
   'GQ',
   'GR',
   'GT',
   'GW',
   'GY',
   'HK',
   'HN',
   'HR',
   'HT',
   

In [35]:
# Throttle bounds (seconds) and bookkeeping for this extraction pass.
sleep_min, sleep_max = 1, 3
request_count = 0
start_time = time.time()

# Parallel result lists: each gains exactly one entry per row of spotify_artists.
preview_url, track_id, artist_id, track_popularity = [], [], [], []
artists_df, titles_df = [], []

for index, artist_name in enumerate(spotify_artists):
    song_title = titles_clean[index]
    query = sp.search(f'artist:{artist_name} track:{song_title}', type='track')

    # Take the Spotify fields from the first hit only when both helper checks
    # pass; otherwise store None placeholders so the lists stay aligned.
    if track_preview_available(query) and is_same_artist_and_title(query, artists, titles, index):
        top_hit = query['tracks']['items'][0]
        spotify_fields = (
            top_hit['preview_url'],
            top_hit['id'],
            top_hit['artists'][0]['id'],
            top_hit['popularity'],
        )
    else:
        spotify_fields = (None, None, None, None)

    for target, value in zip((preview_url, track_id, artist_id, track_popularity), spotify_fields):
        target.append(value)
    artists_df.append(artist_name)
    titles_df.append(song_title)

    # Report progress and pause for a random interval after every fifth request.
    request_count += 1
    if request_count % 5 == 0:
        print(str(request_count) + " artists uris fetched")
        time.sleep(np.random.uniform(sleep_min, sleep_max))
        print('Loop #: {}'.format(request_count))
        print('Elapsed Time: {} seconds'.format(time.time() - start_time))

5 artists uris fetched
Loop #: 5
Elapsed Time: 4.708026885986328 seconds
10 artists uris fetched
Loop #: 10
Elapsed Time: 6.846842050552368 seconds
15 artists uris fetched
Loop #: 15
Elapsed Time: 9.166652917861938 seconds
20 artists uris fetched
Loop #: 20
Elapsed Time: 12.867906093597412 seconds
25 artists uris fetched
Loop #: 25
Elapsed Time: 15.142414093017578 seconds
30 artists uris fetched
Loop #: 30
Elapsed Time: 17.409158945083618 seconds
35 artists uris fetched
Loop #: 35
Elapsed Time: 19.910676956176758 seconds
40 artists uris fetched
Loop #: 40
Elapsed Time: 23.548586130142212 seconds
45 artists uris fetched
Loop #: 45
Elapsed Time: 26.40551495552063 seconds
50 artists uris fetched
Loop #: 50
Elapsed Time: 29.777029275894165 seconds
55 artists uris fetched
Loop #: 55
Elapsed Time: 32.3474600315094 seconds
60 artists uris fetched
Loop #: 60
Elapsed Time: 34.96685791015625 seconds
65 artists uris fetched
Loop #: 65
Elapsed Time: 38.19969201087952 seconds
70 artists uris fetche

In [36]:
# Assemble the parallel result lists into one table, one row per sample record.
track_data = pd.DataFrame(
    {
        'artists': artists_df,
        'tracks': titles_df,
        'artist_id': artist_id,
        'track_id': track_id,
        'track_popularity': track_popularity,
        'track_url': preview_url,
    }
)

In [37]:
# Show the assembled per-track table.
track_data

Unnamed: 0,artists,tracks,artist_id,track_id,track_popularity,track_url
0,*NSYNC,bye bye bye,6Ff53KvcvAj5U7Z1vojB5o,62bOmKYxYg7dhrC6gH9vFn,71.0,https://p.scdn.co/mp3-preview/612fd571c0216fa3...
1,*NSYNC,it s gonna be me,,,,
2,*NSYNC,tearin up my heart,,,,
3,*NSYNC,gone,6Ff53KvcvAj5U7Z1vojB5o,4CCUjYJPbSXLL23BFeBVbI,52.0,https://p.scdn.co/mp3-preview/eea62070ce877996...
4,*NSYNC,"merry christmas, happy holidays",6Ff53KvcvAj5U7Z1vojB5o,4v9WbaxW8HdjqfUiWYWsII,39.0,https://p.scdn.co/mp3-preview/158f2eafaa4f03f5...
...,...,...,...,...,...,...
195,65daysofstatic,heat death infinity splitter,6DVVsQAnpHdJjb1nYuOQ6g,3PibgbUWMGaYZObGpvGhmJ,26.0,https://p.scdn.co/mp3-preview/2799b9f6b7e4fe22...
196,Three 6 Mafia,sippin on some syrup,,,,
197,Three 6 Mafia,stay fly,26s8LSolLfCIY88ysQbIuT,5MYFw4T2gy52pOGBN4EYHS,68.0,https://p.scdn.co/mp3-preview/56db43014efeb814...
198,Three 6 Mafia,late nite tip,26s8LSolLfCIY88ysQbIuT,75RK78POyFmg3u6O1cpBdr,51.0,https://p.scdn.co/mp3-preview/3e7e1d015ddb7feb...


In [38]:
# Downloading these songs according to track id
# Rebind instead of using inplace=True, and only when track_id is still a
# column, so re-running this cell no longer raises KeyError.
if 'track_id' in track_data.columns:
    track_data = track_data.set_index('track_id')
track_data.to_csv('track_data.csv')

In [44]:
# Show the table again; an earlier cell makes track_id its index.
track_data

Unnamed: 0_level_0,artists,tracks,artist_id,track_popularity,track_url
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
62bOmKYxYg7dhrC6gH9vFn,*NSYNC,bye bye bye,6Ff53KvcvAj5U7Z1vojB5o,71.0,https://p.scdn.co/mp3-preview/612fd571c0216fa3...
,*NSYNC,it s gonna be me,,,
,*NSYNC,tearin up my heart,,,
4CCUjYJPbSXLL23BFeBVbI,*NSYNC,gone,6Ff53KvcvAj5U7Z1vojB5o,52.0,https://p.scdn.co/mp3-preview/eea62070ce877996...
4v9WbaxW8HdjqfUiWYWsII,*NSYNC,"merry christmas, happy holidays",6Ff53KvcvAj5U7Z1vojB5o,39.0,https://p.scdn.co/mp3-preview/158f2eafaa4f03f5...
...,...,...,...,...,...
3PibgbUWMGaYZObGpvGhmJ,65daysofstatic,heat death infinity splitter,6DVVsQAnpHdJjb1nYuOQ6g,26.0,https://p.scdn.co/mp3-preview/2799b9f6b7e4fe22...
,Three 6 Mafia,sippin on some syrup,,,
5MYFw4T2gy52pOGBN4EYHS,Three 6 Mafia,stay fly,26s8LSolLfCIY88ysQbIuT,68.0,https://p.scdn.co/mp3-preview/56db43014efeb814...
75RK78POyFmg3u6O1cpBdr,Three 6 Mafia,late nite tip,26s8LSolLfCIY88ysQbIuT,51.0,https://p.scdn.co/mp3-preview/3e7e1d015ddb7feb...


In [43]:
# Number of rows for which a preview URL was stored.
c = sum(1 for url in preview_url if url is not None)
c

91

In [None]:
# response = requests.get(query['tracks']['items'][0]['preview_url'], verify=False) # using track uris
# audio_fn = os.path.join(audio_path,*track_id[:2],track_id+'.mp3') 
# open(f"downloaded_songs/{artists_10[i]}/{top_10_track_names_clean[song]}.mp3", 'wb').write(response.content)  

In [166]:
# Root directory for downloaded preview files.
# The commented-out lines below sketch the per-track file layout:
# <audio_path>/<first character of track id>/<track id>.mp3
audio_path = '/Users/ytkd/Desktop/downloaded_songs'
# trac = "62bOmKYxYg7dhrC6gH9vFn"
# x = os.path.join(audio_path,trac[:1],trac+'.mp3')
# x

'/Users/ytkd/Desktop/downloaded_songs/6/62bOmKYxYg7dhrC6gH9vFn.mp3'

In [177]:
# Download every available preview to <audio_path>/<first char of track id>/<track id>.mp3.
audio_path = '/Users/ytkd/Desktop/downloaded_songs'
# exist_ok=True lets the cell be re-run; os.mkdir raised FileExistsError the second time.
os.makedirs(audio_path, exist_ok=True)

for i, url in enumerate(preview_url):
    if url is None:
        continue

    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve existing behaviour - confirm whether it is still needed.
    response = requests.get(url, verify=False)
    if not response.ok:
        # Skip failed downloads rather than saving an error page as an .mp3.
        print(f'Skipping {track_id[i]}: HTTP {response.status_code}')
        continue

    # Files are sharded into sub-directories named after the first character of the track id.
    shard_dir = os.path.join(audio_path, track_id[i][:1])
    os.makedirs(shard_dir, exist_ok=True)

    # The context manager closes the file handle, which the bare open(...).write(...) did not.
    with open(os.path.join(shard_dir, track_id[i] + '.mp3'), 'wb') as audio_file:
        audio_file.write(response.content)






In [203]:
# Count the rows without a preview URL. The loop variable is no longer called
# `id`, which overwrote the builtin id() at notebook scope.
count = sum(url is None for url in preview_url)
# Use the real number of records instead of a hard-coded 200.
total_records = len(preview_url)
print(f'Out of {total_records} sample records the preview url is available for only {total_records - count} songs')

Out of 200 sample records the preview url is available for only 109 songs


In [204]:
# Ad-hoc lookup: search one artist/track pair and display the full list of returned items.
x = sp.search('artist: 5 Seconds of Summer track: she looks so perfect',type='track')
x['tracks']['items']

[{'album': {'album_type': 'album',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/5Rl15oVamLq7FbSb0NNBNy'},
     'href': 'https://api.spotify.com/v1/artists/5Rl15oVamLq7FbSb0NNBNy',
     'id': '5Rl15oVamLq7FbSb0NNBNy',
     'name': '5 Seconds of Summer',
     'type': 'artist',
     'uri': 'spotify:artist:5Rl15oVamLq7FbSb0NNBNy'}],
   'available_markets': ['CA', 'MX', 'US'],
   'external_urls': {'spotify': 'https://open.spotify.com/album/2LkWHNNHgD6BRNeZI2SL1L'},
   'href': 'https://api.spotify.com/v1/albums/2LkWHNNHgD6BRNeZI2SL1L',
   'id': '2LkWHNNHgD6BRNeZI2SL1L',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/ab67616d0000b27393432e914046a003229378da',
     'width': 640},
    {'height': 300,
     'url': 'https://i.scdn.co/image/ab67616d00001e0293432e914046a003229378da',
     'width': 300},
    {'height': 64,
     'url': 'https://i.scdn.co/image/ab67616d0000485193432e914046a003229378da',
     'width': 64}],
   'name': '5 Seconds Of Su

In [231]:
# Ad-hoc lookup: display the 'href' field of the tracks section of the search response.
test = sp.search('artist:Coldplay track:Fix You', type='track')
test['tracks']['href']

'https://api.spotify.com/v1/search?query=artist%3AColdplay+track%3AFix+You&type=track&offset=0&limit=10'

In [224]:
# Save the search response held in `test` to test.json.
# `json` is imported in the notebook's top import cell.
with open('test.json', 'w') as convert_file:
    json.dump(test, convert_file)

In [None]:
# create a histogram and check file names