# Getting The Dataset 

In [2]:
import numpy as np
import pandas as pd

In [2]:
import os
import json
directory = "/home/jovyan/Spotify/data"
# Get a list of all JSON files in the directory
files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.json')]

In [3]:
len(files)

1000

In [4]:
spotify_data = []

for file in files:
    # Open each file
    with open(file, 'r') as f:
        # Load the data from the file and append it to the list
        spotify_data.append(json.load(f))

In [5]:
import pandas as pd
spotify_df= pd.DataFrame(spotify_data)

In [5]:
spotify_df.to_csv('spotify_df.csv', index=False)

# Create DataFrame For Playlists

In [6]:
playlists = []
# Loop through each file's data
for file_data in spotify_data:
    # Append each playlist to the list
    playlists.extend(file_data['playlists'])

In [7]:
playlists_df = pd.DataFrame(playlists)

In [8]:
playlists_df.to_csv('playlists_df.csv', index=False)

# Create DataFrame For Tracks & Artists

In [13]:
all_tracks = []
for _, row in playlists_df.iterrows():
    for track in row['tracks']:
        # Add playlist information to each track
        track['playlist_name'] = row['name']# add this columns correspond to playlists name in playlists df
        
        track['playlist_pid'] = row['pid']
        all_tracks.append(track)

# Create a DataFrame from the extracted track details
tracks_df = pd.DataFrame(all_tracks)


In [10]:
tracks_df.to_csv('tracks_df.csv', index=False)

In [None]:
# Create The Interaction Dataframe

In [14]:
playlist_tracks = []
for playlist in playlists:
    pid = playlist['pid']
    for track in playlist['tracks']:
        playlist_tracks.append([pid, track['track_uri'], 1])  # Adding implicit feedback rating of 1

In [15]:
interaction_df = pd.DataFrame(playlist_tracks, columns=['playlist_id', 'track_uri', 'rating'])

In [16]:
print(interaction_df.shape)
interaction_df.head()

(66346428, 3)


Unnamed: 0,playlist_id,track_uri,rating
0,981000,spotify:track:7gKIt3rDGIMJDFVSPBnGmj,1
1,981000,spotify:track:2eAAEa8pxKF7My0EO4rFgR,1
2,981000,spotify:track:5rnFOEEIBIWZ6dhHrY6zHh,1
3,981000,spotify:track:7fwXWKdDNI5IutOMc5OKYw,1
4,981000,spotify:track:19yIQRLAYMNxmEfdnnQDsS,1


In [22]:
interaction_df.to_csv('interaction_df.csv', index=False)

In [19]:
#store unique pis and track_uri
playlists = list(np.sort(interaction_df['playlist_id'].unique()))
tracks = list(np.sort(interaction_df['track_uri'].unique()))

In [20]:
# Map IDs to integer indices
playlist_to_idx = {playlist: i for i, playlist in enumerate(playlists)}
track_to_idx = {track: i for i, track in enumerate(tracks)}

# Map the interaction DataFrame values to integer indices
interaction_df['playlist_idx'] = interaction_df['playlist_id'].map(playlist_to_idx)
interaction_df['track_idx'] = interaction_df['track_uri'].map(track_to_idx)

# Getting the Kaggle Dataset

In [11]:
kaggle_data = pd.read_csv('kaggle_data.csv')

In [9]:
kaggle_data.shape

(1159764, 20)

In [8]:
kaggle_data.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


## Merge track_df and kaggle_data

In [7]:
tracks_df = pd.read_csv('tracks_df.csv')

In [6]:
tracks_df['track_id']= tracks_df['track_uri'].str.replace('spotify:track:', '')
tracks_df.head()

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,playlist_name,playlist_pid,track_id
0,0,Nicki Minaj,spotify:track:7gKIt3rDGIMJDFVSPBnGmj,spotify:artist:0hCNtLu0JehylgoiP8L4Gh,Super Bass,spotify:album:2RfF6dGpYIN5u1mNkfG8Pb,200013,Pink Friday,life,981000,7gKIt3rDGIMJDFVSPBnGmj
1,1,Dua Lipa,spotify:track:2eAAEa8pxKF7My0EO4rFgR,spotify:artist:6M2wZ9GZgrQXHCFfjv46we,Blow Your Mind (Mwah),spotify:album:01sfgrNbnnPUEyz6GZYlt9,178583,Dua Lipa,life,981000,2eAAEa8pxKF7My0EO4rFgR
2,2,ILoveMakonnen,spotify:track:5rnFOEEIBIWZ6dhHrY6zHh,spotify:artist:3aGFCoR8xGN6DKwvdzeSja,Love (feat. Rae Sremmurd),spotify:album:44sGCAuUKSGPbF6fSXO2vZ,283193,Love (feat. Rae Sremmurd),life,981000,5rnFOEEIBIWZ6dhHrY6zHh
3,3,J Balvin,spotify:track:7fwXWKdDNI5IutOMc5OKYw,spotify:artist:1vyhD5VmyZ7KMfW5gqLgo5,Mi Gente (feat. Beyoncé),spotify:album:0ARVq1kA5eRP4F5VsZsr3m,209733,Mi Gente (feat. Beyoncé),life,981000,7fwXWKdDNI5IutOMc5OKYw
4,4,Era Istrefi,spotify:track:19yIQRLAYMNxmEfdnnQDsS,spotify:artist:4poL7YCSkG7kMnWjAdDU91,No I Love Yous,spotify:album:6z4lsw2W6YwkA3CNDhGP42,180012,No I Love Yous,life,981000,19yIQRLAYMNxmEfdnnQDsS


In [8]:
tracks_df.shape

(66346428, 11)

In [19]:
# Merege
merged_df = pd.merge(tracks_df, kaggle_data, on='track_id', how='inner').drop_duplicates(subset='track_id')

In [20]:
merged_df.shape

(188298, 30)

In [21]:
merged_df

Unnamed: 0,pos,artist_name_x,track_uri,artist_uri,track_name_x,album_uri,duration_ms_x,album_name,playlist_name,playlist_pid,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms_y,time_signature
0,5,Era Istrefi,spotify:track:2tBFf1pd3PyYXVLJ5QO3de,spotify:artist:4poL7YCSkG7kMnWjAdDU91,Bonbon - English Version,spotify:album:7sMM9mV7CxSNYf1AqjxEgT,167171,Bonbon,life,981000,...,-4.133,0,0.1100,0.077400,0.000666,0.1180,0.699,94.962,167171,4
1,9,Louis Tomlinson,spotify:track:7F9vK8hNFMml4GtHsaXui6,spotify:artist:57WHJIHrjOE3iAxpihhMnp,Back to You (feat. Bebe Rexha & Digital Farm A...,spotify:album:4sBgGazGb7S9ZUQJu2Y0qa,190427,Back to You (feat. Bebe Rexha & Digital Farm A...,life,981000,...,-4.918,0,0.1420,0.207000,0.000000,0.3940,0.645,75.016,190428,4
2,12,Khalid,spotify:track:248OFOZef6ShXv6DGgbnxU,spotify:artist:6LuN9FCkKOj5PcnpouEgny,Saved,spotify:album:6kf46HbnYCZzP6rjvQHYzg,206533,American Teen,life,981000,...,-10.280,0,0.1380,0.189000,0.000000,0.1180,0.553,81.044,206533,4
3,21,SZA,spotify:track:0q75NwOoFiARAVp4EXU4Bs,spotify:artist:7tYKF4w9nC0nq9CsPZTHyP,Love Galore,spotify:album:76290XdXVF9rPzGdNRWdCh,275080,Ctrl,life,981000,...,-6.200,1,0.0748,0.112000,0.000000,0.1620,0.409,135.002,275080,4
4,22,Maggie Lindemann,spotify:track:1NDxZ7cFAo481dtYWdrUnR,spotify:artist:0uGk2czvcpWQA383Im6ajf,Pretty Girl - Cheat Codes X CADE Remix,spotify:album:1XYA8eDvomdYTbQBzk0jT1,193613,Pretty Girl,life,981000,...,-4.661,0,0.0291,0.150000,0.132000,0.1040,0.733,121.030,193613,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19846424,69,Hoth,spotify:track:78oeyFJcN3PGfhDYn154KD,spotify:artist:6AZvcFijP8NR8dhGjJRx1U,The Unholy Conception,spotify:album:0mjYHlEGn0G5atFXIrDe2n,442160,Oathbreaker,metal,930808,...,-9.262,0,0.0433,0.000079,0.791000,0.1040,0.195,98.767,442160,3
19846829,86,Insan3lik3,spotify:track:4tKvPpe5IRu8GdwMtwokfu,spotify:artist:0ACEzksK2oPw72QtDMCUVV,Go Ballistic,spotify:album:1ppqWOIAnfR8jpMNSldsJg,368483,Go Ballistic,edm,930821,...,-10.220,0,0.0608,0.012700,0.828000,0.0927,0.318,127.995,368483,4
19849729,40,Project 86,spotify:track:11ZZgXmPo8dR2WI4VZeR4P,spotify:artist:7toVzxZQU21OjB5PqXNvTF,Know What It Means,spotify:album:6A534gDjI3kQfU5hlC097a,254933,Truthless Heroes,Nostalgia,930965,...,-5.033,1,0.0666,0.000158,0.002010,0.1060,0.188,182.567,254933,3
19849734,47,Project 86,spotify:track:0NciXpcSyzMAa3CTNVVuh1,spotify:artist:7toVzxZQU21OjB5PqXNvTF,Another Boredom Movement,spotify:album:6A534gDjI3kQfU5hlC097a,234160,Truthless Heroes,Nostalgia,930965,...,-4.314,1,0.0493,0.000071,0.638000,0.3840,0.267,119.676,234160,4


In [26]:
kaggle_data['track_id'].unique().shape

(1159764,)

In [None]:
merged_df.to_csv('merged_csv', index=False)


# Getting The Musicbrainz Artist Tar File

In [17]:
import json

artist_json_path = "/home/jovyan/Spotify/musicbrainz/mbdump/artist" 

artists_data = []
with open(artist_json_path, 'r', encoding='utf-8') as f:
    for line in f:
        artists_data.append(json.loads(line))


In [18]:
artist_df = pd.DataFrame(artists_data)

In [None]:
artist_df.to_csv('artist_df.csv', index = False)

In [3]:
artist_df = pd.read_csv('artist_df.csv')

In [4]:
artist_df.shape

(2425588, 21)

In [42]:
len(artist_df['name'].unique())

2195132

In [28]:
artist_df.isnull().sum()

annotation        2406905
begin-area        2024378
ipis                    0
disambiguation    1629042
type-id            535150
rating                  0
sort-name              36
aliases                 0
area              1236934
life-span               0
name                   30
id                      0
type               535150
country           1496627
end-area          2361069
gender            1389796
isnis                   0
gender-id         1389796
relations               0
genres                  0
tags                    0
dtype: int64

In [44]:
artist_df[:5]

Unnamed: 0,annotation,begin-area,ipis,disambiguation,type-id,rating,sort-name,aliases,area,life-span,...,artist_uri,type,country,end-area,gender,isnis,gender-id,relations,genres,tags
0,,"{'type-id': None, 'name': 'San Pedro', 'id': '...",[],early 80s US hardcore punk band,e431f5f6-b5d2-343d-8b36-72607fffb74b,"{'value': 4, 'votes-count': 1}",Minutemen,"[{'name': 'The Minutemen', 'primary': None, 'b...","{'type-id': None, 'iso-3166-1-codes': ['US'], ...","{'end': '1985-12-22', 'ended': True, 'begin': ...",...,d4ad0149-d8ae-4105-8009-0221fce9ff35,Group,US,,,[],,"[{'attribute-ids': {}, 'source-credit': '', 'e...","[{'disambiguation': '', 'id': '00055e8b-b951-4...","[{'count': 1, 'name': 'acoustic rock'}, {'coun..."
1,,,[],,e431f5f6-b5d2-343d-8b36-72607fffb74b,"{'votes-count': 0, 'value': None}",Burunduk Kvartet,"[{'primary': None, 'name': 'Burunduk Quartet',...","{'type-id': None, 'iso-3166-1-codes': ['RU'], ...","{'ended': False, 'begin': '1997', 'end': None}",...,76b84628-bb79-4589-ae7c-91e1d886fc3c,Group,RU,,,[],,"[{'type': 'remixer', 'direction': 'forward', '...",[],[]
2,,"{'type-id': None, 'sort-name': 'Kingston', 'na...",[],dancehall deejay,b6e035f4-3ce9-331c-97df-83397230b0df,"{'value': None, 'votes-count': 0}",Super Cat,"[{'name': 'Supercat', 'primary': None, 'begin'...","{'disambiguation': '', 'type': None, 'id': '2d...","{'ended': False, 'begin': '1963-06-25', 'end':...",...,2d5fbbfd-27a7-4b74-848a-2b1f24fa1d0a,Person,JM,,Male,['0000000055177699'],36d3d30a-839d-3eda-8cb3-29be4384e4a9,"[{'attributes': [], 'direction': 'forward', 'e...","[{'disambiguation': '', 'name': 'dancehall', '...","[{'name': 'dancehall', 'count': 2}, {'count': ..."
3,Archived Czech homepage: http://wayback.archiv...,"{'sort-name': 'Praha', 'type-id': None, 'iso-3...",['00066842160'],"Czech-American composer, filmmaker, musician",b6e035f4-3ce9-331c-97df-83397230b0df,"{'votes-count': 0, 'value': None}","Král, Ivan",[{'type-id': '1937e404-b981-3cb7-8151-4c86ebfc...,"{'disambiguation': '', 'type': None, 'name': '...","{'ended': True, 'begin': '1948-05-12', 'end': ...",...,3095b79c-0f52-4ac8-a3c4-c1a15a61aa12,Person,US,"{'type-id': None, 'sort-name': 'Michigan', 'di...",Male,['0000000046377410'],36d3d30a-839d-3eda-8cb3-29be4384e4a9,"[{'attribute-values': {}, 'end': '1974', 'type...",[],[]
4,,"{'type-id': None, 'iso-3166-1-codes': ['US'], ...",[],,e431f5f6-b5d2-343d-8b36-72607fffb74b,"{'votes-count': 1, 'value': 5}",Love Jones,[],"{'iso-3166-1-codes': ['US'], 'type-id': None, ...","{'end': None, 'begin': '1990', 'ended': False}",...,4231388d-489e-4fb6-918c-ecc514bf882d,Group,US,,,[],,"[{'attribute-ids': {}, 'source-credit': '', 'l...",[],[]


In [5]:
### Rename the columns
artist_df=  artist_df.rename(columns={'id': 'artist_uri', 'name': 'artist_name'})

In [8]:
artist_df['artist_name'][:10]

0             Minutemen
1      Бурундук Квартет
2             Super Cat
3             Ivan Král
4            Love Jones
5               Niagara
6            E_SHAK MMS
7                 Hydra
8    Saturday Supercade
9               Persone
Name: artist_name, dtype: object

### 1. Merging track_df And artist_df

In [8]:
tracks_df1= tracks_df[['track_uri','track_name','artist_name','artist_uri','album_uri', 'album_name']]

In [6]:
tracks_df1.head()

Unnamed: 0,track_uri,track_name,artist_name,artist_uri,album_uri,album_name
0,spotify:track:7gKIt3rDGIMJDFVSPBnGmj,Super Bass,Nicki Minaj,spotify:artist:0hCNtLu0JehylgoiP8L4Gh,spotify:album:2RfF6dGpYIN5u1mNkfG8Pb,Pink Friday
1,spotify:track:2eAAEa8pxKF7My0EO4rFgR,Blow Your Mind (Mwah),Dua Lipa,spotify:artist:6M2wZ9GZgrQXHCFfjv46we,spotify:album:01sfgrNbnnPUEyz6GZYlt9,Dua Lipa
2,spotify:track:5rnFOEEIBIWZ6dhHrY6zHh,Love (feat. Rae Sremmurd),ILoveMakonnen,spotify:artist:3aGFCoR8xGN6DKwvdzeSja,spotify:album:44sGCAuUKSGPbF6fSXO2vZ,Love (feat. Rae Sremmurd)
3,spotify:track:7fwXWKdDNI5IutOMc5OKYw,Mi Gente (feat. Beyoncé),J Balvin,spotify:artist:1vyhD5VmyZ7KMfW5gqLgo5,spotify:album:0ARVq1kA5eRP4F5VsZsr3m,Mi Gente (feat. Beyoncé)
4,spotify:track:19yIQRLAYMNxmEfdnnQDsS,No I Love Yous,Era Istrefi,spotify:artist:4poL7YCSkG7kMnWjAdDU91,spotify:album:6z4lsw2W6YwkA3CNDhGP42,No I Love Yous


In [10]:
tracks_df1.to_csv('tracks_df1.csv', index = False)

In [9]:
tracks_df1 = pd.read_csv('tracks_df1.csv')

### Checking if some artists have ambiguous name

In [28]:
print(artist_groups[artist_groups > 1])

sort-name
!!!        3
"O"        2
$K         2
&          2
'Fusion    2
          ..
賊恩         2
김영민        2
김창훈        2
이은지        2
정우         2
Name: id, Length: 101929, dtype: int64


In [32]:
artist_groups1 = tracks_df1.groupby('artist_name')['artist_uri'].nunique()
print(artist_groups1[artist_groups1 > 1])

artist_name
& More         3
100%           2
11:11          2
12th Planet    2
1982           2
              ..
邱勝翊            2
매니악            2
박정현            2
정승환            2
하울             3
Name: artist_uri, Length: 5990, dtype: int64


### Merging tracks_df1 and artists_df Dataframe

In [10]:
# Merging two DataFrames on 'artist_name'
merged_df = pd.merge(tracks_df1, artist_df[['artist_name', 'type-id', 'tags']], on='artist_name', how='left').drop_duplicates(subset=['artist_name'])
merged_df

Unnamed: 0,track_uri,track_name,artist_name,artist_uri,album_uri,album_name,type-id,tags
0,spotify:track:7gKIt3rDGIMJDFVSPBnGmj,Super Bass,Nicki Minaj,spotify:artist:0hCNtLu0JehylgoiP8L4Gh,spotify:album:2RfF6dGpYIN5u1mNkfG8Pb,Pink Friday,b6e035f4-3ce9-331c-97df-83397230b0df,"[{'count': 1, 'name': '2010s'}, {'name': 'east..."
1,spotify:track:2eAAEa8pxKF7My0EO4rFgR,Blow Your Mind (Mwah),Dua Lipa,spotify:artist:6M2wZ9GZgrQXHCFfjv46we,spotify:album:01sfgrNbnnPUEyz6GZYlt9,Dua Lipa,b6e035f4-3ce9-331c-97df-83397230b0df,"[{'name': 'alternative pop', 'count': 1}, {'na..."
2,spotify:track:5rnFOEEIBIWZ6dhHrY6zHh,Love (feat. Rae Sremmurd),ILoveMakonnen,spotify:artist:3aGFCoR8xGN6DKwvdzeSja,spotify:album:44sGCAuUKSGPbF6fSXO2vZ,Love (feat. Rae Sremmurd),,
3,spotify:track:7fwXWKdDNI5IutOMc5OKYw,Mi Gente (feat. Beyoncé),J Balvin,spotify:artist:1vyhD5VmyZ7KMfW5gqLgo5,spotify:album:0ARVq1kA5eRP4F5VsZsr3m,Mi Gente (feat. Beyoncé),b6e035f4-3ce9-331c-97df-83397230b0df,"[{'count': 6, 'name': 'hip hop'}, {'count': 5,..."
4,spotify:track:19yIQRLAYMNxmEfdnnQDsS,No I Love Yous,Era Istrefi,spotify:artist:4poL7YCSkG7kMnWjAdDU91,spotify:album:6z4lsw2W6YwkA3CNDhGP42,No I Love Yous,b6e035f4-3ce9-331c-97df-83397230b0df,[]
...,...,...,...,...,...,...,...,...
124252510,spotify:track:3wu9ADop1FXdhToPCxwBL8,Take You Down,Chris Brown,spotify:artist:7bXgB6jMjp9ATFy66eO08Z,spotify:album:1UtE4zAlSE2TlKmTFgrTg5,Exclusive - The Forever Edition,b6e035f4-3ce9-331c-97df-83397230b0df,[]
124252511,spotify:track:3wu9ADop1FXdhToPCxwBL8,Take You Down,Chris Brown,spotify:artist:7bXgB6jMjp9ATFy66eO08Z,spotify:album:1UtE4zAlSE2TlKmTFgrTg5,Exclusive - The Forever Edition,b6e035f4-3ce9-331c-97df-83397230b0df,[]
124252512,spotify:track:3wu9ADop1FXdhToPCxwBL8,Take You Down,Chris Brown,spotify:artist:7bXgB6jMjp9ATFy66eO08Z,spotify:album:1UtE4zAlSE2TlKmTFgrTg5,Exclusive - The Forever Edition,b6e035f4-3ce9-331c-97df-83397230b0df,[]
124252513,spotify:track:3wu9ADop1FXdhToPCxwBL8,Take You Down,Chris Brown,spotify:artist:7bXgB6jMjp9ATFy66eO08Z,spotify:album:1UtE4zAlSE2TlKmTFgrTg5,Exclusive - The Forever Edition,b6e035f4-3ce9-331c-97df-83397230b0df,[]


In [15]:
merged_df.to_csv('merged_df', index= False)

# Spotify Million Song Dataset 

In [None]:
import requests
url = 'http://millionsongdataset.com/sites/default/files/AdditionalFiles/unique_tracks.txt'
response = requests.get(url)
# Save the content to a local file
with open('unique_tracks.txt', 'wb') as file:
    file.write(response.content)

print("Download completed!")


In [4]:
spotifymilsongs = pd.read_csv('/home/jovyan/Spotify/unique_tracks.txt',sep='<SEP>', header=None)
spotifymilsongs.columns = ['track_uri', 'artist_uri', 'artist_name', 'track_name']

  spotifymilsongs = pd.read_csv('/home/jovyan/Spotify/unique_tracks.txt',sep='<SEP>', header=None)


In [3]:
spotifymilsongs = pd.read_csv('/home/jovyan/Spotify/spotifymilsongs.csv')

In [6]:
spotifymilsongs.head()

Unnamed: 0,track_uri,artist_uri,artist_name,track_name
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


# The Echo Nest Taste Profile Subset

In [3]:
users_pcount= pd.read_csv('/home/jovyan/Spotify/MillionSongDatsetKaggle/train_triplets.txt', sep='\t', header = None)

In [8]:
users_pcount.columns =['user_id', 'artist_uri', 'play_count']

In [9]:
users_pcount.to_csv('users_pcount.csv', index=False)

In [12]:
users_pcount.head()

Unnamed: 0,user_id,artist_uri,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [None]:
users_pcount= pd.read_csv('/home/jovyan/Spotify/MillionSongDatsetKaggle/users_pcount.csv'

# Merged User Playcount - Million Songs

In [13]:
merged_mil_df = pd.merge(spotifymilsongs ,users_pcount, on='artist_uri', how='inner')

In [11]:
len(merged_mil_df[merged_mil_df['play_count']>10])

2090147

In [15]:
merged_mil_df.shape

(49664528, 6)

In [16]:
merged_mil_df.to_csv('merged_mil_df .csv', index=False)

In [6]:
merged_mil_df= pd.read_csv('/home/jovyan/Spotify/merged_mil_df .csv')

# Lyrics Train/Test Dataset From MillionSongDataset

In [78]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
lyrics_df = pd.read_csv('/home/jovyan/Spotify/MillionSongDatsetKaggle/lyrics_dataset.txt', sep='<SEP>', comment='#', header=None, engine='python')

In [3]:
lyrics_trainset = pd.read_csv('/home/jovyan/Spotify/MillionSongDatsetKaggle/lyrics_trainset.txt', delimiter='\t',comment='#' ,header=None)