In [1]:
!pip install -U scikit-learn scikit-surprise numpy



In [137]:
from surprise import Dataset
from surprise import Reader
import pandas as pd
import numpy as np
import random
import re

In [138]:
pd.options.display.max_rows = 5000

In [140]:
p = 1
df = pd.read_csv('data/spotify_dataset.csv', 
                 error_bad_lines=False, 
                 warn_bad_lines=False, 
                 skiprows=lambda i: i>0 and random.random() > p,
                encoding='utf-8')
df.dropna(inplace=True)

In [141]:
df.columns = df.columns.str.replace('"', '')
df.columns = df.columns.str.replace('name', '')
df.columns = df.columns.str.replace(' ', '')

In [142]:
def clean_artist(text):
    text = (str(text)).lower()
    FEAT_PAT = re.compile(r"[\s\S]+[\s]+(feat\.|ft\.|featuring|ft|feat)[\s]+[\s\S]+")
    AMP_PAT = re.compile(r"[\s\S]*(&|and|\+)[\s\S]*")
    #check if we have featured artist
    if FEAT_PAT.match(text):
        text = re.split(r"feat\.|ft\.|featuring|ft|feat", text)[0]
    
    #Remove & from all artists
    if AMP_PAT.match(text):
        text = re.split(r"&", text)[0]
    return text.strip()

In [143]:
df['clean_artist'] = df['artist'].apply(clean_artist)

In [135]:
df[['artist','clean_artist']].loc[6000:7000]

Unnamed: 0,artist,clean_artist
6000,Buddy Holly,buddy holly
6001,The All-American Rejects,the all-american rejects
6002,José González,josé gonzález
6003,Erik Hassle,erik hassle
6004,Elvis Presley,elvis presley
6005,John Mayer,john mayer
6006,Led Zeppelin,led zeppelin
6007,Gotye,gotye
6008,The Naked And Famous,the naked and famous
6009,Depeche Mode,depeche mode


In [144]:
print(len(df.artist.unique()))
print(len(df.clean_artist.unique()))

289603
257168


In [145]:
df.isna().sum()

user_id         0
artist          0
track           0
playlist        0
clean_artist    0
dtype: int64

In [5]:
df["artist_track"] = df["artist"]  + "-" + df["track"]
df["artist_track"] = df["artist_track"].str.replace(" ", "")
df.head()

Unnamed: 0,user_id,artist,track,playlist,artist_track
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010,ElvisCostello-(TheAngelsWannaWearMy)RedShoes
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010,ElvisCostello&TheAttractions-(What'sSoFunny'Bo...
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010,TiffanyPage-7YearsTooLate
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010,ElvisCostello&TheAttractions-AccidentsWillHappen
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,ElvisCostello-Alison


In [48]:
size = lambda x: len(x)
df_freq = df.groupby(['user_id', 'artist']).agg('size').reset_index().rename(columns={0:'freq'})[['user_id', 'artist', 'freq']].sort_values(['freq'], ascending=False)
df_freq.head()

Unnamed: 0,user_id,artist,freq
2870626,defced0ece4ce946160b0d2698142eac,Vitamin String Quartet,3346
502897,26b51e580277e131f87e4c7ee4c0887a,Vitamin String Quartet,3306
844217,414050deadb38aafd8d4ad22ca634055,Vitamin String Quartet,2587
2798380,d993ff8f2de226e2c6803e47a22e9d7e,Lata Mangeshkar,2281
22579,014e695cc6df96011b90a5beb3206012,Ilaiyaraaja,2242


In [63]:
a = df_freq.artist.unique().tolist()
print(len(a))

289821


In [83]:
import re
remove_ft = r"[\s\S]+[\s]+(feat\.|ft\.|featuring|ft|feat)[\s]+[\s\S]+"
r = re.compile(remove_ft, re.IGNORECASE)

filtered_list = list(filter(r.match, a))

print(len(filtered_list))

13717


In [87]:
cleaned = re.split(r"feat\.|ft\.|featuring|ft|feat", "Byron Lee & The Dragonaires [feat. The Mighty Sparrow]")[0]

In [90]:
no_amp = re.split(r"&",cleaned)[0].strip()

In [None]:
Case 1: No featuring
    Remove Amp
Case 2: Featuring
    Remove Amp

In [None]:
remove_amp = r"

In [74]:
filtered_list

['Feyenoord Rotterdam FanChants feat. Feyenoord Supporters Voetbal Liederen',
 'Arsenal FanChants feat. Arsenal Fans Songs & Gooners Football Chants',
 'Barça FanChants feat. Canciones del FC Barcelona',
 'Death Cab for Cutie featuring Magik*Magik Orchestra',
 'The Crystal Method featuring LMFAO',
 'Sunspot Jonz feat. Living Legends',
 'E.S. Posthumus featuring Luna Sans',
 'Motion Man feat. Kut Masta kurt',
 'New Life Worship featuring Ross Parsley & Desperation Band',
 'Bassnectar featuring Persia',
 'Robert G. featuring Kate Lesing',
 'Charles Bradley (feat. Menahan Street Band)',
 'Clutch feat. Basket of Eggs',
 'Tingsek & Vindla String Quartet feat. Måns Mernsten',
 "Charlie Parker's All Stars feat. Miles Davis",
 'Global Deejays feat. Rozalla',
 'Prince Ital Joe feat. Marky Mark',
 'YG feat. DJ Mustard',
 'Avenue Blue feat. Jeff Golub',
 'DJ Spooky feat. the Telos Ensemble',
 'Charlie Parker Quintet feat. Miles Davis',
 'Haji & Emanuel featuring Beverley Knight & Bryan Chambers',

In [None]:
size = lambda x: len(x)
df_track_freq = df.groupby(['user_id', 'artist']).agg('size').reset_index().rename(columns={0:'freq'})[['track', 'artist', 'freq']].sort_values(['freq'], ascending=False)
df_track_freq.head()

Read all unique artist names

In [49]:
artist_names = df.artist.unique()

In [50]:
len(artist_names)

289822

Get all songs for each artist

# Checks df for artist. Get's top 5 songs for each artist

In [55]:
def get_top5_songs(artist):
    
    return list(df[df['artist'] == artist]['track'].value_counts()[0:5].index)
#     songs = df.loc[df.arist]

In [56]:
get_top5_songs('David Cochran Heath')

['Psalm 63', 'Psalm 4', 'Psalm 114', 'Psalm 126', 'Job 2']

key:'kanye west'
value: ['gold digger', 'stronger', 'sdafasdf','fasdasdf', 'i miss kim'] *list should be sorted in decr order of frequency of song in database


In [7]:
artist= pd.unique(df['artist'].values.ravel())
artist = pd.Series(np.arange(len(artist)), artist)
df_freq["artist_id"] = df_freq[['artist']].applymap(artist.get)

In [8]:
print(len(df_freq["artist"].unique()))

289821


In [9]:
print(len(df_freq["artist_id"].unique()))

289821


In [10]:
print(len(df_freq["artist"]))

3285631


In [11]:
artist_track_df = pd.DataFrame(df_freq["artist_id"].unique(), columns = ["artist_id"])

In [12]:
artist_track_df["artist"] = df_freq["artist"].unique()
df_freq.drop('artist', axis=1, inplace=True)
df_freq.head()

Unnamed: 0,user_id,freq,artist_id
0,00055176fea33f6e027cd3302289378b,10,713
1,00055176fea33f6e027cd3302289378b,1,132909
2,00055176fea33f6e027cd3302289378b,3,7994
3,00055176fea33f6e027cd3302289378b,8,698
4,00055176fea33f6e027cd3302289378b,1,8100


In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(1,5))

In [14]:
df_freq[["freq"]] = scaler.fit_transform(df_freq[["freq"]])
df_freq.head()

Unnamed: 0,user_id,freq,artist_id
0,00055176fea33f6e027cd3302289378b,1.010762,713
1,00055176fea33f6e027cd3302289378b,1.0,132909
2,00055176fea33f6e027cd3302289378b,1.002392,7994
3,00055176fea33f6e027cd3302289378b,1.008371,698
4,00055176fea33f6e027cd3302289378b,1.0,8100


In [15]:
reader = Reader(rating_scale=(1, 5))

In [16]:
data = Dataset.load_from_df(df_freq[['user_id', 'artist_id', 'freq']], reader)

In [17]:
from surprise.model_selection import cross_validate
from surprise import NormalPredictor

In [18]:
# from surprise import SVD
# from surprise.model_selection import KFold
# from surprise import accuracy
# # define a cross-validation iterator
# kf = KFold(n_splits=5)
# algo = SVD()
# for trainset, testset in kf.split(data):
#     # train and test algorithm.
#     algo.fit(trainset)
#     predictions = algo.test(testset)
#     # Compute and print Root Mean Squared Error
#     accuracy.rmse(predictions, verbose=True)

# Let's try to get predictions from SVD

In [19]:
from surprise import SVD
from surprise.model_selection import cross_validate



In [20]:
# SVD.predict?

In [21]:
svd = SVD(verbose=True, n_epochs=50)

In [23]:
cross_validate(svd, data, measures=['RMSE'], cv=2, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

{'test_rmse': array([0.02259635, 0.02275703]),
 'fit_time': (287.2614200115204, 282.93003702163696),
 'test_time': (21.067522048950195, 20.038546085357666)}

In [24]:
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fddb7a1b8b0>

In [26]:
preds = list()
for id in df_freq.artist_id.values:
    preds.append(svd.predict(uid="00055176fea33f6e027cd3302289378b", iid=id))


    

In [None]:
# x = sorted(df_freq.freq.unique())
# x

In [28]:
preds = sorted(preds, key=lambda x: x.est, reverse=True)

In [31]:
preds[:20]

[Prediction(uid='00055176fea33f6e027cd3302289378b', iid=101477, r_ui=None, est=1.5538967097253304, details={'was_impossible': False}),
 Prediction(uid='00055176fea33f6e027cd3302289378b', iid=101477, r_ui=None, est=1.5538967097253304, details={'was_impossible': False}),
 Prediction(uid='00055176fea33f6e027cd3302289378b', iid=101477, r_ui=None, est=1.5538967097253304, details={'was_impossible': False}),
 Prediction(uid='00055176fea33f6e027cd3302289378b', iid=101477, r_ui=None, est=1.5538967097253304, details={'was_impossible': False}),
 Prediction(uid='00055176fea33f6e027cd3302289378b', iid=101477, r_ui=None, est=1.5538967097253304, details={'was_impossible': False}),
 Prediction(uid='00055176fea33f6e027cd3302289378b', iid=101477, r_ui=None, est=1.5538967097253304, details={'was_impossible': False}),
 Prediction(uid='00055176fea33f6e027cd3302289378b', iid=178381, r_ui=None, est=1.3791580882855914, details={'was_impossible': False}),
 Prediction(uid='00055176fea33f6e027cd3302289378b', iid

In [33]:
#unique preds
unique_preds = list()
iid=[]
for pred in preds:
    if pred.iid not in iid:
        iid.append(pred.iid)
    
    

KeyboardInterrupt: 

In [34]:
iid=[]
for pred in preds:
    iid.append(pred.iid)

In [47]:
iid=list(dict.fromkeys(iid))
iid

[101477,
 178381,
 223686,
 121377,
 84281,
 244635,
 43362,
 123507,
 224889,
 261475,
 266738,
 257963,
 68500,
 86231,
 116406,
 4464,
 149368,
 262771,
 233305,
 220779,
 81422,
 61778,
 272520,
 224165,
 62078,
 268459,
 278478,
 250606,
 48657,
 155322,
 122788,
 283426,
 82308,
 191461,
 79925,
 244073,
 250219,
 163138,
 180701,
 92246,
 120921,
 81851,
 263967,
 103438,
 187726,
 53895,
 16345,
 53726,
 113000,
 232530,
 251306,
 214627,
 150341,
 30428,
 107038,
 155492,
 230676,
 82318,
 222471,
 183101,
 190873,
 212105,
 243382,
 212727,
 9674,
 152522,
 116774,
 232548,
 256318,
 248524,
 18044,
 217074,
 261198,
 146576,
 168880,
 107784,
 237683,
 140525,
 60857,
 250525,
 103386,
 267221,
 151915,
 135931,
 230085,
 263283,
 213624,
 157307,
 289153,
 220197,
 101785,
 171093,
 102066,
 220229,
 163388,
 32868,
 8281,
 128006,
 7049,
 69260,
 59196,
 228122,
 216184,
 172940,
 128867,
 184892,
 269440,
 200830,
 131616,
 140437,
 84910,
 96021,
 273284,
 167891,
 22434

In [46]:
for i in iid[:5]:
    print(artist_track_df.loc[artist_track_df.artist_id==i].artist)

61358    Jamey Aebersold Play-A-Long
Name: artist, dtype: object
187737    David Cochran Heath
Name: artist, dtype: object
150305    Kurt Prestel
Name: artist, dtype: object
129571    Conlon Nancarrow
Name: artist, dtype: object
258621    Clipse ft. AB-Liva & Rosco P Goldchain
Name: artist, dtype: object
