In [70]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

# Load Data

In [136]:
# Load the per-year track tables for 1991 through 2018 and stack them into a
# single frame. Each CSV's first column is used as its index (index_col=0);
# ignore_index=True then renumbers the rows of the combined frame from 0.
# The frames are collected first and concatenated once, rather than growing a
# DataFrame inside the loop, which would re-copy all earlier rows each pass.
MUSIC_DATA_URL = 'https://raw.githubusercontent.com/z22741/RecommenderSys/master/spotify_data/music_data/'
music_frames = [
    pd.read_csv(MUSIC_DATA_URL + str(year) + '.csv', index_col=0)
    for year in range(1991, 2019)
]
music_data = pd.concat(music_frames, axis=0, ignore_index=True)

In [137]:
# Load the three user-response tables (files alpha50, alpha75 and alpha100)
# and stack them into a single frame. Each CSV's first column is used as its
# index (index_col=0); ignore_index=True renumbers the combined rows from 0.
# The frames are collected first and concatenated once, rather than growing a
# DataFrame inside the loop.
USER_DATA_URL = 'https://raw.githubusercontent.com/z22741/RecommenderSys/master/spotify_data/user_data/alpha'
user_frames = [
    pd.read_csv(USER_DATA_URL + str(alpha) + '.csv', index_col=0)
    for alpha in range(50, 125, 25)
]
user_data = pd.concat(user_frames, axis=0, ignore_index=True)

In [138]:
# Load the song database table; its database_id column is the key that
# user_data rows refer to, and spotify_id holds the Spotify track id.
DB_DATA_URL = 'https://raw.githubusercontent.com/z22741/RecommenderSys/master/spotify_data/user_data/database.csv'
db_data = pd.read_csv(DB_DATA_URL)

In [152]:
# Preview the first five rows of the track table.
music_data.head(n=5)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,song_id,speechiness,tempo,valence
0,0.108,0.602,0.553,0.0,11,0.0512,-9.336,1,5xlS0QkVrSH7ssEbBgBzbM,0.0328,130.594,0.971
1,0.425,0.611,0.566,0.000954,9,0.878,-11.715,1,4gdhsYfXJoVQsWpbmh32Da,0.0295,125.24,0.639
2,0.0468,0.737,0.801,0.888,0,0.105,-9.058,1,3oixYd5Q41mfwfwpc2LuYx,0.0344,145.003,0.859
3,0.948,0.34,0.265,0.815,8,0.0899,-7.477,1,0pNu2I9Jbio4lLVO6H0lE1,0.0284,110.588,0.0866
4,0.63,0.587,0.493,0.0,8,0.224,-8.368,1,0z5E34e7ZT3XKMYQNXh6tH,0.0296,131.727,0.21


In [139]:
# Preview the first five rows of the user-response table.
user_data.head(n=5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,user_session,user_response,user_id,database_id
0,0.327,0.686,10,-11.835,0,0.161,0.163,0.194,0.948,0.631,172.79,1,1,0,5bb7ac8a0c27ee4c381dd70a
1,0.659,0.588,6,-9.262,0,0.03,0.413,0.159,0.112,0.561,143.021,1,0,0,5bb7ac8a0c27ee4c381dd70d
2,0.333,0.0797,10,-14.597,1,0.036,0.97,0.0,0.112,0.221,176.535,1,0,0,5bb7ac930c27ee4c381dd9c7
3,0.367,0.769,9,-7.258,1,0.0317,0.00032,0.628,0.0631,0.7,173.186,1,1,0,5bb7ac8a0c27ee4c381dd703
4,0.443,0.851,7,-3.809,1,0.218,0.0856,0.0,0.499,0.693,170.346,1,1,0,5bb7ac9a0c27ee4c381ddbab


In [161]:
# Preview the first five rows of the song database table.
db_data.head(n=5)

Unnamed: 0,database_id,spotify_id,song_name,preview_url,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,5bb7ac890c27ee4c381dd69e,0bxllxzqkGNSKo8pHtZyA3,E.T.,https://p.scdn.co/mp3-preview/b010313e8fbd71b1...,0.626,0.457,8,-9.091,0,0.0339,0.0876,7.7e-05,0.0801,0.223,149.99
1,5bb7ac890c27ee4c381dd69f,362LzQDbfNdZDuv37BIEkG,Song For Zula - Live From Spotify London,https://p.scdn.co/mp3-preview/f85a3283dc9d614c...,0.441,0.344,4,-9.799,1,0.0299,0.386,0.000277,0.917,0.111,87.588
2,5bb7ac890c27ee4c381dd6a0,3AqPL1n1wKc5DVFFnYuJhp,To Build A Home,https://p.scdn.co/mp3-preview/b39e590a07f73f93...,0.264,0.128,9,-15.443,1,0.034,0.885,0.352,0.105,0.0774,148.499
3,5bb7ac890c27ee4c381dd6a1,7sDzAu7alAL1hWvL86tSGB,Sweater Weather - Spotify Sessions Curated by ...,https://p.scdn.co/mp3-preview/bed9e182c4c5af6b...,0.61,0.165,8,-14.659,1,0.0502,0.981,0.000544,0.134,0.258,102.68
4,5bb7ac890c27ee4c381dd6a2,7LVHVU3tWfcxj5aiPFEW4Q,Fix You,https://p.scdn.co/mp3-preview/b1640815319b2df5...,0.209,0.418,3,-8.74,1,0.0338,0.163,0.00195,0.113,0.123,138.265


# Drop Unused Columns

In [141]:
# Remove the album/artist/track metadata columns from the track table.
music_to_drop = [
    'album_id', 'album_name', 'analysis_url', 'artist_ids', 'artist_names',
    'disc_number', 'duration_ms', 'popularity', 'preview_url', 'song_name',
    'time_signature', 'track_href', 'track_number',
]
# Reassign instead of dropping in place, and ignore labels that are already
# absent, so that re-running this cell does not raise KeyError.
music_data = music_data.drop(columns=music_to_drop, errors='ignore')

In [142]:
# Remove the session counter column from the user-response table.
user_to_drop = ['user_session']
# Reassign instead of dropping in place, and ignore labels that are already
# absent, so that re-running this cell does not raise KeyError.
user_data = user_data.drop(columns=user_to_drop, errors='ignore')

In [143]:
# List the columns that remain in the track table after the drop.
# (The same call on user_data.columns shows the user-response columns.)
music_data.columns.tolist()

['acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'song_id',
 'speechiness',
 'tempo',
 'valence']

In [144]:
# Put the columns of both tables into sorted (alphabetical) order.
music_columns_sorted = sorted(music_data.columns)
user_columns_sorted = sorted(user_data.columns)
music_data = music_data.reindex(columns=music_columns_sorted)
user_data = user_data.reindex(columns=user_columns_sorted)

# Remove Duplicate Data

In [146]:
# True only if no song_id value occurs more than once in the track table.
music_data.song_id.is_unique

False

In [147]:
# Keep the first row seen for each song_id and discard later repeats.
music_data = music_data.drop_duplicates(subset=['song_id'], keep='first')

In [148]:
# A (database_id, user_id) pair identifies one user's response to one song;
# keep the first row for each pair and discard later repeats.
user_song_key = ['database_id', 'user_id']
user_data = user_data.drop_duplicates(subset=user_song_key, keep='first')

# Remove Null Data

In [149]:
# Discard every track row that has a missing value in any column.
music_data = music_data.dropna(axis=0, how='any')

In [150]:
# Number of remaining response rows per user, most active user first.
user_data['user_id'].value_counts()

4    433
3    415
5    405
2    361
1    338
8    329
9    328
0    312
6    290
7    244
Name: user_id, dtype: int64

In [151]:
# Check that every song referenced in user_data exists in db_data, printing
# each database_id that is missing (no output means nothing is missing).
# Membership is tested with isin() on the column values: the `in` operator on
# a Series looks at its index labels, not its values, so a `True in mask`
# style test cannot detect a missing id. isin() also compares whole ids
# instead of treating each id as a regex substring pattern.
referenced_ids = user_data['database_id'].drop_duplicates()
missing_ids = referenced_ids[~referenced_ids.isin(db_data['database_id'])]
for missing_id in missing_ids.tolist():
    print(missing_id)

In [163]:
# Look up one track in the track table by its exact song_id. An equality
# comparison is used because this is an id lookup, not a pattern search: it
# cannot match a longer id that merely contains this one, and it yields a
# plain boolean mask even if the column holds missing values.
music_data[music_data['song_id'] == '5xlS0QkVrSH7ssEbBgBzbM']

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,song_id,speechiness,tempo,valence
0,0.108,0.602,0.553,0.0,11,0.0512,-9.336,1,5xlS0QkVrSH7ssEbBgBzbM,0.0328,130.594,0.971


In [158]:
# Look up one track in the song database by its exact spotify_id. An equality
# comparison is used because this is an id lookup, not a pattern search, and
# it yields a plain boolean mask even if the column holds missing values.
# NOTE(review): the recorded output shows only the header row, which suggests
# this track is not present in db_data; confirm on a fresh run.
db_data[db_data['spotify_id'] == '0z5E34e7ZT3XKMYQNXh6tH']

Unnamed: 0,database_id,spotify_id,song_name,preview_url,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
