In [1]:
# !wget -O data.tar.gz http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-1K.tar.gz

In [2]:
# !tar -xvzf data.tar.gz

In [1]:
import pandas as pd
import numpy as np

In [38]:
user_profile_data = pd.read_csv('lastfm-dataset-1K/userid-profile.tsv', sep='\t', header=0)

In [3]:
user_data = pd.read_csv('lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv', sep='\t', on_bad_lines='skip')

In [4]:
user_data.columns = ['#id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name']

In [5]:
gender_data = user_profile_data[['#id', 'gender']].copy()
age_data = user_profile_data[['#id', 'age']].copy()

In [6]:
gender_data.dropna(inplace=True)
age_data.dropna(inplace=True)
len(gender_data), len(age_data)

(884, 286)

In [7]:
user_data.dropna(inplace=True)

In [8]:
gender_data['gender'] = gender_data['gender'].apply(lambda x: 1 if x =='m' else 0)
gender_data

Unnamed: 0,#id,gender
0,user_000001,1
1,user_000002,0
2,user_000003,1
3,user_000004,0
4,user_000005,1
...,...,...
987,user_000996,0
988,user_000997,1
989,user_000998,1
990,user_000999,0


In [9]:
age_data['age'] = age_data['age'].apply(lambda x: 1 if x >= 24 else 0)
age_data

Unnamed: 0,#id,age
2,user_000003,0
5,user_000006,1
7,user_000008,0
8,user_000009,0
9,user_000010,0
...,...,...
357,user_000360,0
358,user_000361,1
361,user_000364,1
362,user_000366,1


In [10]:
import matplotlib.pyplot as plt
 
# Sklearn modules & classes
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

In [37]:
def run_SVM(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
  sc = StandardScaler()
  sc.fit(X_train)
  X_train_std = sc.transform(X_train)
  X_test_std = sc.transform(X_test)
  svc = SVC(C=1.0, random_state=1, kernel='rbf')
  svc.fit(X_train_std, y_train)
  y_predict = svc.predict(X_test_std)
  print("Accuracy score %.3f" %metrics.accuracy_score(y_test, y_predict))
  return metrics.accuracy_score(y_test, y_predict) * 100

In [12]:
def run_KNN(X, y, k=3):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_std, y_train)
    y_predict = knn.predict(X_test_std)
    print("Accuracy score %.3f" %metrics.accuracy_score(y_test, y_predict))
    return metrics.accuracy_score(y_test, y_predict) * 100
    

In [13]:
def run_LogisticRegression(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    logisticRegression = LogisticRegression()
    logisticRegression.fit(X_train_std, y_train)
    y_predict = logisticRegression.predict(X_test_std)
    print("Accuracy score %.3f" %metrics.accuracy_score(y_test, y_predict))
    return metrics.accuracy_score(y_test, y_predict) * 100

In [14]:
def get_valid_ids(data):
  return np.array(list(data['#id']))

In [41]:
age_ids = get_valid_ids(age_data)
gender_ids = get_valid_ids(gender_data)

In [15]:
values = {}
for id in age_ids:
  values[id] = user_data['#id'].value_counts()[id]
print(len(values))

286


In [16]:
new_X = list(values.values())

In [17]:
num_points_accuracy_SVM = run_SVM(np.array(new_X).reshape(-1, 1), np.array(list(age_data['age'])))

Accuracy score 0.535


In [18]:
num_points_accuracy_KNN = run_KNN(np.array(new_X).reshape(-1, 1), np.array(list(age_data['age'])))

Accuracy score 0.523


In [19]:
num_points_accuracy_LinearRegression = run_LogisticRegression(np.array(new_X).reshape(-1, 1), np.array(list(age_data['age'])))

Accuracy score 0.523


In [20]:
user_data["timestamp"]

9           2009-05-04T13:06:09Z
11          2009-05-04T12:55:34Z
13          2009-05-03T15:48:25Z
14          2009-05-03T15:37:56Z
15          2009-05-03T15:14:53Z
                    ...         
19098847    2008-01-27T22:02:35Z
19098848    2008-01-27T21:56:52Z
19098849    2008-01-27T21:52:36Z
19098850    2008-01-27T21:49:12Z
19098851    2008-01-27T21:43:14Z
Name: timestamp, Length: 16936134, dtype: object

In [21]:
import datetime
user_data['datem'] = user_data['timestamp'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))

In [22]:
user_data.head()

Unnamed: 0,#id,timestamp,artist_id,artist_name,track_id,track_name,datem
9,user_000001,2009-05-04T13:06:09Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,f7c1f8f8-b935-45ed-8fc8-7def69d92a10,The Last Emperor (Theme),2009-05-04 13:06:09
11,user_000001,2009-05-04T12:55:34Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,475d4e50-cebb-4cd0-8cd4-c3df97987962,Tibetan Dance (Version),2009-05-04 12:55:34
13,user_000001,2009-05-03T15:48:25Z,ba2f4f3b-0293-4bc8-bb94-2f73b5207343,Underworld,dc394163-2b78-4b56-94e4-658597a29ef8,"Boy, Boy, Boy (Switch Remix)",2009-05-03 15:48:25
14,user_000001,2009-05-03T15:37:56Z,ba2f4f3b-0293-4bc8-bb94-2f73b5207343,Underworld,340d9a0b-9a43-4098-b116-9f79811bd508,Crocodile (Innervisions Orchestra Mix),2009-05-03 15:37:56
15,user_000001,2009-05-03T15:14:53Z,a16e47f5-aa54-47fe-87e4-bb8af91a9fdd,Ennio Morricone,0b04407b-f517-4e00-9e6a-494795efc73e,Ninna Nanna In Blu (Raw Deal Remix),2009-05-03 15:14:53


In [23]:
user_data["hour"] = user_data["datem"].apply(lambda x : x.hour)

In [24]:
user_data_grouped = user_data.groupby("#id")

In [25]:
all_hours = set(range(24))

In [26]:
user_hours = {}
for (i, user) in enumerate(user_data_grouped):
  print(f"\r{i}", end="")
  hour_info = (user[1].groupby("hour")["hour"].count())
  hour_info = hour_info / hour_info.sum()
  hour_info = dict(hour_info)
  missing_hours = (all_hours - hour_info.keys())
  if len(missing_hours) > 0:
    hour_info.update((x,0) for x in missing_hours)
  user_hours[user[0]] = hour_info


991

In [27]:
def histogram(column_name,all_keys):
  user_hist = {}
  for (i, user) in enumerate(user_data_grouped):
    print(f"\r{i}", end="")
    col_info = (user[1].groupby(column_name)[column_name].count())
    col_info = col_info / col_info.sum()
    col_info = dict(col_info)
    missing_keys = (all_keys - col_info.keys())
    if len(missing_keys) > 0:
      col_info.update((x,0) for x in missing_keys)
    user_hist[user[0]] = col_info
  return user_hist

In [28]:
hour_of_day = histogram("hour",all_hours) # 1

991

In [46]:
accuracies = {}
hod_age = [list(hour_of_day[id].values()) for id in age_ids]
hod_gender = [list(hour_of_day[id].values()) for id in gender_ids]

accuracies['hod'] = {}
accuracies['hod']['svm'] = {}
accuracies['hod']['knn'] = {}
accuracies['hod']['lr'] = {}

accuracies['hod']['svm']['age'] = run_SVM(np.array(hod_age), np.array(list(age_data['age'])))
accuracies['hod']['svm']['gender'] = run_SVM(np.array(hod_gender), np.array(list(gender_data['gender'])))
accuracies['hod']['knn']['age'] = run_KNN(np.array(hod_age), np.array(list(age_data['age'])))
accuracies['hod']['knn']['gender'] = run_KNN(np.array(hod_gender), np.array(list(gender_data['gender'])))
accuracies['hod']['lr']['age'] = run_LogisticRegression(np.array(hod_age), np.array(list(age_data['age'])))
accuracies['hod']['lr']['gender'] = run_LogisticRegression(np.array(hod_gender), np.array(list(gender_data['gender'])))

Accuracy score 0.547
Accuracy score 0.564
Accuracy score 0.488
Accuracy score 0.541
Accuracy score 0.488
Accuracy score 0.575


In [45]:
user_data["week"] = user_data["datem"].apply(lambda x : x.weekday())

In [47]:
all_days_of_week = set(range(7))

In [48]:
day_of_week = histogram("week",all_days_of_week) # 2

991

In [49]:
dow_age = [list(day_of_week[id].values()) for id in age_ids]
dow_gender = [list(day_of_week[id].values()) for id in gender_ids]

accuracies['dow'] = {}
accuracies['dow']['svm'] = {}
accuracies['dow']['knn'] = {}
accuracies['dow']['lr'] = {}

accuracies['dow']['svm']['age'] = run_SVM(np.array(dow_age), np.array(list(age_data['age'])))
accuracies['dow']['svm']['gender'] = run_SVM(np.array(dow_gender), np.array(list(gender_data['gender'])))
accuracies['dow']['knn']['age'] = run_KNN(np.array(dow_age), np.array(list(age_data['age'])))
accuracies['dow']['knn']['gender'] = run_KNN(np.array(dow_gender), np.array(list(gender_data['gender'])))
accuracies['dow']['lr']['age'] = run_LogisticRegression(np.array(dow_age), np.array(list(age_data['age'])))
accuracies['dow']['lr']['gender'] = run_LogisticRegression(np.array(dow_gender), np.array(list(gender_data['gender'])))

Accuracy score 0.547
Accuracy score 0.575
Accuracy score 0.535
Accuracy score 0.538
Accuracy score 0.547
Accuracy score 0.590


In [50]:
all_months_of_year = set(range(12))

In [51]:
user_data["month"] = user_data["datem"].apply(lambda x : x.month)

In [52]:
month_of_year = histogram("month",all_months_of_year) # 3

991

In [55]:
moy_age = [list(month_of_year[id].values()) for id in age_ids]
moy_gender = [list(month_of_year[id].values()) for id in gender_ids]
print(len(moy_age))
accuracies['moy'] = {}
accuracies['moy']['svm'] = {}
accuracies['moy']['knn'] = {}
accuracies['moy']['lr'] = {}

accuracies['moy']['svm']['age'] = run_SVM(np.array(moy_age), np.array(list(age_data['age'])))
accuracies['moy']['svm']['gender'] = run_SVM(np.array(moy_gender), np.array(list(gender_data['gender'])))
accuracies['moy']['knn']['age'] = run_KNN(np.array(moy_age), np.array(list(age_data['age'])))
accuracies['moy']['knn']['gender'] = run_KNN(np.array(moy_gender), np.array(list(gender_data['gender'])))
accuracies['moy']['lr']['age'] = run_LogisticRegression(np.array(moy_age), np.array(list(age_data['age'])))
accuracies['moy']['lr']['gender'] = run_LogisticRegression(np.array(moy_gender), np.array(list(gender_data['gender'])))

286


  accuracies['moy']['svm']['age'] = run_SVM(np.array(moy_age), np.array(list(age_data['age'])))


ValueError: setting an array element with a sequence.

In [None]:
from scipy.stats import entropy

In [None]:
def cal_entropy(hist_data):
  df = pd.DataFrame.from_dict(hist_data).transpose()
  _ , l = df.shape
  return df.apply(lambda x : entropy(x,base=l),axis=1)

In [None]:
hour_of_day_entropy = cal_entropy(hour_of_day) # 4

In [None]:
day_of_week_entropy = cal_entropy(day_of_week) # 5

In [None]:
month_of_year_entropy = cal_entropy(month_of_year) # 6

In [None]:
def working_ratio(hist_data,lower,upper):
  df = pd.DataFrame.from_dict(hist_data).transpose()
  return df.apply(lambda x : np.sum(x[lower:upper+1]),axis=1)

In [None]:
working_hour_ratio = working_ratio(hour_of_day,8,19) # 7

In [None]:
working_day_ratio = working_ratio(day_of_week,0,4) # 8

In [None]:
working_month_ratio = working_ratio(month_of_year,0,5) + working_ratio(month_of_year,8,11) # 9

In [None]:
working_month_ratio

In [None]:
run_SVM(np.array(new_X).reshape(-1, 1), np.array(list(age_data['age'])))

In [None]:
!pip install spotipy

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
client_id = "19ec26d3aa3945d69e03af78c1cf8c1a"
client_secret = "503c93d96d254127b3038ecd3fb9051f"
credentials = SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)


sp = spotipy.Spotify(client_credentials_manager = credentials)

In [None]:
def getspotifyfeatures(trackid):
  urn = "spotify:track:" + trackid
  out = sp.audio_features(tracks=urn)
  return out

In [None]:
user_song_features = {}
for (i, user) in enumerate(user_data_grouped):
  print(f"\r{i}", end="")
  track_info = (user[1].groupby("track_id")["track_id"].count())
  print(type(track_info),track_info)
  break
  # hour_info = hour_info / hour_info.sum()
  # hour_info = dict(hour_info)
  # missing_hours = (all_hours - hour_info.keys())
  # if len(missing_hours) > 0:
  #   hour_info.update((x,0) for x in missing_hours)
  # user_hours[user[0]] = hour_info


In [None]:
def getid(artistname, trackname):
  res = sp.search(trackname, limit = 10)
  for i in range(len(res['tracks']['items'])):
    if res['tracks']['items'][i]['artists'][0]['name'].lower() == artistname.lower():
      return res['tracks']['items'][i]['id']
  else:
    return None