In [257]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
% matplotlib inline
from sklearn.model_selection import train_test_split as sk_split

import numpy as np
import pandas as pd

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer as DV

# import BeautifulSoup and pyQuery to scrape HTML pages
from bs4 import BeautifulSoup as bs
from pyquery import PyQuery as pq
import requests
import csv

# import musicbrainz
import musicbrainzngs as mb

# To Do:
Feature Extraction (MusicBrainz API)

Feature Engineering 

Model Fitting

In [258]:
# read information on the users, can have missing data shown as blanks
profiles = pd.read_csv("profiles.csv")
profiles.head()

Unnamed: 0,user,sex,age,country
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25.0,Sweden
1,5909125332c108365a26ccf0ee62636eee08215c,m,29.0,Iceland
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30.0,United States
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21.0,Germany
4,02871cd952d607ba69b64e2e107773012c708113,m,24.0,Netherlands


In [259]:
# read information on the artists, can have missing data as blanks
artists = pd.read_csv("artists.csv")
artists.head()

Unnamed: 0,artist,name
0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,Liars
1,69c4cc43-8163-41c5-ac81-30946d27bb69,CunninLynguists
2,7a2e6b55-f149-4e74-be6a-30a1b1a387bb,The Desert Sessions
3,7002bf88-1269-4965-a772-4ba1e7a91eaa,Glenn Gould
4,dbf7c761-e332-467b-b4d9-aafe06bbcf8f,G. Love & Special Sauce


In [260]:
print profiles.shape
print artists.shape

(233286, 4)
(2000, 2)


In [262]:
# read in training data
df_train = pd.read_csv("train.csv")
df_train.head()

Unnamed: 0,user,artist,plays
0,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,5a8e07d5-d932-4484-a7f7-e700793a9c94,554
1,44ce793a6cd9d20f13f4a576a818ef983314bb5d,a3a92047-be1c-4f3e-8960-c4f8570984df,81
2,da9cf3f557161d54b76f24db64be9cc76db008e3,eeb1195b-f213-4ce1-b28c-8565211f8e43,708
3,8fa49ab25d425edcf05d44bfc1d5aea895287d81,a1419808-65d3-4d40-998c-1a0bac65eabc,265
4,b85fcaef67d2669cd99b334b5e8c8705263db2cf,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,220


In [263]:
# read in training data
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,Id,user,artist
0,1,306e19cce2522fa2d39ff5dfc870992100ec22d2,4ac4e32b-bd18-402e-adad-ae00e72f8d85
1,2,9450d351278df4938bdea4ed86aec940a4e927ac,1f574ab1-a46d-4586-9331-f0ded23e0411
2,3,801909d6955f59033c88595d3d7f8a6a5dcd53cc,3eb72791-6322-466b-87d3-24d74901eb2d
3,4,e3ed47445c127fbeff47fb58f6bbf2f3b4535d82,61604b45-8a91-4e33-a1b6-45d7b1fec4e5
4,5,a73f46652103f3a5f7429159310f6928f79644aa,5dfdca28-9ddc-4853-933c-8bc97d87beec


In [264]:
print df_train.shape
print df_test.shape

(4154804, 3)
(4154804, 3)


In [None]:
# scrape wikipedia/musicbrainz
artist_genres = {}
artist_n = len(artists)

# scrape musicbrainz page for each artist
for artist in range(artist_n):
    # retrieve page
    a = artists.iloc[artist][0]
    mb_page = "https://musicbrainz.org/artist/" + str(a)
    mb_page_get = requests.get(mb_page)
    soup = BeautifulSoup(mb_page_get.text, "html.parser")
    
    # Find a particular "li" element on the MusicBrainz page, which contains the Wikipedia page link
    li_class = soup.find("li", attrs={"class": "wikipedia-favicon"})
    if li_class == None:
        artist_genres[a] = ['NA']
        continue
    wiki_link = li_class.find('a')['href']
    
    # Get the wikipedia page for the artist
    wiki_page_get = requests.get("https:" + wiki_link)
    wiki_page = wiki_page_get.text

    # extract the genres from the wikipedia page
    m = 0
    urls = []
    genres = []
    for th in pq(wiki_page)(".infobox tr th"):
        if pq(th).text() == "Genres":
            for e in pq(th).nextAll("td a"):
                if pq(e).attr.href.find("#cite_note") == -1:
                    urls.append(pq(e).attr.href)
                    genres.append(pq(e).attr.title)
                    m = 1
    if (m == 0):
        genres = ['NA']
    
    artist_genres[a] = genres

artist_genres

In [52]:
# create one-hot encoded matrix detailing the genres of each artist

# total list of possible genres
set_of_genres = set([])
for value in artist_genres.values():
    for gen in value:
        set_of_genres.add(gen)

# initialize to 0 for number of apperances of each genre
for genr in set_of_genres:
    artists[genr] = 0

# Label each genre with 1 if artist is that genre
for art in range(artist_n):
    id_ = artists.iloc[art]["artist"]
    for gen in artist_genres[id_]:
        artists.loc[art, gen] = 1
        

In [53]:
# save the artists into json file
artists.to_json("artist_genres.json")

# work with new one-hot encoded artist genres dataframe

In [8]:
# read in the artists' info(with all genres) from a json file
artists = pd.read_json("artist_genres.json")
artists.head()

Unnamed: 0,2 Tone,2 Tone (music genre),2-Tone,2-step garage,Acid house,Acid jazz,Acid rock,Acid techno,Acoustic hip hop,Acoustic music,...,Witch house (music genre),Wonky pop,Word play,World Music,World music,Worldbeat,Yass (music),Yé-yé,artist,name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,Liars
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,69c4cc43-8163-41c5-ac81-30946d27bb69,CunninLynguists
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4bd95eea-b9f6-4d70-a36c-cfea77431553,Alice in Chains
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,bf0f7e29-dfe1-416c-b5c6-f9ebc19ea810,Bee Gees
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,a1ed5e33-22ff-4e7d-a457-42f4309e135f,Aqua


In [9]:
artists.shape

(2000, 664)

In [265]:
# calculate the number of instances of each user in the training data
# this "count" gives us an idea of how much different music the user listens to (# of artists)
user_active = df_train.drop(['plays'],axis = 1).groupby(["user"]).count()
user_active.columns = ["counts"]
user_active['user'] = user_active.index
profiles  = profiles.merge(user_active, how="outer")
profiles.head()

Unnamed: 0,user,sex,age,country,counts
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25.0,Sweden,20
1,5909125332c108365a26ccf0ee62636eee08215c,m,29.0,Iceland,15
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30.0,United States,20
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21.0,Germany,14
4,02871cd952d607ba69b64e2e107773012c708113,m,24.0,Netherlands,16


In [266]:
profiles.shape

(233286, 5)

# Artist popularity

In [50]:
sum(artists['counts'])

4154804

In [12]:
# calculate the number of unique "listens by users" for each artist
item_popularity = df_train.drop(['plays'],axis = 1).groupby(["artist"]).count()
item_popularity.columns = ["counts"]
item_popularity["artist"] = item_popularity.index
artists = artists.merge(item_popularity,how = "outer")
artists.head()

Unnamed: 0,2 Tone,2 Tone (music genre),2-Tone,2-step garage,Acid house,Acid jazz,Acid rock,Acid techno,Acoustic hip hop,Acoustic music,...,Wonky pop,Word play,World Music,World music,Worldbeat,Yass (music),Yé-yé,artist,name,counts
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,Liars,847
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,69c4cc43-8163-41c5-ac81-30946d27bb69,CunninLynguists,1022
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4bd95eea-b9f6-4d70-a36c-cfea77431553,Alice in Chains,4725
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,bf0f7e29-dfe1-416c-b5c6-f9ebc19ea810,Bee Gees,1457
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,a1ed5e33-22ff-4e7d-a457-42f4309e135f,Aqua,681


# Run clustering algorithms on the artists

In [25]:
# cluster artists
artists_array = artists.drop(["name","artist"],axis = 1).values
len(artists_array)

2000

In [27]:
K = 200
kmeans_artists = KMeans(n_clusters=K, init='k-means++', n_init=10, max_iter=300)
labels_artists = kmeans_artists.fit_predict(artists_array)
labels_artists

array([ 20,   0, 103, ...,  51, 164,  86])

# Run clustering algorithms on the users

In [267]:
# Vectorize gender and countries information
cat_colmns = ['sex','country']
cat_df = profiles[cat_colmns]

# fill missing data with NAs since we have limited features with which to impute
cat_df = cat_df.fillna('NA')
cat_dict = cat_df.T.to_dict().values()

vectorizer = DV(sparse = False)
vec_x_cat_train = vectorizer.fit_transform(cat_dict)
print vec_x_cat_train.shape

(233286L, 242L)


In [268]:
# add those features and drop the originals
# we are one-hot encoding gender and country
profiles_vectorized = profiles.drop(['sex','country','user'],axis = 1)
vect_df = pd.DataFrame(data=vec_x_cat_train)
frames = [profiles_vectorized, vect_df]
profiles_vectorized = pd.concat(frames,axis = 1)
profiles_vectorized = profiles_vectorized.fillna(0)
profiles_vectorized.head()

Unnamed: 0,age,counts,0,1,2,3,4,5,6,7,...,232,233,234,235,236,237,238,239,240,241
0,25.0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,29.0,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,30.0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,21.0,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,24.0,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [269]:
# Cluster the users
K = 400
users_array = profiles_vectorized.values
kmeans_users = KMeans(n_clusters=K, init='k-means++', n_init=10, max_iter=300)
labels_users = kmeans_users.fit_predict(users_array)

In [270]:
len(labels_users) 

233286

In [271]:
# construct the user clusters
clusters_users = pd.DataFrame(data = profiles["user"])
clusters_users["cluster"] = labels_users

In [35]:
# construct the artists cluster
clusters_artists = pd.DataFrame(data = artists["artist"])
clusters_artists["cluster"] = labels_artists

In [272]:
clusters_users.to_csv("user_clusters.csv")
clusters_users.to_json("user_clusters.json")
clusters_users.head()

Unnamed: 0,user,cluster
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,202
1,5909125332c108365a26ccf0ee62636eee08215c,102
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,300
3,63268cce0d68127729890c1691f62d5be5abd87c,113
4,02871cd952d607ba69b64e2e107773012c708113,83


In [37]:
clusters_artists.to_csv("artists_clusters.csv")
clusters_artists.to_json("artists_clusters.json")
clusters_artists.head()

Unnamed: 0,artist,cluster
0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,20
1,69c4cc43-8163-41c5-ac81-30946d27bb69,0
2,4bd95eea-b9f6-4d70-a36c-cfea77431553,103
3,bf0f7e29-dfe1-416c-b5c6-f9ebc19ea810,128
4,a1ed5e33-22ff-4e7d-a457-42f4309e135f,113


In [None]:
# split the data frame by clusters
gb = clusters_artists.groupby(['cluster'])
artists_groups = [gb.get_group(x) for x in gb.groups]

In [273]:
gb = clusters_users.groupby(['cluster'])
users_groups = [gb.get_group(x) for x in gb.groups]

In [275]:
# read in users' clusters
clusters_users_read = pd.read_csv("user_clusters.csv").drop(["Unnamed: 0"],axis = 1)
clusters_users_read.head()

Unnamed: 0,user,cluster
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,202
1,5909125332c108365a26ccf0ee62636eee08215c,102
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,300
3,63268cce0d68127729890c1691f62d5be5abd87c,113
4,02871cd952d607ba69b64e2e107773012c708113,83


# Calculate each user's median/mean views

In [276]:
# Predict via the user-specific median.
# If the user has no data, use the global median.

train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'user_median.csv'

# Load the training data.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]
    
        if not user in train_data:
            train_data[user] = {}
        
        train_data[user][artist] = int(plays)

# Compute the global median and per-user medians/means.
plays_array  = []
user_medians = {}
user_means = {}
for user, user_data in train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
    user_means[user] = np.mean(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [277]:
# Find the user's cluster: Construct a dict that you input the cluster and return users in this cluster
user_cluster_dict = clusters_users.set_index(['user'])['cluster'].to_dict()

# find the similar users in the same cluster
groups_lookup = {}
gb = clusters_users.groupby(['cluster'])
clusters_groups = [gb.get_group(x) for x in gb.groups]

for index in range(len(clusters_groups)):
    groups_lookup[index] = [item[0] for item in clusters_groups[index].set_index(["cluster"]).values]


In [278]:
# find all users who listened to the same artist (key = artist, value = list of users who listened to artist)
gb = df_train.drop(["plays"],axis = 1).groupby(["artist"])
users_listened_groups = [gb.get_group(x) for x in gb.groups]

# put in the artist's name, return the users who have listened to this artist
users_lookup = {}
for index in range(len(users_listened_groups)):
    users_lookup[users_listened_groups[index].set_index(['artist']).index[0]] =[item[0] for item in users_listened_groups[index].set_index(['artist']).values]

In [279]:
# find all artists listened to by some user (key = user, value = list of artists he listened to)
gb = df_train.drop(["plays"],axis = 1).groupby("user")
users_plays = [gb.get_group(x) for x in gb.groups]

# Values: list of artists he/she listened to before
users_listened = {}
for index in range(len(users_plays)):
    users_listened[users_plays[index].set_index(['user']).index[0]] = [item[0] for item in users_plays[index].set_index(["user"]).values]

In [282]:
# Here is the function we calculates the similartity beween two users
from scipy.stats.stats import pearsonr as pr
def similarity_users(user1, user2):
    rate_u1 = []
    rate_u2 = []
    
    # Extract artists listened to by both users
    simi_1 = users_listened[user1]
    simi_2 = users_listened[user2]
    
    # find co-listened-to items
    inter = list(set(simi_1) & set(simi_2))
    
    # if there is no co-listened-to item, no similarity
    if(len(inter) == 0):
        return 0
    else:
        # build lists containing number of plays by users for each co-listened-to artist
        for item in inter:
            rate_u1.append(train_data[user1][item])
            rate_u2.append(train_data[user2][item])

    # Pearson coefficient r for the number of times each user played a given artist
    corrxy = pr(rate_u1,rate_u2)
    if (np.isnan(corrxy[0])):
        return 0

    if(corrxy[0] < 0):
        return 0
    
    # we regularize our similarity to make our plays predictions converge to user average 
    # this is to avoid over-fitting to our training data
    # this is especially important because there are usually few co-listened to authors
    
    return corrxy[0] * 0.6
    #return corrxy[0]

# Training Predictions via cross-validation

In [310]:
# 40% of data for validation set
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(df_train.shape[0]), train_size=0.6)
indexes = np.ones(df_train.shape[0], dtype='int')
indexes[itrain] = 1
indexes[itest] = 0
indexes = (indexes == 1)
indexes

array([ True, False, False, ...,  True,  True,  True], dtype=bool)

In [311]:
# This is to predict # of plays
# at a high level the goal is to leverage our clusterings of users
# we look at only the users who are in the same cluster and 
# who have listened to the given artist
# we then take a weighted-mean-deviation of each of the similar users' plays of the artist, weighted by the similarity between the user
# we are predicting on and the users in the training data. 
# We subtract this weighted
def predict_with_clusters(user_list, artist_list):
    r_exp = []
    index = 0
    n = len(user_list)

    for i in range(n):
        # find the median plays for user i
        if user_list[i] in user_medians:
            user_med = user_medians[user_list[i]]
        # if there's no median for the user, use global median
        else:
            user_med =  global_median

        # Here I get the clusters
        cluster = user_cluster_dict[user_list[i]]
        # find other users in that cluster
        similar_users = groups_lookup[cluster]

        # find all of the users who played the artists' song
        user_listened  = users_lookup[artist_list[i]]

        # find intersection of two groups above (those who are similar and those who listened to the artist previously)
        inter = list(set(similar_users) & set(user_listened))

        # for all users in the intersection, calculate the weighted play difference
        # between the intersection-users and user i based on similarity between users
        denom = 0
        numer = 0
        for j in inter:
            # calculate similarity between user j (those users in the intersection) and user i
            similarity = similarity_users(j,user_list[i])
            if(similarity == 0):
                continue
            # denominator should be normalized to 1 for regularization - prevents overfitting
            denom += abs(similarity) / 0.6
            # denom += abs(similarity)
            
            # find user j's average if it exists
            if j in user_medians:
                j_avg = user_medians[j]
            else:
                j_avg = global_median
            
            # difference between the number of times user j listened to artist i and the average listens by user j
            r_diff = train_data[j][artist_list[i]] - j_avg
            
            # weight the difference in the number of listens by similarity of users
            numer += similarity * r_diff

        # show progress
        if(i % 100000 == 0):
            print i

        # avoid division by 0 issues
        #print numer
        if denom != 0:
            #print user_med
            #print float(numer)/denom
            r_exp.append(max(user_med + float(numer)/denom, 1.0))
        else:
            r_exp.append(user_med)
    return r_exp

In [312]:
new_train = df_train[indexes]
new_test = df_train[indexes == False]

test_label = new_test["plays"].values
len(new_test)

1661922

In [313]:
user_list = new_test['user'].values
artist_list = new_test['artist'].values
len(user_list)

1661922

In [314]:
pred_result = predict_with_clusters(user_list, artist_list)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000


In [315]:
pred_result

[19.0,
 127.95154813603156,
 236.84999999999997,
 468.63646451979446,
 209.68188152234373,
 4.0271726292282608,
 16.600000000000001,
 40.420562484075155,
 24.305835984910271,
 139.25,
 90.871205987628102,
 42.200000000000003,
 1.0,
 191.30019392065566,
 587.86806285229272,
 167.66505372070486,
 358.89999999999998,
 191.52358716770564,
 51.600000000000001,
 23.375,
 193.16249814883352,
 325.61747017335119,
 351.60000000000002,
 31.041533072897707,
 41.111000676737618,
 88.599999999999994,
 51.18573057377975,
 135.16395555972576,
 141.59999999999999,
 156.67936754323091,
 256.12523270386998,
 40.600000000000001,
 21.63155269567406,
 360.09379913182659,
 2.0,
 325.60000000000002,
 50.410825805125114,
 46.945692501103039,
 121.93520422842552,
 76.821553586341096,
 624.84374942072645,
 128.96106997038484,
 21.824252987788,
 211.50545954534081,
 465.09199709497523,
 243.0,
 210.71491397018283,
 75.0,
 492.6898289224236,
 359.90683412205703,
 639.20000000000005,
 1.0,
 212.0,
 99.799999999999

In [197]:
save_preds = pred_result[:]
save_preds

[371.3347260441617,
 -95.71875,
 93.207049938176439,
 274.04166666666669,
 1134.3177373551184,
 227.65520362273284,
 209.04567924102588,
 -109.91316682696961,
 -5.3076923076923066,
 -60.143675229079292,
 130.26499771799686,
 -42.789321550017199,
 410.93333333333334,
 95.144883466476927,
 243.79166666666669,
 335.14999999999998,
 385.72013496103085,
 255.90922033359368,
 54.928571428571431,
 272.51153219995638,
 9.2777777777777715,
 51.816666666666663,
 365.65587816221404,
 628.87004235105871,
 -27.475212416057786,
 54.124300347496508,
 214.60000000000002,
 34.61904761904762,
 528.90625,
 -34.282608695652158,
 -45.241542264309459,
 93.316738226102743,
 69.599645702709992,
 22.882352941176471,
 129.76425795566172,
 498.75657335185144,
 95.327962041705817,
 -12.709073999370723,
 177.77038574524391,
 262.7379714895298,
 -33.033966628197369,
 -245.03634006788457,
 217.84615384615384,
 800.06941462692248,
 32.769617454926617,
 348.69548953873084,
 -125.32857142857142,
 181.78571428571428,
 2

In [316]:
error = mean_absolute_error(pred_result, test_label)
print error

106.546404092


# cross-validation results

K = 400, reg w/ 0.6 --> 106.5464 Mean absolute error

K = 500, unreg --> 106.2873 Mean absolute error

K = 600, reg w/ 0.6 --> 108.952 Mean absolute error

K = 600, unreg --> 107.1245 Mean absolute error

# Testing predictions (for kaggle)

In [301]:
user_list_test = df_test['user'].values
artist_list_test = df_test['artist'].values
len(user_list_test)

4154804

In [302]:
pred_result_test = predict_with_clusters(user_list_test, artist_list_test)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000


In [303]:
pred_result_test

[3263.0,
 143.5,
 207.0,
 307.0,
 109.0,
 111.40000000000001,
 366.06,
 596.79999999999995,
 505.30000000000001,
 148.0,
 142.25,
 287.0,
 7.4000000000000004,
 72.5,
 295.5,
 147.51973187781917,
 85.5,
 476.5,
 156.41458881157016,
 131.88868100277227,
 46.899999999999999,
 73.0,
 108.36321587894547,
 481.0,
 333.61638028522987,
 144.0,
 50.587037758046264,
 12.5,
 48.348344964899347,
 228.5,
 99.0,
 22.0,
 112.29573423406789,
 200.84888968054554,
 40.5,
 148.59999999999999,
 23.684889368412684,
 33.5,
 332.55000000000001,
 328.19999999999999,
 27.0,
 65.0,
 142.45759025356432,
 94.169748386139801,
 132.0,
 520.0,
 364.90601375503121,
 215.69999999999999,
 95.400000000000006,
 156.13559988636143,
 183.0,
 50.899999999999999,
 167.0,
 809.0,
 183.97567063640668,
 148.0,
 138.69999999999999,
 91.766967125785698,
 286.78006477597148,
 54.0,
 31.0,
 209.83744173778905,
 77.159854443729429,
 125.01128142680037,
 145.0595751520284,
 496.3842174814427,
 29.5,
 139.09370322019399,
 17.0,
 222.3

In [304]:
len(pred_result_test)

4154804

# Write predictions to file

In [305]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,plays\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [306]:
filename = "predictions_P3.csv"
write_to_file(filename,pred_result_test)

In [309]:
len(set(labels_users))

400