In [2]:
import pandas as pd

In [3]:
df_rate = pd.read_csv('../data/play_rate.csv')

In [4]:
df_rate.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,uid,song_id,play_ratio,freq,user_id_map,song_id_map
0,0,0,12333,703693.0,0.821429,12,0,0
1,1,1,12333,708428.0,9.682796,13,0,1
2,2,2,12333,875447.0,8.418803,11,0,2
3,3,3,12333,5114569.0,7.80212,4092,0,3
4,4,4,12333,5237384.0,8.0,6079,0,4


In [7]:
condition1 = df_rate['play_ratio'] >= 0
condition2 = df_rate['freq'] >= 10
df_rate = df_rate[condition1 & condition2]

In [8]:
df_rate.size

10014056

In [9]:
len(df_rate.song_id.unique())

20814

In [11]:
from scipy import sparse
highest_user_id = df_rate.user_id_map.max() 
highest_song_id = df_rate.song_id_map.max() 
ratings_mat = sparse.lil_matrix((highest_user_id, highest_song_id)) 
ratings_mat

<49816x20813 sparse matrix of type '<type 'numpy.float64'>'
	with 0 stored elements in LInked List format>

In [13]:
for _, row in df_rate.iterrows():
    # subtract 1 from id's due to match 0 indexing 
    ratings_mat[row.user_id_map-1, row.song_id_map-1] = row.play_ratio

In [14]:
from sklearn.decomposition import TruncatedSVD

In [15]:
def fit_uvd(M,k):
    # use TruncatedSVD to realize UVD 
    svd = TruncatedSVD(n_components=k, n_iter=7, random_state=0) 
    svd.fit(M)

    V = svd.components_ 
    U = svd.transform(M)
    
    return U,V, svd

U,V,svd = fit_uvd(ratings_mat,200)

In [16]:
print(U.shape,V.shape)

((49816, 200), (200, 20813))


In [18]:
import numpy as np
# reconstruct 
ratings_mat_fitted = U.dot(V) # U*V
errs = np.array((ratings_mat-ratings_mat_fitted).flatten()).squeeze() 
mask = np.array((ratings_mat.todense()).flatten()).squeeze()>0

# calculate errs
mse = np.mean(errs[mask]**2) 
average_abs_err = abs(errs[mask]).mean() 
print(mse) 
print(average_abs_err)

704.7594953676677
2.229688318700422


In [19]:
# compare with another way to reconstruct matrix # with the above "tranformed to the new space and back" language # without the UV language, we can do:

# reconstruct M 
with inverse_transform 
ratings_mat_fitted_2 = svd.inverse_transform(svd.transform(ratings_mat)) 
ratings_mat_fitted = U.dot(V) 
print(sum(sum(ratings_mat_fitted - ratings_mat_fitted_2)))

0.0


In [20]:
user_id = 12311
n = 10

pred_ratings = ratings_mat_fitted[user_id,:] 
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

[183, 2215, 1833, 3579, 578, 2373, 2109, 790, 252, 10217]

In [22]:
df_song_info = pd.read_csv('../data/play/song_info.csv')
df_song_info = df_song_info.drop(['Unnamed: 0'],axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
recommend_list = []
for i in range(10):
    song_map_id = unrated_items_by_pred_rating[i]
    song_id = df_rate[df_rate['song_id_map'] == song_map_id]['song_id'].unique()[0]
    recommend_list.append(df_song_info[df_song_info['song_id'] == song_id].iloc[0])
recommend_list

[song_id      6.61337e+06
 song_type              0
 song_name        My Type
 singer           iKON[韩]
 Name: 37402, dtype: object, song_id        462233
 song_type           2
 song_name    我和草原有个约定
 singer           降央卓玛
 Name: 2546, dtype: object, song_id      6.63714e+06
 song_type              0
 song_name     酒干倘卖无(现场版)
 singer               汤晶锦
 Name: 15689, dtype: object, song_id         7.08001e+06
 song_type                 1
 song_name    你开心所以我快乐(DJ何鹏)
 singer             司徒兰芳&安东阳
 Name: 86469, dtype: object, song_id      7.01065e+06
 song_type              0
 song_name           虎口脱险
 singer                老狼
 Name: 8554, dtype: object, song_id      3.42906e+06
 song_type              0
 song_name         男人酒女人泪
 singer                陈瑞
 Name: 132879, dtype: object, song_id      5.9125e+06
 song_type             1
 song_name    我的天空(小黄人版)
 singer             网络歌手
 Name: 113646, dtype: object, song_id      1.09788e+06
 song_type              0
 song_name             兄弟
 s