# Import the Library

In [1]:
import pandas as pd
import numpy as np

import pathlib
import os
import pyarrow

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import math
from sklearn.preprocessing import MinMaxScaler

import umap
#import umap.plot
import hdbscan

import pickle
import matplotlib.pyplot as plt

# Memory reduction

In [2]:
def reduce_memory_usage(df):

    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

# Read each file to dataframe

In [3]:
song_attributes = reduce_memory_usage(pd.read_csv(
    "song-attributes.txt", sep='\t', header=None, names=['song id', 'album id', 'artist id', 'genre id']))
genre_hierarchy = pd.read_csv("genre-hierarchy.txt", sep='\t', header=None, names=[
                              'genre id', 'parent genre id', 'level', 'genre name'])
for i in range(0, 10):
    globals()[f'train_{i}'] = reduce_memory_usage(pd.read_csv(
        f"train_{i}.txt", sep='\t', header=None, names=['user id', 'song id', 'rating']))

Memory usage of dataframe is 4.1729736328125 MB
Memory usage of dataframe after reduction 1.30413818359375 MB
Reduced by 68.74798888401345 % 
Memory usage of dataframe is 1747.3899612426758 MB
Memory usage of dataframe after reduction 655.2713117599487 MB
Reduced by 62.49999563383406 % 
Memory usage of dataframe is 1756.495704650879 MB
Memory usage of dataframe after reduction 658.6859655380249 MB
Reduced by 62.499995656468435 % 
Memory usage of dataframe is 1760.3262252807617 MB
Memory usage of dataframe after reduction 660.122410774231 MB
Reduced by 62.49999566592009 % 
Memory usage of dataframe is 1760.689826965332 MB
Memory usage of dataframe after reduction 660.2587614059448 MB
Reduced by 62.499995666815124 % 
Memory usage of dataframe is 1760.467514038086 MB
Memory usage of dataframe after reduction 660.1753940582275 MB
Reduced by 62.49999566626793 % 
Memory usage of dataframe is 1758.2516403198242 MB
Memory usage of dataframe after reduction 659.3444414138794 MB
Reduced by 62.49

# save as feather

In [4]:
song_attributes.to_feather('song_attributes.feather')
genre_hierarchy.to_feather('genre_hierarchy.feather')
for i in range(0, 10):
    globals()[f'train_{i}'].to_feather(f'train_{i}.feather')

# re-read feather files

In [5]:
song_attributes = pd.read_feather('song_attributes.feather')
genre_hierarchy = pd.read_feather('genre_hierarchy.feather')

In [6]:
train_0 = pd.read_feather('train_0.feather')
dummy = pd.merge(train_0, song_attributes, on='song id')
genre_hierarchy['large_category'] = pd.Series()
genre_hierarchy['large_category_genre_id']= pd.Series()

  genre_hierarchy['large_category'] = pd.Series()
  genre_hierarchy['large_category_genre_id']= pd.Series()


In [7]:
for i in range(len(genre_hierarchy)):
    if genre_hierarchy.loc[i].level == 1:
        genre_hierarchy['large_category'][i] = genre_hierarchy.loc[i]['genre name']
        genre_hierarchy['large_category_genre_id'][i] = genre_hierarchy.loc[i]['genre id']
    elif genre_hierarchy.loc[i].level == 2:
        genre_hierarchy['large_category'][i] = genre_hierarchy.loc[genre_hierarchy['parent genre id'][i]]['genre name']
        genre_hierarchy['large_category_genre_id'][i] = genre_hierarchy.loc[genre_hierarchy['parent genre id'][i]]['genre id']
    elif genre_hierarchy.loc[i].level == 3:
        genre_hierarchy['large_category'][i] = genre_hierarchy.loc[genre_hierarchy['parent genre id']
                                                                   [i]]['large_category']
        genre_hierarchy['large_category_genre_id'][i] = genre_hierarchy.loc[genre_hierarchy['parent genre id']
                                                                   [i]]['large_category_genre_id']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_hierarchy['large_category'][i] = genre_hierarchy.loc[i]['genre name']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_hierarchy['large_category_genre_id'][i] = genre_hierarchy.loc[i]['genre id']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

# merging the files

In [8]:
df = pd.merge(dummy, genre_hierarchy, on='genre id')
df.rename(columns={"user id": 'user_id', 'song id': 'song_id', 'album id': 'album_id',
                   'artist id': 'artist_id','genre id': 'genre_id', 
                   'parent genre id': 'parent_genre_id',
                   'genre name': 'genre_name'}, inplace=True)
df['large_category_genre_id'] = df['large_category_genre_id'].astype(int)

In [9]:
df.to_feather('rating with metadata_train1.feather')

In [10]:
df

Unnamed: 0,user_id,song_id,rating,album_id,artist_id,genre_id,parent_genre_id,level,genre_name,large_category,large_category_genre_id
0,0,166,5,5303,7231,0,0,1,Unknown,Unknown,0
1,2941,166,5,5303,7231,0,0,1,Unknown,Unknown,0
2,3476,166,5,5303,7231,0,0,1,Unknown,Unknown,0
3,3748,166,1,5303,7231,0,0,1,Unknown,Unknown,0
4,4086,166,2,5303,7231,0,0,1,Unknown,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
76344622,192534,10299,4,7470,8975,144,134,2,Hard Rock,Rock,134
76344623,194427,10299,1,7470,8975,144,134,2,Hard Rock,Rock,134
76344624,198426,10299,1,7470,8975,144,134,2,Hard Rock,Rock,134
76344625,199184,10299,4,7470,8975,144,134,2,Hard Rock,Rock,134


# select 10000 user as randomly

In [11]:
df = df[df['user_id']<10000]

In [12]:
len(df)

3732463

# Unknown genre->Clustering

In [13]:
unknown_df = pd.DataFrame(df[df['large_category_genre_id'] == 0])

In [14]:
print('Total unique users in the dataset', unknown_df['user_id'].nunique())
print('Total unique songs in the dataset', unknown_df['song_id'].nunique())
print('Total unique artists in the dataset', unknown_df['artist_id'].nunique())
print('Total unique albums in the dataset', unknown_df['album_id'].nunique())

Total unique users in the dataset 10000
Total unique songs in the dataset 118204
Total unique artists in the dataset 9278
Total unique albums in the dataset 18312


In [15]:
temp = unknown_df.groupby(['user_id','album_id'])['rating'].agg(['mean','count']).reset_index()
temp['rating'] = temp['mean'] + 0.5 * np.sqrt(temp['count'])

In [16]:
temp.drop({'count', 'mean'}, axis=1,inplace = True)

In [17]:
temp[temp.album_id == 7470]

Unnamed: 0,user_id,album_id,rating
61172,263,7470,4.5
173226,828,7470,3.5
183346,882,7470,2.5
261876,1319,7470,3.5
355060,1783,7470,3.118034
434812,2175,7470,1.5
501687,2549,7470,4.25
550784,2796,7470,2.5
712659,3626,7470,1.5
795114,4066,7470,3.5


In [18]:
uid_to_nuid = {uid: nuid for nuid, uid in enumerate(temp["user_id"].unique())}
nuid_to_uid = {nuid: uid for uid, nuid in uid_to_nuid.items()}
nuid = temp["user_id"].apply(lambda x: uid_to_nuid[x]).values

In [19]:
aid_to_naid = {aid: naid for naid, aid in enumerate(temp["album_id"].unique())}
naid_to_aid = {naid: aid for aid, naid in aid_to_naid.items()}
naid = temp["album_id"].apply(lambda x: aid_to_naid[x]).values

In [20]:
"""
row_pos= naid
col_pos = nuid
data = df["rating"].values
"""
csr_matrix = sparse.csr_matrix((temp["rating"].values, (naid,nuid)))
csr_matrix

<18312x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1969195 stored elements in Compressed Sparse Row format>

In [21]:
similarity = cosine_similarity(csr_matrix)

In [22]:
similarity_df = pd.DataFrame(similarity)
similarity_df.index = [naid_to_aid[index] for index in similarity_df.index.to_list()]
similarity_df.columns = [naid_to_aid[index] for index in similarity_df.columns.to_list()]
similarity_df

Unnamed: 0,910,2517,5303,8324,10686,10758,14290,15761,16694,16721,...,5378,8523,10748,14438,9532,4797,539,10565,8640,915
910,1.000000,0.354966,0.126543,0.061369,0.254352,0.248518,0.401366,0.207670,0.375801,0.433119,...,0.025409,0.035934,0.035934,0.0,0.0,0.006265,0.0,0.0,0.013492,0.027185
2517,0.354966,1.000000,0.067642,0.018219,0.339252,0.277733,0.365938,0.192717,0.316942,0.349107,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
5303,0.126543,0.067642,1.000000,0.037279,0.047945,0.028583,0.063936,0.065839,0.047442,0.054861,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
8324,0.061369,0.018219,0.037279,1.000000,0.045720,0.014777,0.017424,0.036957,0.051683,0.078057,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
10686,0.254352,0.339252,0.047945,0.045720,1.000000,0.294200,0.350262,0.179885,0.237464,0.272917,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.035449,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4797,0.006265,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.447767,0.000000,0.000000,0.0,0.0,1.000000,0.0,0.0,0.000000,0.000000
539,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1.0,0.0,0.000000,0.000000
10565,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,1.0,0.000000,0.000000
8640,0.013492,0.000000,0.000000,0.000000,0.035449,0.000000,0.022147,0.049720,0.026642,0.021066,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,1.000000,0.000000


In [23]:
del unknown_df,similarity, csr_matrix

In [24]:
embedding = umap.UMAP(n_neighbors=25,n_components=5,min_dist=0.0, random_state=20210611).fit_transform(similarity_df)

In [25]:
labels = hdbscan.HDBSCAN(
    min_samples=50,
    min_cluster_size=100,
).fit_predict(embedding)

In [26]:
unique, counts = np.unique(labels, return_counts = True)
uniq_cnt_dict = dict(zip(unique, counts))

In [27]:
similarity_df['large_category_genre_id']= labels
unknown_genre_get = similarity_df.iloc[:,-1:]
unknown_genre_get['genre_name'] = unknown_genre_get['large_category_genre_id'].apply(lambda x : f"Unknown_{x+1}")
unknown_genre_get.large_category_genre_id +=  217

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_genre_get['genre_name'] = unknown_genre_get['large_category_genre_id'].apply(lambda x : f"Unknown_{x+1}")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [28]:
tttt=unknown_genre_get.reset_index()
tttt.rename({'index':'album_id'}, axis=1 ,inplace=True)

In [29]:
dict_tttt = dict(zip(tttt.album_id.values,tttt.genre_name.values))
df.loc[df.large_category == "Unknown", "category"]= df.loc[df.large_category == "Unknown"].album_id.apply(lambda x: dict_tttt[x])
df.loc[df.large_category != "Unknown", "category"] = df.loc[df.large_category != "Unknown", "large_category"]

dict_tttt2 = dict(zip(tttt.album_id.values,tttt.large_category_genre_id.values))
df.loc[df.large_category == "Unknown", "category_id"]= df.loc[df.large_category == "Unknown"].album_id.apply(lambda x: dict_tttt2[x])
df.loc[df.large_category != "Unknown", "category_id"] = df.loc[df.large_category != "Unknown", "large_category_genre_id"]

df.drop({'large_category','large_category_genre_id'},axis=1,inplace=True)
df.rename({'category' : 'large_category', 'category_id' : 'large_category_genre_id'},axis =1 , inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

In [30]:
with open('df_after_preprocessing.pickle', 'wb') as outfile:
    pickle.dump(df, outfile)

In [31]:
df

Unnamed: 0,user_id,song_id,rating,album_id,artist_id,genre_id,parent_genre_id,level,genre_name,large_category,large_category_genre_id
0,0,166,5,5303,7231,0,0,1,Unknown,Unknown_0,216.0
1,2941,166,5,5303,7231,0,0,1,Unknown,Unknown_0,216.0
2,3476,166,5,5303,7231,0,0,1,Unknown,Unknown_0,216.0
3,3748,166,1,5303,7231,0,0,1,Unknown,Unknown_0,216.0
4,4086,166,2,5303,7231,0,0,1,Unknown,Unknown_0,216.0
...,...,...,...,...,...,...,...,...,...,...,...
76344507,1783,10299,2,7470,8975,144,134,2,Hard Rock,Rock,134.0
76344508,2549,10299,5,7470,8975,144,134,2,Hard Rock,Rock,134.0
76344509,4066,10299,3,7470,8975,144,134,2,Hard Rock,Rock,134.0
76344510,8231,10299,1,7470,8975,144,134,2,Hard Rock,Rock,134.0


# Our algorithm

##### 1,category의 unknown을 'rating', '들은 수'를 이용해 분류한다. 

In [32]:
df=df[['user_id','song_id','rating','large_category']]

In [84]:
df[df['user_id']==0]

Unnamed: 0,user_id,song_id,rating,large_category
0,0,166,5,Unknown_0
289,0,2245,4,Unknown_22
10202,0,3637,4,Unknown_20
11513,0,5580,4,Unknown_16
11788,0,5859,4,Unknown_0
12002,0,7121,3,Unknown_0
12230,0,10405,4,Unknown_0
12494,0,16794,5,Unknown_0
34875,0,21252,4,Unknown_20
36172,0,27331,5,Unknown_20


##### 2, 각 category 별로 평균점수를 계산한다.

In [33]:
category_mean = df.groupby(['user_id','large_category'],as_index=False).mean().drop(['song_id'],axis=1)

In [34]:
category_mean

Unnamed: 0,user_id,large_category,rating
0,0,Rock,4.500000
1,0,Unknown_0,4.000000
2,0,Unknown_16,4.000000
3,0,Unknown_20,4.583333
4,0,Unknown_22,4.200000
...,...,...,...
125241,9999,Unknown_18,4.250000
125242,9999,Unknown_19,3.000000
125243,9999,Unknown_22,3.800000
125244,9999,Unknown_4,1.000000


In [83]:
category_mean[category_mean['user_id']==0]

Unnamed: 0,user_id,large_category,rating
0,0,Rock,4.5
1,0,Unknown_0,4.0
2,0,Unknown_16,4.0
3,0,Unknown_20,4.583333
4,0,Unknown_22,4.2


In [35]:
category_count = df.groupby(['user_id','large_category'],as_index=False).count()

In [36]:
category_count

Unnamed: 0,user_id,large_category,song_id,rating
0,0,Rock,2,2
1,0,Unknown_0,14,14
2,0,Unknown_16,1,1
3,0,Unknown_20,12,12
4,0,Unknown_22,5,5
...,...,...,...,...
125241,9999,Unknown_18,4,4
125242,9999,Unknown_19,1,1
125243,9999,Unknown_22,10,10
125244,9999,Unknown_4,1,1


In [37]:
category_df = pd.merge(category_mean,category_count,on=['user_id','large_category'])

In [38]:
category_df=category_df.drop(['rating_y'],axis=1)
category_df.columns=['user_id','large_category','rating','number']
category_df

Unnamed: 0,user_id,large_category,rating,number
0,0,Rock,4.500000,2
1,0,Unknown_0,4.000000,14
2,0,Unknown_16,4.000000,1
3,0,Unknown_20,4.583333,12
4,0,Unknown_22,4.200000,5
...,...,...,...,...
125241,9999,Unknown_18,4.250000,4
125242,9999,Unknown_19,3.000000,1
125243,9999,Unknown_22,3.800000,10
125244,9999,Unknown_4,1.000000,1


In [85]:
category_df[category_df['user_id']==0]

Unnamed: 0,user_id,large_category,rating,number,upperbound
0,0,Rock,4.5,2,4.641421
1,0,Unknown_0,4.0,14,4.374166
2,0,Unknown_16,4.0,1,4.1
3,0,Unknown_20,4.583333,12,4.929743
4,0,Unknown_22,4.2,5,4.423607


In [39]:
# upperbound 주기

In [40]:
category_df['upperbound']=category_df['rating']+0.1*(category_df['number']**(1/2))

In [41]:
category_df.sort_values(by='upperbound', ascending=False)
category_df.sort_values(by='user_id')

Unnamed: 0,user_id,large_category,rating,number,upperbound
0,0,Rock,4.500000,2,4.641421
1,0,Unknown_0,4.000000,14,4.374166
2,0,Unknown_16,4.000000,1,4.100000
3,0,Unknown_20,4.583333,12,4.929743
4,0,Unknown_22,4.200000,5,4.423607
...,...,...,...,...,...
125237,9999,Unknown_0,4.875000,8,5.157843
125235,9999,Reggae,4.500000,2,4.641421
125244,9999,Unknown_4,1.000000,1,1.100000
125239,9999,Unknown_11,4.875000,8,5.157843


In [42]:
df_user_category= category_df.loc[category_df.groupby(['user_id'])['upperbound'].idxmax()]
df_user_category

Unnamed: 0,user_id,large_category,rating,number,upperbound
3,0,Unknown_20,4.583333,12,4.929743
12,1,Unknown_10,5.000000,1,5.100000
29,2,Unknown_10,3.571429,84,4.487944
33,3,Country,4.000000,1,4.100000
47,4,R&B,5.000000,1,5.100000
...,...,...,...,...,...
125206,9995,Unknown_1,5.000000,1,5.100000
125222,9996,Unknown_10,4.920000,100,5.920000
125223,9997,Pop,5.000000,1,5.100000
125234,9998,Unknown_11,4.000000,1,4.100000


##### 3, 이 dataframe으로 svd를 한번 돌린다

In [43]:
#df_user_category = category_mean.pivot(index='user_id',
 #                                     columns='large_category',
  #                                    values='rating').fillna(0)

In [44]:
#df_user_category

In [45]:
#df_user_category.shape

In [46]:
#df_array = df_user_category.to_numpy()
#df_array[:5]

In [47]:
import numpy as np


class MatrixFactorization():
    def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
        """
        :param R: rating matrix
        :param k: latent parameter
        :param learning_rate: alpha on weight update
        :param reg_param: beta on weight update
        :param epochs: training epochs
        :param verbose: print status
        """

        self._R = R
        self._num_users, self._num_items = R.shape
        self._k = k
        self._learning_rate = learning_rate
        self._reg_param = reg_param
        self._epochs = epochs
        self._verbose = verbose


    def fit(self):
        """
        training Matrix Factorization : Update matrix latent weight and bias

        참고: self._b에 대한 설명
        - global bias: input R에서 평가가 매겨진 rating의 평균값을 global bias로 사용
        - 정규화 기능. 최종 rating에 음수가 들어가는 것 대신 latent feature에 음수가 포함되도록 해줌.

        :return: training_process
        """

        # init latent features
        self._P = np.random.normal(size=(self._num_users, self._k))
        self._Q = np.random.normal(size=(self._num_items, self._k))

        # init biases
        self._b_P = np.zeros(self._num_users)
        self._b_Q = np.zeros(self._num_items)
        self._b = np.mean(self._R[np.where(self._R != 0)])

        # train while epochs
        self._training_process = []
        for epoch in range(self._epochs):

            # rating이 존재하는 index를 기준으로 training
            for i in range(self._num_users):
                for j in range(self._num_items):
                    if self._R[i, j] > 0:
                        self.gradient_descent(i, j, self._R[i, j])
            cost = self.cost()
            self._training_process.append((epoch, cost))

            # print status
            if self._verbose == True and ((epoch + 1) % 10 == 0):
                print("Iteration: %d ; cost = %.4f" % (epoch + 1, cost))


    def cost(self):
        """
        compute root mean square error
        :return: rmse cost
        """

        # xi, yi: R[xi, yi]는 nonzero인 value를 의미한다.
        # 참고: http://codepractice.tistory.com/90
        xi, yi = self._R.nonzero()
        predicted = self.get_complete_matrix()
        cost = 0
        for x, y in zip(xi, yi):
            cost += pow(self._R[x, y] - predicted[x, y], 2)
        return np.sqrt(cost) / len(xi)


    def gradient(self, error, i, j):
        """
        gradient of latent feature for GD

        :param error: rating - prediction error
        :param i: user index
        :param j: item index
        :return: gradient of latent feature tuple
        """

        dp = (error * self._Q[j, :]) - (self._reg_param * self._P[i, :])
        dq = (error * self._P[i, :]) - (self._reg_param * self._Q[j, :])
        return dp, dq


    def gradient_descent(self, i, j, rating):
        """
        graident descent function

        :param i: user index of matrix
        :param j: item index of matrix
        :param rating: rating of (i,j)
        """

        # get error
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # update biases
        self._b_P[i] += self._learning_rate * (error - self._reg_param * self._b_P[i])
        self._b_Q[j] += self._learning_rate * (error - self._reg_param * self._b_Q[j])

        # update latent feature
        dp, dq = self.gradient(error, i, j)
        self._P[i, :] += self._learning_rate * dp
        self._Q[j, :] += self._learning_rate * dq


    def get_prediction(self, i, j):
        """
        get predicted rating: user_i, item_j
        :return: prediction of r_ij
        """
        return self._b + self._b_P[i] + self._b_Q[j] + self._P[i, :].dot(self._Q[j, :].T)


    def get_complete_matrix(self):
        """
        computer complete matrix PXQ + P.bias + Q.bias + global bias

        - PXQ 행렬에 b_P[:, np.newaxis]를 더하는 것은 각 열마다 bias를 더해주는 것
        - b_Q[np.newaxis:, ]를 더하는 것은 각 행마다 bias를 더해주는 것
        - b를 더하는 것은 각 element마다 bias를 더해주는 것

        - newaxis: 차원을 추가해줌. 1차원인 Latent들로 2차원의 R에 행/열 단위 연산을 해주기위해 차원을 추가하는 것.

        :return: complete matrix R^
        """
        return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis:, ] + self._P.dot(self._Q.T)


    def print_results(self):
        """
        print fit results
        """

        print("User Latent P:")
        print(self._P)
        print("Item Latent Q:")
        print(self._Q.T)
        print("P x Q:")
        print(self._P.dot(self._Q.T))
        print("bias:")
        print(self._b)
        print("User Latent bias:")
        print(self._b_P)
        print("Item Latent bias:")
        print(self._b_Q)
        print("Final R matrix:")
        print(self.get_complete_matrix())
        print("Final RMSE:")
        print(self._training_process[self._epochs-1][1])

##### 4, 각 평균점수에 upper bound를 줘서 가장 큰 값을 가지는 category를 구한다.

In [48]:
# 위에서 이미 함

In [49]:
#df_category_svd=pd.DataFrame(category_svd,
 #                           index=df_user_category.index,
  #                          columns=df_user_category.columns)

In [50]:
#df_category_svd

In [52]:
#category_df = pd.merge(category_df,category_df.groupby('user_id').sum()['number'],on='user_id')

In [53]:
#upper bound 주기
#alpha = 0.5

#for i in df_category_svd.index:
 #   for j in df_category_svd.columns:
  #      check = category_df[(category_df['user_id']==i)&(category_df['large_category']==j)]['number']
   #     if check.any() == True:
    #        df_category_svd.loc[i,j] = df_category_svd.loc[i,j]+(alpha*((check.iloc[0,])**(1/2)))
            

In [54]:
#df_category_svd

In [55]:
#best_category_df= pd.DataFrame(df_category_svd.idxmax(axis=1),columns=['genre'])
#best_category_df['user_id']=best_category_df.index
#best_category_df.index.name = None
#best_category_df

##### 5, category에 해당하는 노래와 user간의 svd를 구한다.

In [56]:
best_category_df=df_user_category[['user_id','large_category']]

In [57]:
category_df_svd = pd.merge(best_category_df,df,on=['user_id','large_category'])

In [58]:
category_df_svd_df = category_df_svd[['user_id','song_id','rating','large_category']]
category_df_svd_df

Unnamed: 0,user_id,song_id,rating,large_category
0,0,3637,4,Unknown_20
1,0,21252,4,Unknown_20
2,0,27331,5,Unknown_20
3,0,32438,5,Unknown_20
4,0,34995,5,Unknown_20
...,...,...,...,...
1149240,9999,125678,5,Unknown_0
1149241,9999,11385,4,Unknown_0
1149242,9999,122285,5,Unknown_0
1149243,9999,64945,5,Unknown_0


In [91]:
import time

In [100]:
%%time

category_svd_total = {}

for i in category_df_svd_df['large_category'].unique():
    genre_df = category_df_svd_df[category_df_svd_df['large_category']==i]
 
    df_matrix= genre_df.pivot(index='user_id',
                              columns='song_id',
                              values='rating').fillna(0)
    
    df_array = df_matrix.to_numpy()
    
    
    factorizer = MatrixFactorization(df_array, k=3, learning_rate=0.01, reg_param=0.01, epochs=100, verbose=True)
    factorizer.fit()
    
    category_svd = factorizer.get_complete_matrix()
    df_category_svd=pd.DataFrame(category_svd,
                            index=df_matrix.index,
                            columns=df_matrix.columns)
     
    category_svd_total[i]=df_category_svd
        

Iteration: 10 ; cost = 0.0115
Iteration: 20 ; cost = 0.0107
Iteration: 30 ; cost = 0.0102
Iteration: 40 ; cost = 0.0098
Iteration: 50 ; cost = 0.0094
Iteration: 60 ; cost = 0.0091
Iteration: 70 ; cost = 0.0088
Iteration: 80 ; cost = 0.0085
Iteration: 90 ; cost = 0.0083
Iteration: 100 ; cost = 0.0080
Iteration: 10 ; cost = 0.0018
Iteration: 20 ; cost = 0.0018
Iteration: 30 ; cost = 0.0018
Iteration: 40 ; cost = 0.0017
Iteration: 50 ; cost = 0.0017
Iteration: 60 ; cost = 0.0017
Iteration: 70 ; cost = 0.0017
Iteration: 80 ; cost = 0.0016
Iteration: 90 ; cost = 0.0016
Iteration: 100 ; cost = 0.0016
Iteration: 10 ; cost = 0.0318
Iteration: 20 ; cost = 0.0252
Iteration: 30 ; cost = 0.0218
Iteration: 40 ; cost = 0.0194
Iteration: 50 ; cost = 0.0174
Iteration: 60 ; cost = 0.0158
Iteration: 70 ; cost = 0.0145
Iteration: 80 ; cost = 0.0134
Iteration: 90 ; cost = 0.0124
Iteration: 100 ; cost = 0.0116
Iteration: 10 ; cost = 0.0251
Iteration: 20 ; cost = 0.0195
Iteration: 30 ; cost = 0.0165
Iterati

Iteration: 60 ; cost = 0.0097
Iteration: 70 ; cost = 0.0072
Iteration: 80 ; cost = 0.0054
Iteration: 90 ; cost = 0.0041
Iteration: 100 ; cost = 0.0032
Iteration: 10 ; cost = 0.0506
Iteration: 20 ; cost = 0.0347
Iteration: 30 ; cost = 0.0276
Iteration: 40 ; cost = 0.0234
Iteration: 50 ; cost = 0.0204
Iteration: 60 ; cost = 0.0182
Iteration: 70 ; cost = 0.0164
Iteration: 80 ; cost = 0.0149
Iteration: 90 ; cost = 0.0137
Iteration: 100 ; cost = 0.0127
Iteration: 10 ; cost = 0.0401
Iteration: 20 ; cost = 0.0282
Iteration: 30 ; cost = 0.0230
Iteration: 40 ; cost = 0.0197
Iteration: 50 ; cost = 0.0173
Iteration: 60 ; cost = 0.0153
Iteration: 70 ; cost = 0.0137
Iteration: 80 ; cost = 0.0123
Iteration: 90 ; cost = 0.0111
Iteration: 100 ; cost = 0.0101
Iteration: 10 ; cost = 0.1372
Iteration: 20 ; cost = 0.0883
Iteration: 30 ; cost = 0.0642
Iteration: 40 ; cost = 0.0499
Iteration: 50 ; cost = 0.0406
Iteration: 60 ; cost = 0.0342
Iteration: 70 ; cost = 0.0295
Iteration: 80 ; cost = 0.0258
Iterati

In [98]:
import pickle

In [99]:
with open('category_svd_total.pickle','wb') as fw:
    pickle.dump(category_svd_total, fw)


In [97]:
category_svd_total

{'Unknown_20': song_id    209       256       319       320       347       363       380     \
 user_id                                                                         
 0        4.567945  4.394401  4.245105  4.333578  4.474096  4.677225  4.099076   
 13       0.725647  6.104708  7.485322  5.643542  6.181627  3.030663  5.821176   
 22       3.848855  3.941563  3.242041  3.661852  3.890030  3.935802  3.095361   
 30       5.231690  4.111531  3.341401  4.099190  4.333943  4.937428  3.537105   
 140      5.632212  3.882145  4.041892  4.855829  5.676705  5.588082  4.535908   
 ...           ...       ...       ...       ...       ...       ...       ...   
 9933     4.449188  3.745322  3.766738  4.011339  4.318870  4.500723  3.809911   
 9942     2.447504  3.306929  3.633004  2.953193  2.788464  2.874171  3.061707   
 9963     2.699768  1.942890  4.986097  2.552312  1.711114  3.057170  4.419068   
 9970     6.750577  3.678992  2.841658  4.339794  4.872354  5.958666  3.681431   
 9

In [60]:
category_svd_total

{'Unknown_20': song_id    209       256       319       320       347       363       380     \
 user_id                                                                         
 0        6.195019  6.449271  6.596405  7.890145  5.407030  7.220720  4.548959   
 13       5.154584  5.088111  3.758022  5.999788  2.354009  5.013039  5.295045   
 22       5.197904  4.900934  4.969257  4.773148  5.914698  3.759785  3.399978   
 30       5.573394  5.500587  6.164991  5.774582  6.439957  5.549207  3.336723   
 140      5.092162  4.961825  5.618705  4.950137  5.578059  5.551188  3.180080   
 ...           ...       ...       ...       ...       ...       ...       ...   
 9933     5.870685  5.885545  4.690485  7.162332  3.835132  5.056001  5.404602   
 9942     5.968544  5.664897  5.013396  5.838140  5.817167  3.859552  4.753227   
 9963     5.708438  5.252317  4.653389  4.515174  4.102160  5.680179  5.633447   
 9970     4.764461  4.791453  5.119446  5.339995  4.193139  5.880458  3.304524   
 9

##### 6. 예측된 점수가 높은 순서대로 top100 노래를 선택한다.

In [77]:
top_df = pd.DataFrame()
nlargest = 100


for i in category_svd_total:
    order = np.argsort(-category_svd_total[i].values, axis=1)[:, :nlargest]
    if category_svd_total[i].shape[1]<100:
        result = pd.DataFrame(category_svd_total[i].columns[order], 
                      columns=['top{}'.format(i) for i in range(1, category_svd_total[i].shape[1]+1 )],
                      index=category_svd_total[i].index)
    else:
        result = pd.DataFrame(category_svd_total[i].columns[order], 
                              columns=['top{}'.format(i) for i in range(1, nlargest+1)],
                              index=category_svd_total[i].index)
    
    top_df = pd.concat([top_df,result])
    
    

  result = pd.DataFrame(category_svd_total[i].columns[order],
  result = pd.DataFrame(category_svd_total[i].columns[order],


           top1    top2      top3      top4      top5      top6      top7  \
user_id                                                                     
0        134980   82690  101013.0   77735.0   27032.0    8375.0   16451.0   
13       134980   20864    8375.0   82690.0   36378.0  102329.0   77789.0   
22        57800  114455   61371.0    9589.0  108247.0   39275.0   27009.0   
30        57800   92172   61371.0   44958.0  115405.0   85514.0  103558.0   
140       57800  103558   44958.0    7723.0   51457.0   92172.0   39088.0   
...         ...     ...       ...       ...       ...       ...       ...   
7203      94790   98775   80694.0    8560.0   73917.0   87677.0   61441.0   
7363      70111   94790   17285.0  126901.0     875.0   84379.0   52366.0   
8272      87677   84379  131899.0   98775.0   85065.0   54312.0  132798.0   
8744       9226   94790   59363.0    8560.0    9576.0   98775.0   61441.0   
9099      84379  131899   98775.0   16473.0   54312.0    9576.0   94790.0   

In [78]:
top_df

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top91,top92,top93,top94,top95,top96,top97,top98,top99,top100
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,134980,82690,101013.0,77735.0,27032.0,8375.0,16451.0,118601.0,32256.0,118313.0,...,390.0,96434.0,783.0,59533.0,68906.0,35132.0,64443.0,90908.0,48030.0,104170.0
13,134980,20864,8375.0,82690.0,36378.0,102329.0,77789.0,57524.0,22821.0,69575.0,...,27032.0,3889.0,136005.0,107343.0,53647.0,16019.0,120143.0,31443.0,41923.0,27624.0
22,57800,114455,61371.0,9589.0,108247.0,39275.0,27009.0,76485.0,131226.0,37009.0,...,71462.0,8144.0,119608.0,131065.0,66918.0,120320.0,121622.0,55141.0,134598.0,52338.0
30,57800,92172,61371.0,44958.0,115405.0,85514.0,103558.0,7723.0,51457.0,32855.0,...,28873.0,34154.0,4368.0,69411.0,47763.0,90257.0,74984.0,33658.0,129190.0,103861.0
140,57800,103558,44958.0,7723.0,51457.0,92172.0,39088.0,78356.0,35697.0,61371.0,...,91046.0,17539.0,76117.0,38043.0,114018.0,34633.0,41448.0,134598.0,75897.0,48563.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7203,94790,98775,80694.0,8560.0,73917.0,87677.0,61441.0,85065.0,9576.0,9226.0,...,,,,,,,,,,
7363,70111,94790,17285.0,126901.0,875.0,84379.0,52366.0,48197.0,85065.0,132268.0,...,,,,,,,,,,
8272,87677,84379,131899.0,98775.0,85065.0,54312.0,132798.0,80694.0,16473.0,17285.0,...,,,,,,,,,,
8744,9226,94790,59363.0,8560.0,9576.0,98775.0,61441.0,73917.0,10134.0,16473.0,...,,,,,,,,,,


In [80]:
top_df.to_csv('final_top100.txt', sep = '\t')

In [81]:
top_df= pd.read_csv('final_top100.txt', sep = "\t")