## User Based Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

from pprint import pprint

import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split

import pickle
import random



### 파일 불러오기

In [2]:
cc = ['user', 'title', 'rate']
df = pd.read_csv('watchaFinal0220.csv', encoding = 'utf-8-sig', sep = '\t', names = cc)

In [3]:
df02 = df.copy()

In [4]:
%%time
# 10개 이하의 평가를 받은 도서를 제거한 새로운 데이터프레임 생성
def refining_df(df02):

    title_number_df = df02.groupby('title').count().sort_values('user')
    title_under_10 = title_number_df[title_number_df.user <= 10]
    title_under_10_list = list(title_under_10.index)

    for title in title_under_10_list:
        df02 = df02[df02['title'] != title]

    return df02

df02 = refining_df(df02)
new_df = df02.reset_index(drop=True)

Wall time: 4min 27s


In [5]:
# 61941개 -> 31750개
new_df.tail()

Unnamed: 0,user,title,rate
31745,unknown400,이방인,3.5
31746,unknown400,설국,4.0
31747,unknown400,그리스인 조르바,5.0
31748,unknown400,1984,5.0
31749,unknown400,참을 수 없는 존재의 가벼움,5.0


In [6]:
df02 = new_df.copy()
df02.head()

Unnamed: 0,user,title,rate
0,encore01,완득이,4.0
1,encore01,"스물아홉 생일, 1년 후 죽기로 결심했다",5.0
2,encore01,청소부 밥,5.0
3,encore01,말의 품격,5.0
4,encore01,예감은 틀리지 않는다,4.0


In [7]:
def execute_labeling(column_name):
    Lec = LabelEncoder()
    df_copy = df02.copy()
    df_label = pd.DataFrame(Lec.fit_transform(df_copy[column_name].values), columns = [column_name+'_label'])
    return df_label

df_user = execute_labeling('user')
df_title = execute_labeling('title')

print(len(df_user['user_label'].value_counts()))
print(len(df_title['title_label'].value_counts()))

df_copy = df02.copy()    # copy df
df_copy = df_copy.drop(df_copy.columns[[0, 1]], axis = 1)    # delete 'user', 'title' columns
df_label = pd.concat([df_user, df_title, df_copy], axis = 1)    # 라벨링한 컬럼과 기존 데이터프레임 합치기

df_label.tail(5)

435
1087


Unnamed: 0,user_label,title_label,rate
31745,434,798,3.5
31746,434,525,4.0
31747,434,117,5.0
31748,434,3,5.0
31749,434,901,5.0


### Utility Matrix 만들기

In [8]:
book_list = list(df02['title'].unique())
book_list = sorted(book_list, key=str)
df_matrix = pd.DataFrame(columns = ['user'] + book_list)
df_matrix    #결과확인

Unnamed: 0,user,11분,13계단,15소년 표류기,1984,"1F/B1 일층, 지하 일층",1Q84,1cm+ 일 센티 플러스,1리터의 눈물,1만 시간의 법칙,...,환상의 빛,황금 물고기,황태자비 납치사건,후레자식,후르츠 바스켓,후르츠 바스켓 8,흐르는 강물처럼,흑설공주 이야기,희랍어 시간,흰


In [9]:
%%time

user_num = len(df_label.user_label.unique()) # 35
book_num = len(df_label.title_label.unique()) # 1009

user_score_list = []    # df_matrix에서 'user' 컬럼을 제외한 컬럼. 즉, title 컬럼
for num in range(0, user_num):
    
    user_score_list = [0 for i in range(book_num)]   # 값을 0으로 초기화
    df_tmp = df_label[df_label['user_label'] == num]    # 각 유저별로 묶는다
    for j in df_tmp.index:
        intL = int(df_tmp.ix[j]['title_label']-1)
        user_score_list[intL] = df_tmp.ix[j]['rate']
        #print(user_score_list)
    
    df_matrix.loc[num] = [num] + user_score_list 
    #df_tmp.ix[]['score']
    #user_score_list[df_tmp.ix[j]['title_label']-1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


Wall time: 29.5 s


In [10]:
df_matrix

Unnamed: 0,user,11분,13계단,15소년 표류기,1984,"1F/B1 일층, 지하 일층",1Q84,1cm+ 일 센티 플러스,1리터의 눈물,1만 시간의 법칙,...,환상의 빛,황금 물고기,황태자비 납치사건,후레자식,후르츠 바스켓,후르츠 바스켓 8,흐르는 강물처럼,흑설공주 이야기,희랍어 시간,흰
0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,5.0,3.5,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
7,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9.0,0.0,0.0,3.5,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# 유틸리티행렬 pickle 저장
file = df_matrix
directory = open('utilitymatrix', 'wb')
pickle.dump(file, directory)
directory.close()

In [12]:
user_name = list(df02['user'].unique())
#user_name = sorted(user_name, key = str)
user_name = pd.DataFrame(user_name)
user_name.columns = ['user_name']

# user_name pickle 저장
file_user = user_name
directory_user = open('user_name', 'wb')
pickle.dump(file_user, directory_user)
directory_user.close()

In [13]:
# pickle로 저장한 유틸리티 행렬 DATA LOAD
f = open('utilitymatrix', 'rb')
utilitymatrix = pickle.load(f)

df2 = utilitymatrix
df2.head()


Unnamed: 0,user,11분,13계단,15소년 표류기,1984,"1F/B1 일층, 지하 일층",1Q84,1cm+ 일 센티 플러스,1리터의 눈물,1만 시간의 법칙,...,환상의 빛,황금 물고기,황태자비 납치사건,후레자식,후르츠 바스켓,후르츠 바스켓 8,흐르는 강물처럼,흑설공주 이야기,희랍어 시간,흰
0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,5.0,3.5,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df_matrix2= df_matrix.copy()
df_matrix2['user']=df_matrix['user'].astype(int)
df_matrix2.head()

Unnamed: 0,user,11분,13계단,15소년 표류기,1984,"1F/B1 일층, 지하 일층",1Q84,1cm+ 일 센티 플러스,1리터의 눈물,1만 시간의 법칙,...,환상의 빛,황금 물고기,황태자비 납치사건,후레자식,후르츠 바스켓,후르츠 바스켓 8,흐르는 강물처럼,흑설공주 이야기,희랍어 시간,흰
0,0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,5.0,3.5,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df_concat = pd.concat([user_name, df_matrix2], axis = 1)
df_concat.head()

Unnamed: 0,user_name,user,11분,13계단,15소년 표류기,1984,"1F/B1 일층, 지하 일층",1Q84,1cm+ 일 센티 플러스,1리터의 눈물,...,환상의 빛,황금 물고기,황태자비 납치사건,후레자식,후르츠 바스켓,후르츠 바스켓 8,흐르는 강물처럼,흑설공주 이야기,희랍어 시간,흰
0,encore01,0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,encore02,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,encore03,2,0.0,0.0,0.0,0.0,5.0,3.5,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,encore04,3,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,encore05,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cosine Similarity

In [16]:
def cosine_similarity(data_name):    
    from sklearn.metrics.pairwise import cosine_distances
    similarity = 1 - cosine_distances(data_name)    # sklearn은 정의와 반대이므로 1에서 빼준다.
    return similarity

cos_sim = cosine_similarity(df_matrix2)    # data set으로 df를 넣음
print(cos_sim.shape)
cos_sim

(435, 435)


array([[1.        , 0.23953469, 0.11320227, ..., 0.        , 0.01236015,
        0.00618488],
       [0.23953469, 1.        , 0.17023506, ..., 0.03016244, 0.04591229,
        0.03393814],
       [0.11320227, 0.17023506, 1.        , ..., 0.07495646, 0.0858743 ,
        0.08067435],
       ...,
       [0.        , 0.03016244, 0.07495646, ..., 1.        , 0.99662396,
        0.99816022],
       [0.01236015, 0.04591229, 0.0858743 , ..., 0.99662396, 1.        ,
        0.99601678],
       [0.00618488, 0.03393814, 0.08067435, ..., 0.99816022, 0.99601678,
        1.        ]])

In [18]:
class Basic(object):
    
    def __init__(self, user_name, neigh_num):
        self.user_name = user_name
        self.neigh_num = neigh_num
    
    """유저의 이름이 들어오면 유저 번호로 변환"""
    def convert_user_num(self):
        
        user_name_list = list(df_concat['user_name'].unique())
        
        for num in range(len(user_name_list)):
            if user_name_list[num] == self.user_name:
                user_num = num
        
        return user_num
    
    
    """target 유저와 유사한 유저 K명을 찾고, cosine 유사도를 이용하여 거리를 구한다"""
    def find_near_neighbor(self):
    
        from sklearn.neighbors import NearestNeighbors
        
        user_num = Basic.convert_user_num(self)
            
        # n_neighbors에는 본인이 포함되기 때문에 +1을 해준다.
        KNN = NearestNeighbors(n_neighbors = self.neigh_num, metric = 'cosine')    
        KNN.fit(df_matrix2)    # data set은 utility matrix인 df를 사용
        similar_distance, similar_users = KNN.kneighbors(df_matrix2)
    
        similars = {}    #유사한 유저와 거리를 dict형식으로 저장
    
        # 유사한 유저
        similar_users = similar_users[user_num][1:]
        similars['sim_users'] = list(similar_users)
    
        # 유사한 유저들과의 거리
        similar_distance = similar_distance[user_num][1:]
        similars['sim_distance'] = similar_distance
    
        return similars
    
    
    """target유저 + 유사한 유저 K명으로 이루어진 새로운 data frame 형성하고,
       narray 형식으로 반환"""
    def near_neighbors_narray(self):
    
        similars = Basic.find_near_neighbor(self)
        similiar_users_list = similars['sim_users']
        similiar_distances = similars['sim_distance']

        columns = list(df_matrix2.columns)
        new_df = pd.DataFrame(columns = columns)
    
        for i in range(len(similiar_users_list)):
        
            def concat_row(i):
                neighbor_df = df_matrix2[df_matrix2['user'] == similiar_users_list[i]]
                return neighbor_df
        
            neighbor_df = pd.concat([new_df, concat_row(i)])
            new_df = neighbor_df
            
        narray = new_df.values
        narray = narray[:, 1:]
        
        return narray

In [23]:
user1 = Basic('encore01', 8)

In [24]:
print(user1.convert_user_num())

0


In [25]:
print(user1.near_neighbors_narray())

[[0.0 0.0 0.0 ... 0.0 0.0 0.0]
 [0.0 0.0 0.0 ... 0.0 0.0 0.0]
 [0.0 0.0 0.0 ... 0.0 0.0 0.0]
 ...
 [0.0 0.0 3.5 ... 0.0 0.0 0.0]
 [0.0 0.0 0.0 ... 0.0 0.0 0.0]
 [0.0 4.5 0.0 ... 0.0 0.0 0.0]]


In [26]:
len(user1.near_neighbors_narray())

7

In [27]:
print(user1.find_near_neighbor())

{'sim_users': [5, 7, 1, 25, 9, 8, 16], 'sim_distance': array([0.73551391, 0.75722665, 0.76046531, 0.78896027, 0.80286808,
       0.81554328, 0.83529508])}


In [28]:
class Calculation_rating(Basic):
    
    def __init__(self, user_name, neigh_num):
        Basic.__init__(self, user_name, neigh_num)
        
    """가중평균 값으로 도서에 대한 target 유저의 평점을 예측"""
    def predict_rating(self):
        
        narray = Basic.near_neighbors_narray(self)    #narray 받음
        similars = Basic.find_near_neighbor(self)
        
        similiar_distances = similars['sim_distance']
    
        rating_list = []    # 가중평균값을 담는 리스트
    
        # 범위 0 ~ K-1
        for col_num in range(narray.shape[1]):
        
            sum = 0
            rating = 0
            for i in range(1, len(narray[:, col_num])):
                 sum += float(narray[:, col_num][i]) * float(similiar_distances[i])
            rating = sum/similiar_distances.sum()
        
            if rating < 0:
                rating = 0    # 만약 가중평균값이 0보다 작으면 0점으로 함
            elif rating > 10:
                rating = 10    # 만약 가중평균값이 10보다 크면 10점으로 함
            else:
                rating = int(rating)    # 평점은 정수형
    
            rating_list.append(rating)

        return rating_list
        
    
    """target 유저의 평점을 리스트로 변환하는 함수"""
    def original_rating(self):
        
        user_num = Basic.convert_user_num(self)
    
        # target 유저의 평점을 narray로 변환
        target_df = df_matrix2[df_matrix2['user'] == user_num]
        target_narray = target_df.values
        target_narray = target_narray[:, 1:]    # user column 삭제
    
        #narray로 변환된 target 유저의 평점을 리스트로 변환
        target_user_rating_list = []
        for i in range(target_narray.shape[1]):
            raw_rating = int(target_narray[0][i])
            target_user_rating_list.append(raw_rating)
    
        return target_user_rating_list

In [29]:
user_encore01 = Calculation_rating('encore01', 8)

In [30]:
print(user_encore01.original_rating()[100:120])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0]


In [31]:
print(user_encore01.predict_rating()[100:120])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [32]:
len(user_encore01.predict_rating())

1087

###  성능평가

In [33]:
# MSE를 이용한 성능평가 함수
def evaluation(neigh_num):
    
    from sklearn.metrics import mean_squared_error
    
    neigh_num = neigh_num
    rmse = 0
    user_name_list = list(df_concat['user_name'].unique())
    
    for user_name in user_name_list:
        
        cal = Calculation_rating(user_name, neigh_num)
        predict_list = cal.predict_rating()
        original_list = cal.original_rating()
        rmse_onebyone = mean_squared_error(original_list, predict_list)
        
        rmse += rmse_onebyone
        
    rmse = rmse/len(user_name_list)

    return print('이웃의 수가 {}일때, MSE 값은 {}입니다.'.format(neigh_num-1, rmse))

In [50]:
%%time
neigh_num = [21, 41, 61]    # 이웃의 수가 20명, 40명, 60명이 됨
for num in neigh_num:
    evaluation(num)

이웃의 수가 20일때, MSE 값은 0.7965168289820126입니다.
이웃의 수가 40일때, MSE 값은 0.8094661041144556입니다.
이웃의 수가 60일때, MSE 값은 0.8157387727479405입니다.
Wall time: 7min 48s


In [51]:
%%time
neigh_num = [6, 8, 11]    # 이웃의 수가 5명, 7명, 10명이 됨
for num in neigh_num:
    evaluation(num)

이웃의 수가 5일때, MSE 값은 0.7893749537374827입니다.
이웃의 수가 7일때, MSE 값은 0.7816049656864297입니다.
이웃의 수가 10일때, MSE 값은 0.7860800050756583입니다.
Wall time: 5min 5s


### 도서 추천

In [67]:
class UBCF(Calculation_rating):
    
    def __init__ (self, user_name, neigh_num):
        Basic.__init__(self, user_name, neigh_num)
        Calculation_rating.__init__(self, user_name, neigh_num)        
        
    def recommend_book_list(self):
        
        user_num = Basic.convert_user_num(self)
        predict_list = Calculation_rating.predict_rating(self)
        original_list = Calculation_rating.original_rating(self)
        all_book_list = list(df_matrix2.columns)[1:]    # 전체 도서 리스트
        
        """
        target 유저가 이미 평가했던 도서 외의 도서를 추천받기 위해
        target 유저의 평점이 0이면, 가중평균값을 넣고 그렇지 않으면 0을 넣는다
        """
        temp_list = []
        for i in range(len(predict_list)):
            if int(original_list[i]) != 0:
                temp_list.append(0)
            else:
                temp_list.append(int(predict_list[i]))
        
        # 예상 평점이 2점 이상인 도서들만 선택하여 index를 추천 리스트에 담는다.
        recommend_list_index =[]
        for i in range(len(temp_list)):
            if temp_list[i] >= 2:
                recommend_list_index.append(i)
        
        # recommend_list_index로 부터 도서명을 str로 저장
        recommend_list_str = []
        for i in recommend_list_index:
            recommend_list_str.append(all_book_list[i])
        
        """전체 도서 리스트에서 target 유저가 이미 평가한 도서를 제거"""
        already_rating_book_num = [i for i in range(len(temp_list)) if temp_list[i] == 0]
        user_book_list = [all_book_list[i] for i in range(len(all_book_list)) if i not in already_rating_book_num]

        
        final_dict = {}
        final_dict['by_rating'] = recommend_list_str
        final_dict['by_delete'] = user_book_list
        
        return final_dict
    
    
    """도서 추천 실행 함수"""
    def recommendation(self):
        user_list = df02['user'].unique().tolist()
        user_number = Basic.convert_user_num(self)
        book_dict = UBCF.recommend_book_list(self)
        
        by_rating_list = book_dict['by_rating']
        by_delete_list = book_dict['by_delete']
        
        user_name = user_list[user_number]    # user number를 user의 아이디로 변경
        
        # 총 10권의 책을 추천하는데 만약 추천 리스트가 n권 이상이라면 
        # 가중평균 리스트에서 n권의 도서를(최대 10권) 도서를 추천리스트에서 추천
        # 추천 리스트가 10권보다 작을 경우, 그렇지 않으면 유저가 이미 평가했던 도서를 제외한 도서 리스트에서 10-n권의 도서 추천
        
        if len(by_rating_list) >= 10:
            recommendation_selection = random.sample(by_rating_list, 10)
         
            
        else:
            recommendation_selection1 = random.sample(by_rating_list, len(by_rating_list))
            recommendation_selection2 = random.sample(by_delete_list, 10- len(by_rating_list))
            recommendation_selection = recommendation_selection1 + recommendation_selection2
        
    
        print('{}님을 위한 추천 도서입니다.'.format(user_name))
        return print(recommendation_selection)
     

### 실행결과

In [70]:
recommendation1 = UBCF('encore77', 8)
recommendation1.recommendation()

encore77님을 위한 추천 도서입니다.
['정글만리', '어떻게 원하는 것을 얻는가', '초원에서 살아남기', '젊은 느티나무', '내 어린고양이와 늙은개', '정글만리', '노다메 칸타빌레', '인간', '80일간의 세계일주', '호모 데우스']


In [71]:
recommendation1 = UBCF('encore16', 8)
recommendation1.recommendation()

encore16님을 위한 추천 도서입니다.
['초원에서 살아남기', '노다메 칸타빌레', '현의 노래', '호모 데우스', '코믹 메이플 스토리 오프라인 RPG', '노다메 칸타빌레', '역사의 역사', '데드맨', '엠마', '연애혁명']


In [72]:
recommendation1 = UBCF('encore21', 8)
recommendation1.recommendation()

encore21님을 위한 추천 도서입니다.
['늑대의 유혹', '월리를 찾아라! ', '사랑을 찾아 돌아오다', '충사', '찰리와 초콜릿 공장 ', '반딧불이', '정의란 무엇인가', '오체 불만족', '오리엔트 특급살인', '욕망이라는 이름의 전차']
