In [1]:
import pandas as pd
import numpy as np
import pickle

dataframes = pd.read_pickle('recom_data/user_category_dummy3.pkl')

df_counts = dataframes['view_counts']
df_users = dataframes['users']

with open('recom_data/category_dict.pkl', 'rb') as handle:
    category_dict = pickle.load(handle)

item_similarity = pd.read_pickle('recom_data/item_similarity.pkl')

In [2]:
view_matrix = df_counts.pivot(index='user', columns='category', values='count')

In [3]:
def transform_gender(gender):
    return 'M' if (gender == '남자') else 'F'

def transform_age(age):
    return (age // 10) * 10

def recom_simple1(user, n_items=20):
    user_gender = transform_gender(user['gender'])
    user_age = transform_age(user['age'])

    mask_gender = df_users['gender'] == user_gender
    mask_age = df_users['age'] == user_age

    user_group = df_users[mask_gender & mask_age]
    
    mask_group = new_df['user'].isin(user_group['user'])
    view_df = new_df[mask_group]
    
    grouped_view_df = (view_df.groupby(['category'])['count']
                       .agg(['sum'])
                       .sort_values(by=['sum'], ascending=False)
                       .apply(lambda x: x / len(view_df['user'].unique()))
                       .reset_index())
    
    return grouped_view_df

def recom_simple2(user, n_items=20):    
    grouped_view_df = recom_simple1(user, n_items=n_items)
    
    for tester_view in tester_view_data:
        if tester_view[1] in grouped_view_df['category'].values:
            grouped_view_df.loc[grouped_view_df.category==tester_view[1], 'sum'] += tester_view[2]
        else:
            
            grouped_view_df = grouped_view_df.append({'category': tester_view[1], 'sum': tester_view[2]}, ignore_index=True)
    
    grouped_view_df = grouped_view_df.sort_values(by=['sum'], ascending=False)
    return grouped_view_df

tester_view_data = [[2, 11, 1], [2, 12, 2], [2, 1, 1]]
tester_df = pd.DataFrame(tester_view_data, columns=['user', 'category', 'count'])
new_df = df_counts.append(tester_df)

tester1 = {'user_id': 1, 'gender': '남자', 'age': 27}

result1 = recom_simple1(tester1)
result1['category'] = result1['category'].apply(lambda x: category_dict[x])
print(result1)

tester2 = {'user_id': 2, 'gender': '남자', 'age': 22}

result2 = recom_simple2(tester2)
result2['category'] = result2['category'].apply(lambda x: category_dict[x])
print(result2)

       category     sum
0        스마트 밴드  1.4500
1        스마트 워치  1.2500
2       남성 캐주얼화  1.1500
3        남성 운동화  1.1500
4        남성용 모자  1.1000
..          ...     ...
144  기타 스케이트 용품  0.2125
145       남성 향수  0.2125
146         전동휠  0.2000
147          목공  0.2000
148  기타 헤어 스타일링  0.1375

[149 rows x 2 columns]
       category     sum
7         남성 하의  2.9875
5         남성 상의  2.0625
0        스마트 밴드  1.4500
1        스마트 워치  1.2500
2       남성 캐주얼화  1.1500
..          ...     ...
144  기타 스케이트 용품  0.2125
145       남성 향수  0.2125
146         전동휠  0.2000
147          목공  0.2000
148  기타 헤어 스타일링  0.1375

[150 rows x 2 columns]


In [4]:
testers = [{'gender': '남자', 'age': 10}, {'gender': '남자', 'age': 20}, {'gender': '남자', 'age': 30}, {'gender': '남자', 'age': 40}, {'gender': '남자', 'age': 50}, {'gender': '남자', 'age': 60}, {'gender': '남자', 'age': 70},
           {'gender': '여자', 'age': 10}, {'gender': '여자', 'age': 20}, {'gender': '여자', 'age': 30}, {'gender': '여자', 'age': 40}, {'gender': '여자', 'age': 50}, {'gender': '여자', 'age': 60}, {'gender': '여자', 'age': 70}]

for test in testers:
    print(test['gender'], test['age'])
    result1 = recom_simple1(test)
    result1['category'] = result1['category'].apply(lambda x: category_dict[x])
    print(result1)

남자 10
    category     sum
0     남성 운동화  1.2250
1     남성용 가방  1.2125
2      남성 하의  1.1875
3        야구공  1.1375
4    남성 캐주얼화  1.1250
..       ...     ...
89        잡지  0.0875
90     축구 의류  0.0875
91       탁구대  0.0875
92  기타 축구 용품  0.0875
93     축구 가방  0.0750

[94 rows x 2 columns]
남자 20
       category     sum
0        스마트 밴드  1.4500
1        스마트 워치  1.2500
2       남성 캐주얼화  1.1500
3        남성 운동화  1.1500
4        남성용 모자  1.1000
..          ...     ...
144  기타 스케이트 용품  0.2125
145       남성 향수  0.2125
146         전동휠  0.2000
147          목공  0.2000
148  기타 헤어 스타일링  0.1375

[149 rows x 2 columns]
남자 30
      category       sum
0       스마트 밴드  1.650000
1       스마트 워치  1.466667
2          키보드  1.216667
3     기타 음향 가전  1.200000
4          청소기  1.183333
..         ...       ...
143         헹거  0.266667
144     아동 서랍장  0.250000
145  기타 DIY 자재  0.250000
146     주방 수납장  0.216667
147      책상/의자  0.216667

[148 rows x 2 columns]
남자 40
      category       sum
0     문학/과학/경영  1.566667
1        문구 용품 

In [5]:
from sklearn.model_selection import train_test_split
x = df_counts.copy()
y = df_counts['user']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [6]:
# RMSE 계산해주는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)- np.array(y_pred)) ** 2))

# 모델별 RMSE 계산
def score(model):
    id_pairs = zip(x_test['user'], x_test['category'])
    y_pred = np.array([model(user, category) for (user, category) in id_pairs])
    y_true = np.array(x_test['count'])
    return RMSE(y_true, y_pred)

train_view_matrix = x_train.pivot(index='user', columns='category', values='count')
view_matrix_t = np.transpose(train_view_matrix)

In [7]:
def simple_all_model(user, category):
    try:
        pred_viewing = train_mean[category]
    except:
        pred_viewing = 0
    return pred_viewing

user_count = len(x_train['user'].unique())
train_mean = x_train.groupby(['category'])['count'].sum().apply(lambda x: x/user_count)

# print(score(simple_all_model))

In [8]:
def IBCF_model(user, category):
    if category in item_similarity:
        sim_scores = item_similarity[category]
        user_viewing = view_matrix_t[user]
        non_viewing_idx = user_viewing[user_viewing.isnull()].index
        user_viewing = user_viewing.dropna()
        sim_scores = sim_scores.drop(non_viewing_idx)
        pred_viewing = np.dot(sim_scores, user_viewing) / sim_scores.sum()
    else:
        pred_viewing = 0
    return pred_viewing

# print(score(IBCF_model))

In [9]:
test_user = {
    'gender': '여성',
    'age': 26
}

test_user_view_data = [
    [1, 1, 4],
    [1, 2, 2],
    [1, 3, 2],
    [1, 4, 1],
    [1, 31, 1],
    [1, 32, 1],
    [1, 33, 3]
]

temp_user_df = pd.DataFrame(test_user_view_data, columns=['user', 'category', 'count'])
new_df = df_counts.append(temp_user_df)

view_matrix = new_df.pivot(index='user', columns='category', values='count')

In [10]:
def recom_category(user_id, n_items=20):
    for category in view_matrix:
        user_category.loc[category] = IBCF_model(user_id, category)
    category_sort = user_category.sort_values(ascending=False)[:n_items]
    return category_sort

def IBCF_model(user, category):
    if category in item_similarity:
        sim_scores = item_similarity[category]
        user_viewing = user_category.T
        non_viewing_idx = user_viewing[user_viewing.isnull()].index
        user_viewing = user_viewing.dropna()
        sim_scores = sim_scores.drop(non_viewing_idx)
        pred_viewing = np.dot(sim_scores, user_viewing) / sim_scores.sum()
    else:
        pred_viewing = 0
    return pred_viewing

user_category = view_matrix.loc[90081].copy()

pred_user_category = recom_category(user_id=90081, n_items=10)
recommend_category = pd.DataFrame(pred_user_category).index.map(lambda x: category_dict[x])
print(list(recommend_category))

['남성 캐주얼화', '남성 잠옷', '남성 운동화', '여성 기능성화', '남성 언더웨어', '기타', '장지갑', '남성 하의', '중지갑', '남성 구두']


In [11]:
item_similarity2 = pd.read_pickle('recom_data/item_similarity_pearson.pkl')

def recom_category(user_id, n_items=20):
    for category in view_matrix:
        user_category.loc[category] = IBCF_model(user_id, category)
    category_sort = user_category.sort_values(ascending=False)[:n_items]
    return category_sort

def IBCF_model(user, category):
    if category in item_similarity2:
        sim_scores = item_similarity2[category]
        user_viewing = user_category.T
        non_viewing_idx = user_viewing[user_viewing.isnull()].index
        user_viewing = user_viewing.dropna()
        sim_scores = sim_scores.drop(non_viewing_idx)
        pred_viewing = np.dot(sim_scores, user_viewing) / sim_scores.sum()
    else:
        pred_viewing = 0
    return pred_viewing

user_category = view_matrix.loc[90081].copy()

pred_user_category = recom_category(user_id=90081, n_items=10)
recommend_category = pd.DataFrame(pred_user_category).index.map(lambda x: category_dict[x])
print(list(recommend_category))

['기타 바디 케어', '두피 케어', '애프터선', '남성 향수', '남성용 시계', '기타 헤어 스타일링', '남성 장갑', '남성 벨트', '기타 마스크/팩', '손수건']


In [12]:
def recom_simple3(user, n_items=423):    
    grouped_view_df = recom_simple1(user, n_items=423)
    
    for tester_view in test_user_view_data:
        if tester_view[1] in grouped_view_df['category'].values:
            grouped_view_df.loc[grouped_view_df.category==tester_view[1], 'sum'] += tester_view[2]
        else:
            
            grouped_view_df = grouped_view_df.append({'category': tester_view[1], 'sum': tester_view[2]}, ignore_index=True)
    
    grouped_view_df = grouped_view_df.sort_values(by=['sum'], ascending=False)
    return grouped_view_df


pred1_user_category = recom_simple3(test_user, n_items=423)['sum']
true1_user_category = pd.DataFrame(test_user_view_data, columns=['user', 'category', 'sum']).drop(['user'], axis=1).set_index('category')['sum']
merged = pd.merge(pred1_user_category, true1_user_category, left_index=True, right_index=True, how='outer', suffixes=['_pred', '_true']).fillna(0)
print(RMSE(merged['sum_pred'], merged['sum_true']))
# 기존 그룹화 추천의 경우 데이터가 없는 경우는 Error가 낮게 나올 수밖에 없음
# 모델과 에러가 큰 차이가 안 날 경우 전환하는 걸로 얘기해야 될 듯....

# user = {
#     'user_id': 1,
#     'age': 12,
#     'gender': '남성'
# }
# true2_user_category = view_matrix.loc[user['user_id']].fillna(0)
# pred2_user_category = recom_simple3(user=user, n_items=423)
# print(true2_user_category, pred2_user_category)
# print(RMSE(pred2_user_category, true2_user_category))

0.8524250755100349


In [34]:
class MF():
    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """
            K: 잠재요인(latent factor)의 수
            alpha: 학습률
            beta: 정규화 계수
            iterations: SGD의 계산을 할 때의 반복 횟수
            verbose: SGD의 중간 학습과정 출력 여부
        """
        self.R = np.array(ratings)
        
        item_id_index = []
        index_item_id = []
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose
        
    def rmse(self):
        """
            현재의 P행렬과 Q행렬을 가지고 Root Mean Squared Error(RMSE)를 계산
        """
        xs, ys = self.R.nonzero()
        # R에서 평점이 있는 (0이 아닌) 요소의 인덱스를 저장
        self.predictions = []
        self.errors = []
        for x, y in zip(xs, ys):
            # 평점이 존재하는 요소 (사용자 x, 아이템 y) 각각에 대해 아래 코드를 실행
            prediction = self.get_prediction(x, y)
            # 사용자 x, 아이템 y에 대한 평점 예측치를 계산
            self.predictions.append(prediction)
            # 예측값을 리스트에 추가
            self.errors.append(self.R[x, y] - prediction)
            # 실제값과 예측값의 차이를 계산해서 오차값 리스트에 추가
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        # errors를 사용해서 RMSE를 계산
        return np.sqrt(np.mean(self.errors**2))
    
    def train(self):
        """
            정해진 횟수만큼 반복하며 P, Q, bu, bd 값을 업데이트하는 함수
        """
        # Initializing user-feature and movie-feature matrix
        # P, Q 행렬을 임의의 값으로 채움
        # 평균이 0, 표준편차가 1/K인 정규분포를 갖는 난수로 초기화
        self.P = np.random.normal(scale=10./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=10./self.K, size=(self.num_items, self.K))
        
        # Initializing the bias terms
        # 사용자 평가 경향(bu)를 0으로 초기화 / 크기는 사용자 수 (num_users)
        # 아이템 평가 경향(bd)를 0으로 초기화 / 크기는 아이템 수 (num_items)
        # 전체 평균 b를 구해서 저장
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])
        
        # List of training samples
        # 평점 행렬에서 값이 존재하는 요소의 인덱스들을 가져옴
        # SGD를 적용할 대상 (평점이 있는 요소의 인덱스와 평점)을 리스트로 저장
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i,j]) for i, j in zip(rows, columns)]
        
        # Stochastic gardient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            # samples를 임의로 섞음 => 어디서 시작하는지에 따라 수렴의 속도가 달라짐
            # 매 반복마다 다양한 시작점에서 시작하기!
            self.sgd()
            # sgd를 실행 / P, Q, bu, bd가 업데이트 됨
            rmse = self.rmse()
            training_process.append((i+1, rmse))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("반복: {0}; Train RMSE = {1:0.4f}".format(i+1, rmse))
        return training_process
    
    def get_prediction (self, i, j):
        """
            Rating prediction for user i and item j
            사용자 i의 아이템 j에 대한 평점을 예측하는 함수
        """
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
        
    def get_one_prediction(self, user_id, item_id):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])
    
    def full_prediction(self):
        return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)

    def sgd(self):
        """
            Stochastic gardient descent to get optimized P and Q matrix
        """
        for i, j, r in self.samples:
            # samples에 있는 각 사용자-아이템-평점 세트에 대해서 SGD를 적용
            prediction = self.get_prediction(i, j)
            # 사용자 i, 아이템 j에 대한 평점 예측치를 구함
            e = (r - prediction)
            # 실제 평점 r과 비교해서 오차를 구함
            
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])
            # 사용자 평가 경향, 아이템 평가 경향 업데이트
            
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j, :])

In [35]:
view_matrix = df_counts.pivot(index='user', columns='category', values='count')
R_temp = view_matrix.fillna(0)
mf = MF(R_temp, K=40, alpha=0.001, beta=0.02, iterations=50, verbose=True)
mf.train()

반복: 10; Train RMSE = 0.8884
반복: 20; Train RMSE = 0.8431
반복: 30; Train RMSE = 0.8108
반복: 40; Train RMSE = 0.7845
반복: 50; Train RMSE = 0.7611


[(1, 0.9573017802630084),
 (2, 0.9469823239685168),
 (3, 0.9376099111593666),
 (4, 0.929013394184817),
 (5, 0.9210669673797167),
 (6, 0.9136783069930415),
 (7, 0.9067743077113968),
 (8, 0.9002892069116973),
 (9, 0.894177543851642),
 (10, 0.8883989255483337),
 (11, 0.882916388016239),
 (12, 0.8777023374561371),
 (13, 0.872728415816223),
 (14, 0.8679748404811775),
 (15, 0.8634216499596344),
 (16, 0.8590527222080991),
 (17, 0.8548521531755557),
 (18, 0.8508061023347225),
 (19, 0.8469020336052556),
 (20, 0.8431293744155139),
 (21, 0.8394779630569789),
 (22, 0.83593920368206),
 (23, 0.8325044089998904),
 (24, 0.8291668772306048),
 (25, 0.8259182784281832),
 (26, 0.8227523734950487),
 (27, 0.819664410430239),
 (28, 0.8166482030229743),
 (29, 0.8136986714683513),
 (30, 0.8108113231099193),
 (31, 0.8079818410005744),
 (32, 0.8052055679146205),
 (33, 0.8024798286320088),
 (34, 0.799800269497179),
 (35, 0.7971642498583381),
 (36, 0.794568227982998),
 (37, 0.7920098466066074),
 (38, 0.78948590739

In [15]:
print(df_counts)

        user  category  count
0      90001        11      0
1      90001        12      0
2      90001        42      0
3      90001        43      0
4      90001        72      1
...      ...       ...    ...
63125  91220      1092      1
63126  91220      1093      1
63127  91220      1094      1
63128  91220      1101      1
63129  91220      1102      1

[63130 rows x 3 columns]


In [16]:
# print(view_matrix)
view_matrix.fillna(0).to_csv('test')

In [17]:
test_df222 = pd.DataFrame(mf.full_prediction())
print(test_df222)
print(test_df222.to_csv('test2'))

           0         1         2         3         4         5         6    \
0     1.885533  1.771663  1.934260  2.092536  1.846037  1.809250  2.033177   
1     2.351098  2.185781  2.376757  2.530611  2.260632  2.216988  2.472687   
2     1.952525  1.814063  1.993869  2.150045  1.884701  1.849649  2.088317   
3     2.000778  1.855989  2.019426  2.182259  1.945041  1.910653  2.104952   
4     2.085441  1.945690  2.137587  2.293316  2.055735  1.986186  2.226489   
...        ...       ...       ...       ...       ...       ...       ...   
1135  1.351928  1.236617  1.401715  1.561570  1.318477  1.306137  1.486042   
1136  1.367222  1.239183  1.405272  1.562968  1.324696  1.297742  1.495311   
1137  1.364708  1.238563  1.408184  1.561940  1.316284  1.288108  1.486972   
1138  1.368120  1.225259  1.409565  1.559856  1.313930  1.288515  1.497884   
1139  1.378413  1.241116  1.410571  1.564252  1.310349  1.290468  1.507035   

           7         8         9    ...       413       414    

In [18]:
testers = [
    {
        'user_id': 90001,
        'gender': '남성',
        'age': 17
    }, {
        'user_id': 90081,
        'gender': '남성',
        'age': 22
    }, {
        'user_id': 90161,
        'gender': '남성',
        'age': 32
    }, {
        'user_id': 90221,
        'gender': '남성',
        'age': 42
    }, {
        'user_id': 90281,
        'gender': '남성',
        'age': 52
    }, {
        'user_id': 90341,
        'gender': '남성',
        'age': 62
    }, {
        'user_id': 90401,
        'gender': '남성',
        'age': 72
    }, {
        'user_id': 90501,
        'gender': '여성',
        'age': 12
    }
]

for tester in testers:
    print(tester['gender'], tester['age'])
    temp_data = dict()
    for category in R_temp:
        temp_data[category] = mf.get_one_prediction(tester['user_id'], category)

    recomm_df = pd.DataFrame.from_dict([temp_data]).T.reset_index().rename(columns={'index': 'category', 0: 'pred'})
    # print(recomm_df)
    temp_recomm_df = recomm_df.sort_values(by='pred', ascending=True)
    temp_recomm_df['category'] = temp_recomm_df['category'].map(lambda x: category_dict[x])
    print(temp_recomm_df)

남성 17
      category      pred
248  기타 자동차 용품  1.097760
257         비데  1.122807
245      세차 용품  1.132078
258         욕조  1.136731
232      기타 공구  1.142874
..         ...       ...
315     유아동 의류  2.036922
7      남성 언더웨어  2.067775
3      여성 언더웨어  2.092536
303   위생/건강 용품  2.110703
145        PSP  2.137379

[423 rows x 2 columns]
남성 22
      category      pred
248  기타 자동차 용품  1.425201
257         비데  1.456655
243        장난감  1.472348
245      세차 용품  1.474575
232      기타 공구  1.482565
..         ...       ...
315     유아동 의류  2.378765
7      남성 언더웨어  2.399803
3      여성 언더웨어  2.411278
303   위생/건강 용품  2.430382
145        PSP  2.460020

[423 rows x 2 columns]
남성 32
      category      pred
248  기타 자동차 용품  1.807416
257         비데  1.833388
245      세차 용품  1.849976
232      기타 공구  1.852619
243        장난감  1.857401
..         ...       ...
315     유아동 의류  2.743890
7      남성 언더웨어  2.786192
3      여성 언더웨어  2.789438
303   위생/건강 용품  2.825898
145        PSP  2.841536

[423 rows x 2 columns]
남성 42
    

In [29]:
print(test_df222[3])

0       2.092536
1       2.530611
2       2.150045
3       2.182259
4       2.293316
          ...   
1135    1.561570
1136    1.562968
1137    1.561940
1138    1.559856
1139    1.564252
Name: 3, Length: 1140, dtype: float64


In [19]:
def recom_category(user_id, n_items=20):
    for category in view_matrix:
        user_category.loc[category] = IBCF_model(user_id, category)
    return user_category.reset_index().rename(columns={user_id: 'pred'})

def IBCF_model(user, category):
    if category in item_similarity:
        sim_scores = item_similarity[category]
        user_viewing = user_category.T
        non_viewing_idx = user_viewing[user_viewing.isnull()].index
        user_viewing = user_viewing.dropna()
        sim_scores = sim_scores.drop(non_viewing_idx)
        pred_viewing = np.dot(sim_scores, user_viewing) / sim_scores.sum()
    else:
        pred_viewing = 0
    return pred_viewing

user_category = view_matrix.loc[90021].copy()

pred_user_category = recom_category(user_id=90021, n_items=423)
# recommend_category = pd.DataFrame(pred_user_category).index.map(lambda x: category_dict[x])
print(pred_user_category)

temp_temp_temp = pred_user_category.sort_values(by='pred')
temp_temp_temp['category'] = temp_temp_temp['category'].map(lambda x: category_dict[x])
print(temp_temp_temp)

     category      pred
0           1  0.695231
1           2  0.621502
2           3  0.741396
3           4  0.696715
4          11  0.884899
..        ...       ...
418      1092  0.754905
419      1093  0.776601
420      1094  0.776212
421      1101  0.725899
422      1102  0.757372

[423 rows x 2 columns]
     category      pred
9       여성 구두  0.601747
24     여성용 가방  0.609848
12     여성 실내화  0.614230
11     여성 운동화  0.615850
10    여성 캐주얼화  0.618955
..        ...       ...
339       축구화  0.976315
20   기타 신발 용품  1.007490
328       수영모  1.018621
338       축구공  1.083338
5       남성 하의  1.131398

[423 rows x 2 columns]


In [20]:
merged = pd.merge(pred_user_category, recomm_df, left_on='category', right_on='category', how='outer', suffixes=['_IBCF', '_MF'])
merged['pred'] = merged['pred_IBCF'] * 0.9 + merged['pred_MF'] * 0.1
merged['category'] = merged['category'].map(lambda x: category_dict[x])
print(merged.sort_values(by='pred'))

     category  pred_IBCF   pred_MF      pred
9       여성 구두   0.601747  2.228518  0.764424
12     여성 실내화   0.614230  2.139266  0.766734
10    여성 캐주얼화   0.618955  2.105933  0.767653
11     여성 운동화   0.615850  2.165405  0.770805
352       배구공   0.634319  2.009606  0.771848
..        ...        ...       ...       ...
339       축구화   0.976315  2.071808  1.085865
20   기타 신발 용품   1.007490  2.188747  1.125616
328       수영모   1.018621  2.326583  1.149418
338       축구공   1.083338  2.031960  1.178200
5       남성 하의   1.131398  2.184143  1.236672

[423 rows x 4 columns]


In [21]:
# DL
import numpy as np
import pandas as pd
import pickle

dataframes = pd.read_pickle('recom_data/user_category_dummy.pkl')

df_counts = dataframes['view_counts']
df_users = dataframes['users']

with open('recom_data/category_dict.pkl', 'rb') as handle:
    category_dict = pickle.load(handle)

In [22]:
from sklearn.utils import shuffle
# shuffle 라이브러리 사용 -> 전체 평가를 기준으로 랜덤 추출
# 극단적인 경우, 특정 사용자의 모튼 평가가 train_set으로 들어갈 수도 있음

TRAIN_SIZE = 0.75 
# train_set 을 75%로 지정
view_counts = shuffle(df_counts, random_state=1)
# ratings를 섞어줌 -> 사용자-영화-평점이 1세트
cutoff = int(TRAIN_SIZE * len(view_counts))
# 전체 데이터 중 train_set의 개수를 계산
view_train = view_counts.iloc[:cutoff]
view_test = view_counts.iloc[cutoff:]

In [23]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adamax

# 변수 초기화
K = 200
mu = view_train['count'].mean() # 전체 평균
M = view_counts.user.max() + 1  # Number of users
N = view_counts.category.max() + 1 # Number of movies

In [24]:
# Defining RMSE measure
def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

In [25]:
# Keras model
user = Input(shape=(1, ))
item = Input(shape=(1, ))
P_embedding = Embedding(M, K, embeddings_regularizer=l2())(user)
Q_embedding = Embedding(N, K, embeddings_regularizer=l2())(item)
user_bias = Embedding(M, 1, embeddings_regularizer=l2())(user)
item_bias = Embedding(N, 1, embeddings_regularizer=l2())(item)

# Concatenate layers
from tensorflow.keras.layers import Dense, Concatenate, Activation
P_embedding = Flatten()(P_embedding)
Q_embedding = Flatten()(Q_embedding)
user_bias = Flatten()(user_bias)
item_bias = Flatten()(item_bias)
R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias])

In [26]:
# Neural network
R = Dense(2048)(R)
R = Activation('linear')(R)
R = Dense(256)(R)
R = Activation('linear')(R)
R = Dense(1)(R)

model = Model(inputs = [user, item], outputs=R)
model.compile(
    loss=RMSE,
    optimizer=SGD(),
    #opto,ozer=Adamax(),
    metrics=[RMSE]
)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 200)       18244200    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 200)       220600      input_2[0][0]                    
_______________________________________________________________________________________

In [27]:
# Model fitting
result = model.fit(
    x = [view_train.user.values, view_train.category.values],
    y = view_train['count'].values - mu,
    epochs = 65,
    batch_size = 512,
    validation_data = (
        [view_train.user.values, view_train.category.values],
        view_train['count'].values - mu
    )
)

Epoch 1/65


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/65
Epoch 3/65
Epoch 4/65

KeyboardInterrupt: 

In [None]:
# Plot RMSE
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(result.history['RMSE'], label="Train RMSE")
plt.plot(result.history['val_RMSE'], label="Test RMSE")
plt.legend()
plt.show()

In [None]:
# user_ids = view_test.user.values
# category_ids = view_test.category.values
# user = np.array([90421] * len(category_dict))
# categories = np.array(list(category_dict.keys()))
# predictions = model.predict([user, categories]) + mu

# print(category_dict[categories[predictions.argmax()]])
# print(predictions.argmax())
# print("Actuals: \n", view_test[0:6])
# print()
# print("Predictions: \n", predictions)

In [None]:
model.save("testmodel.h5")

In [61]:
import pandas as pd
import numpy as np
import pickle

dataframes = pd.read_pickle('recom_data/user_category_dummy3.pkl')

df_counts = dataframes['view_counts']
df_users = dataframes['users']

with open('recom_data/category_dict.pkl', 'rb') as handle:
    category_dict = pickle.load(handle)

item_similarity = pd.read_pickle('recom_data/item_similarity.pkl')

In [62]:
from sklearn.utils import shuffle
# shuffle 라이브러리 사용 -> 전체 평가를 기준으로 랜덤 추출
# 극단적인 경우, 특정 사용자의 모튼 평가가 train_set으로 들어갈 수도 있음

TRAIN_SIZE = 0.75 
# train_set 을 75%로 지정
view_counts = shuffle(df_counts, random_state=1)
# ratings를 섞어줌 -> 사용자-영화-평점이 1세트
cutoff = int(TRAIN_SIZE * len(view_counts))
# 전체 데이터 중 train_set의 개수를 계산
view_train = view_counts.iloc[:cutoff]
view_test = view_counts.iloc[cutoff:]

In [66]:
def convert_gender(gender):
    if gender == 'M':
        return 1
    elif gender == 'F':
        return 2
    else:
        return 0
    
def convert_age(age):
    if age == 100:
        return 0
    else:
        return age // 10
    
users = df_users.copy()
users['age'] = users['age'].apply(convert_age)
users['gender'] = users['gender'].apply(convert_gender)

A, G = 8, 3
train_age = pd.merge(view_train, users, on='user')['age']
test_age = pd.merge(view_test, users, on='user')['age']

train_gender = pd.merge(view_train, users, on='user')['gender']
test_gender = pd.merge(view_test, users, on='user')['gender']

In [68]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adamax
from tensorflow.keras.layers import Dense, Concatenate, Activation

# 변수 초기화
K = 200
mu = view_train['count'].mean() # 전체 평균
M = view_counts.user.max() + 1  # Number of users
N = view_counts.category.max() + 1 # Number of movies

# Defining RMSE measure
def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Keras model
user = Input(shape=(1, ))
item = Input(shape=(1, ))
P_embedding = Embedding(M, K, embeddings_regularizer=l2())(user)
Q_embedding = Embedding(N, K, embeddings_regularizer=l2())(item)
user_bias = Embedding(M, 1, embeddings_regularizer=l2())(user)
item_bias = Embedding(N, 1, embeddings_regularizer=l2())(item)

age = Input(shape=(1, ))
age_embedding = Embedding(A, 3, embeddings_regularizer=l2())(age)
gender = Input(shape=(1, ))
gender_embedding = Embedding(G, 3, embeddings_regularizer=l2())(gender)

P_embedding = Flatten()(P_embedding)
Q_embedding = Flatten()(Q_embedding)
user_bias = Flatten()(user_bias)
item_bias = Flatten()(item_bias)

age_layer = Flatten( )(age_embedding)
gender_layer = Flatten( )(gender_embedding)

R = Concatenate( )([P_embedding, Q_embedding, user_bias, item_bias, gender_layer, age_layer])

In [69]:
# Neural network
R = Dense(2048)(R)
R = Activation('linear')(R)
R = Dense(256)(R)
R = Activation('linear')(R)
R = Dense(1)(R)

model = Model(inputs = [user, item, gender, age], outputs=R)
model.compile(
    loss=RMSE,
    optimizer=SGD(),
    #opto,ozer=Adamax(),
    metrics=[RMSE]
)
model.summary()

Model: "functional_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_27 (InputLayer)           [(None, 1)]          0                                            
______________________________________________________________________________________

In [70]:
# Model fitting
result = model.fit(
    x = [view_train.user.values, view_train.category.values, train_gender.values, train_age.values],
    y = view_train['count'].values - mu,
    epochs = 40,
    batch_size = 512,
    validation_data = (
        [view_test.user.values, view_test.category.values, test_gender.values, test_age.values],
        view_test['count'].values - mu,
    )
)

Epoch 1/40


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40

KeyboardInterrupt: 