## Word2Vec-based Recommender

In [3]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib 
from gensim.models import word2vec
import gc

#### Read Data

In [4]:
# Column names used throughout the notebook
USER = '회원번호'   # member number (user id)
ITEM = 'ISBN'       # item id
RATING = '평점'     # rating

In [5]:
# Load the pre-built split: positive samples plus three times as many
# negative samples (per the original author's note).
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files from
# a trusted source. The recorded run of this cell failed with a
# `pandas.core.internals` import error, which usually indicates the pickle was
# written by a different pandas version -- confirm the environment.
train, test = joblib.load('recsys_train_test_1v3.pkl')

# Drop every row whose ISBN ends in 'X' or 'x', in both splits.
# The vectorised `.str[-1]` yields NaN for an empty string instead of raising,
# so such rows are simply kept.
train = train[~train[ITEM].str[-1].isin(['X', 'x'])]
test = test[~test[ITEM].str[-1].isin(['X', 'x'])]

# Only positive interactions (rating == 1) are used for training.
train = train[train[RATING] == 1]

ModuleNotFoundError: No module named 'pandas.core.internals.managers'; 'pandas.core.internals' is not a package

In [4]:
# Preview the first rows of the filtered training data
train.head()

Unnamed: 0,회원번호,ISBN,평점
0,65260,9788989415954,1.0
4,65260,9788994780375,1.0
8,65260,9788959977109,1.0
12,65260,9788989415534,1.0
16,65260,9788926977712,1.0


#### Make Corpus

In [5]:
# Hyperparameters
num_features = 100 # dimensionality of the word (item) vectors
min_word_count = 30 # minimum word count; also used as the over-sampling factor
context = 5 # training window size (number of neighbouring items in a list)
top_k = 10 # number of similar items requested per user

In [6]:
# There is too little data (i.e. corpus) to train Word2Vec directly, so each
# user's purchased-item list is oversampled n times.
def oversample(x, n):
    """Build one training "sentence" from a single user's items.

    Parameters
    ----------
    x : mapping or DataFrame
        Rows belonging to one user; only ``x[ITEM]`` is read.
    n : int
        Number of shuffled copies of the user's distinct items to emit.
        ``0`` disables oversampling.

    Returns
    -------
    list
        If ``n == 0``, the items in their original order (duplicates kept).
        Otherwise ``n`` random permutations of the distinct items,
        concatenated, i.e. ``n * n_distinct`` entries.
    """
    if n == 0:
        return list(x[ITEM])

    unique_items = np.unique(x[ITEM])
    sentence = []
    for _ in range(n):
        # Drawing every distinct item without replacement is a random permutation.
        shuffled = np.random.choice(unique_items, len(unique_items), replace=False)
        # Accumulate in a plain list: appending to a float `np.array([])`
        # coerced the ids (ints became floats) and copied the whole buffer on
        # every iteration. `.tolist()` returns native Python values.
        sentence.extend(shuffled.tolist())
    return sentence

# User ids found in each split: one entry per user group, taken from the group key
train_uid = train.groupby(USER).apply(lambda grp: grp.name)
test_uid = pd.DataFrame(test).groupby(USER).apply(lambda grp: grp.name)

# One oversampled "sentence" of items per training user
sentences = list(
    train.groupby(USER).apply(oversample, n=min_word_count)
)

#### Train Word Embedding

In [7]:
# Train the embedding model on the oversampled sentences
model = word2vec.Word2Vec(
    sentences,
    window=context,
    min_count=min_word_count,
    size=num_features,
)

# Unload memory that is no longer needed
model.init_sims(replace=True)
del sentences
gc.collect()

150

#### Make Recommendations

In [8]:
# For each user, find the k items closest to (most similar to) the mean vector
# of the items that user purchased.
items = list(train.groupby(USER).apply(lambda grp: list(grp[ITEM])))
recomm = [model.wv.most_similar(words, topn=top_k) for words in items]

# Split each user's (item, similarity) pairs into two parallel lists.
recs, prob = [], []
for i in range(len(train_uid)):
    recs.append([isbn for isbn, _sim in recomm[i]])   # item list
    prob.append([sim for _isbn, sim in recomm[i]])    # similarity list

# Reshape to long format, one row per (user, recommended item).
# After reset_index the columns are: user level, rank level, value.
# NOTE(review): the rename relies on the user level coming out as 'level_0'.
recs = (
    pd.DataFrame(pd.DataFrame(recs, index=train_uid).stack())
    .reset_index()
    .rename(columns={'level_0': USER, 0: ITEM})
    .iloc[:, [0, 2]]
)
prob = (
    pd.DataFrame(prob, index=train_uid)
    .stack()
    .reset_index()
    .rename(columns={0: 'score'})
    .iloc[:, [2]]
)
recs_prob = pd.concat([recs, prob], axis=1)

recs_prob

Unnamed: 0,회원번호,ISBN,score
0,292,9788926934340,0.861815
1,292,9788934926023,0.859673
2,292,9791155170199,0.858518
3,292,9788981651305,0.858505
4,292,9788926920633,0.858328
...,...,...,...
247895,99906,9788934958314,0.787094
247896,99906,9788963702858,0.783178
247897,99906,9788926905760,0.771739
247898,99906,9788960471962,0.761714


In [9]:
from python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

# All ranking metrics take the same arguments: the `test` frame and the
# `recs_prob` recommendation frame, plus the column names below.
# NOTE(review): `train` was restricted to rating == 1, but `test` was not, so it
# may still contain negative samples -- confirm how the metric functions treat
# those rows before trusting the reported numbers.
args = [test, recs_prob]
kwargs = dict(col_user=USER,
              col_item=ITEM,
              col_rating=RATING,
              col_prediction='score',
              relevancy_method='top_k',
              k=top_k)  # reuse the cutoff the recommendations were generated with

# MAP and NDCG are left disabled, as in the recorded run.
#eval_map = map_at_k(*args, **kwargs)
#eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

print("Model:",
      f"Top K:\t\t {top_k}",
#      f"MAP:\t\t {eval_map:f}",
#      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 10
Precision@K:	 0.009941
Recall@K:	 0.037588


## End