## Build Recommenders using LightFM 
- LightFM is a Python implementation of a number of popular recommendation algorithms for both implicit and explicit feedback.
- It also makes it possible to incorporate both item and user metadata into the traditional matrix factorization algorithms. 
- It represents each user and item as the sum of the latent representations of their features, thus allowing recommendations to generalise to new items (via item features) and to new users (via user features).   
- Documentation: http://lyst.github.io/lightfm/docs/index.html

In [1]:
import numpy as np
import pandas as pd

from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

try:
    # Standalone package; the only option with scikit-learn >= 0.23.
    import joblib
except ImportError:
    # Legacy location bundled with old scikit-learn releases.
    from sklearn.externals import joblib



#### Read Data

In [2]:
# Build the recommender on book-purchase data, which is far sparser than the department-store data.
# Rows containing any missing value, or whose ISBN is "-", are discarded.
transaction = (
    pd.read_csv('book_transactions.csv', encoding='cp949')
    .dropna(axis=0)
    .query('ISBN != "-"')
)

In [7]:
# Preview the first rows of the filtered transaction table.
transaction.head()

Unnamed: 0,회원번호,일자,책제목,카테고리,작가,ISBN,출판사,출판일자,주문시간,수량,배송지
0,292,20140621,유형아작 중학수학 3-2 (2014년),중고등학습서,<김순우> 등저,9788964166178,비상교육(구 비유와상징),20110801.0,23,1,서울특별시
1,292,20140621,개념+유형 중등수학 3-2 실력향상 파워 (2014년),중고등학습서,<박미정> 등저,9788966868674,비상교육(구 비유와상징),20140301.0,23,1,서울특별시
2,292,20140621,내신특강 중학 수학 3-2 (2014년),중고등학습서,<김정우> 등저,9788937898334,미래엔(대한교과서),20130530.0,23,1,서울특별시
3,294,20140528,액셀 월드 (ACCEL WORLD) 1,만화/라이트노벨,<카와하라 레키> 저/<HIMA> 그림/<김완> 역,9788926321379,서울문화사,20091110.0,20,1,충남
4,294,20140528,신약 어떤 마술의 금서목록 2,만화/라이트노벨,<카마치 카즈마> 저/<하이무라 키요타카> 그림/<김소연> 역,9788925292564,대원씨아이(단행)(대원키즈),20120115.0,20,1,충남


In [3]:
# Build a per-book table, keyed by ISBN, to serve as the source of item features.
# Columns are selected by name rather than by position so the cell keeps working
# if the column order of the CSV changes.
item_columns = ['ISBN', '책제목', '카테고리', '작가', '출판사', '출판일자']
item_data = (
    transaction
    .drop_duplicates('ISBN')  # keep the first transaction seen for each ISBN
    .loc[:, item_columns]
    .sort_values(by='ISBN')
)
# Integer-divide the publication date by 100 to drop its last two digits
# (in the preview above, a value such as 20110801.0 becomes 201108).
item_data['출판일자'] = item_data['출판일자'].astype('int') // 100
item_data

Unnamed: 0,ISBN,책제목,카테고리,작가,출판사,출판일자
160802,2000132000143,에스콰이어 + 바자 (월간) : 6월 합본세트 [2014],잡지,가야미디어 편집부,가야미디어,201406
105532,2012112110201,개구쟁이 특공대 최신판 (전13권),전집,유키노유미코외,꼬마대통령(전집),201401
74697,2012112303251,[아람] New sos 안전동화 [전12권] 무비펜별도,전집,편집부,아람(전집),201412
228744,2013021802251,[가우스] 쿵쿵 살아숨쉬는 대륙의 공룡들 (전20권),전집,한국가우스편집부,한국가우스(전집),201101
3968,2511481629003,여성동아 B형 (여성월간) : 12월 [2013],잡지,동아일보사편집부 편,동아일보사,201310
132150,2511865908007,베리타스알파 고입·대입을 위한 고품격 교육 신문 (175호),중고등학습서,편집부 저,소년한국일보,201401
110592,2511894396004,유레카논술 339호,중고등학습서,<유레카논술 편집부> 저,유레카엠앤비,201401
53877,2512000318002,Gentleman KOREA 젠틀맨 코리아 + 레몬아이 겨울호 : 2월 [2014],잡지,중앙M&B 편집부,중앙m&b,201210
147767,2512009421000,초등 독서 논술 1학년 세트,어린이,편집부 저,YES24 기획상품,201402
238103,2512009422007,초등 독서 논술 2학년 세트,어린이,편집부 저,YES24 기획상품,201402


In [4]:
# Build a per-customer table, keyed by member number (회원번호), to serve as the source of user features.
# Columns are selected by name rather than by position so the cell does not depend on CSV column order.
user_base = (
    transaction
    .drop_duplicates('회원번호')  # keep the first transaction seen for each member
    .loc[:, ['회원번호', '배송지']]
    .sort_values(by='회원번호')
)
# Purchase grade (구매등급): quartile bucket, labelled 0-3, of each member's total purchased quantity.
fclass = (
    pd.qcut(transaction.groupby('회원번호')['수량'].sum(), 4, labels=False)
    .reset_index()
    .rename(columns={'수량': '구매등급'})
)
# Join on the member number explicitly instead of letting pandas infer the shared column.
user_data = pd.merge(user_base, fclass, on='회원번호')
user_data

Unnamed: 0,회원번호,배송지,구매등급
0,292,서울특별시,1
1,294,충남,1
2,299,경상북도,0
3,300,광주광역시,0
4,308,전라남도,2
5,310,대전광역시,0
6,315,경기도,1
7,316,전라북도,1
8,318,인천광역시,0
9,329,전라남도,1


In [5]:
# Persist the user and item tables as a single (user_data, item_data) tuple in 'recsys_user_item.pkl'.
# NOTE(review): joblib files are pickles - only load this file back from a trusted location.
joblib.dump((user_data, item_data), 'recsys_user_item.pkl')

['recsys_user_item.pkl']

#### Transform Data

In [6]:
# Side-feature specification for users and items. Each tuple must list the id column first,
# followed by one Series per feature. Extend the column lists below to add more features.
user_feature_columns = ['회원번호', '구매등급']
item_feature_columns = ['ISBN', '카테고리', '출판일자']

user_info = tuple(user_data[column] for column in user_feature_columns)
item_info = tuple(item_data[column] for column in item_feature_columns)

In [7]:
def build_features(ftype, dataset, features):
    """Register side features on ``dataset`` and build the matching feature matrix.

    Parameters
    ----------
    ftype : str
        ``'user'`` builds user features; any other value builds item features.
    dataset : object
        Must expose ``fit_partial``, ``build_user_features`` and
        ``build_item_features`` (a LightFM ``Dataset`` in this notebook).
        It is updated in place through ``fit_partial``.
    features : sequence
        ``features[0]`` holds the user/item ids. Every following element holds
        one feature value per id, aligned with the ids by position. Elements may
        be pandas Series or any other re-iterable sequence.

    Returns
    -------
    Whatever ``dataset.build_user_features`` / ``dataset.build_item_features``
    returns for the collected ``(id, [feature values])`` pairs.

    Raises
    ------
    IndexError
        If ``features`` is empty.
    """
    is_user = ftype == 'user'
    ids, columns = features[0], features[1:]

    # Register the ids together with every value of each feature column.
    # NOTE(review): values are registered as-is, so equal values coming from
    # different columns appear to end up as one shared feature - confirm intended.
    for column in columns:
        if is_user:
            dataset.fit_partial(users=ids, user_features=column)
        else:
            dataset.fit_partial(items=ids, item_features=column)

    # One (id, [feature values...]) pair per row. A single positional zip replaces
    # the per-cell ``.iloc`` lookups of a row/column double loop.
    rows = [(row[0], list(row[1:])) for row in zip(*features)]

    if is_user:
        return dataset.build_user_features(rows)
    return dataset.build_item_features(rows)

In [8]:
# Step 1: create the dataset and register every known user id and item id.
ds = Dataset()
ds.fit(users=user_data['회원번호'], items=item_data['ISBN'])

# Step 2: turn each (member, ISBN) purchase into an entry of the interaction matrix
# (the counterpart of a ratings matrix).
purchase_pairs = zip(transaction['회원번호'], transaction['ISBN'])
interactions, weights = ds.build_interactions(purchase_pairs)

# Step 3: build the user and item feature matrices.
user_features = build_features('user', ds, user_info)
item_features = build_features('item', ds, item_info)

#### Split Data

In [9]:
# Hold out 25% of the interactions for evaluation; the fixed seed keeps the split reproducible.
split_rng = np.random.RandomState(123)
train, test = random_train_test_split(
    interactions,
    test_percentage=0.25,
    random_state=split_rng,
)

#### Build Recommender Models

In [10]:
# m1: fitted on the training interactions only (no side features).
m1 = LightFM(loss='warp', random_state=123)  # Refer to LightFM API
%time m1.fit(train)
# m2: same loss and seed, but fitted together with the user and item feature matrices.
m2 = LightFM(loss='warp', random_state=123)  # Refer to LightFM API
%time m2.fit(train, user_features=user_features, item_features=item_features)

Wall time: 725 ms
Wall time: 1.15 s


<lightfm.lightfm.LightFM at 0x214263d25f8>

#### Evaluate Recommender Models

In [11]:
# Compute precision, recall & f1-score
def precision_recall_score(model, data, user_features, item_features, k, train_interactions=None):
    """Return mean precision@k, mean recall@k and the F1 score derived from them.

    Parameters
    ----------
    model
        Fitted model, forwarded to ``precision_at_k`` / ``recall_at_k``.
    data
        Interactions to evaluate against (the held-out test matrix in this notebook).
    user_features, item_features
        Feature matrices forwarded to both metric functions; pass ``None`` for a
        model that was fitted without side features.
    k : int
        Cut-off of the ranked list.
    train_interactions : optional
        Forwarded to both metric functions as ``train_interactions``. Defaults to
        ``None``, which reproduces the previous behaviour of this helper.

    Returns
    -------
    tuple
        ``(precision, recall, f1_score)``. ``f1_score`` is ``0.0`` when both
        precision and recall are zero, instead of NaN / a division error.
    """
    shared_kwargs = dict(
        train_interactions=train_interactions,
        user_features=user_features,
        item_features=item_features,
        k=k,
    )
    precision = precision_at_k(model, data, **shared_kwargs).mean()
    recall = recall_at_k(model, data, **shared_kwargs).mean()

    # Harmonic mean of precision and recall; guard the 0/0 case explicitly.
    denominator = precision + recall
    f1_score = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
    return precision, recall, f1_score

# Score both models on the held-out `test` interactions at k=10.
# m1 was fitted without side features, so none are passed; m2 receives the same
# user/item feature matrices it was fitted with.
%time print(precision_recall_score(m1, test, user_features=None, item_features=None, k=10))
%time print(precision_recall_score(m2, test, user_features=user_features, item_features=item_features, k=10))

(0.011190699, 0.03356963024769485, 0.01678574060404797)
Wall time: 2min 4s
(0.006551326, 0.02101328866299061, 0.009988523520619031)
Wall time: 2min 39s


#### Recommend top-N Items

In [12]:
# Recommend top-N items for each user
def make_recommendation(model, dataset, user_features, item_features, n_users, k):
    """Return the ``k`` highest-scoring items for the first ``n_users`` users.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(user_index, item_indices, user_features=...,
        item_features=...)`` and returning one score per item index.
    dataset
        Object exposing ``interactions_shape()`` and ``mapping()`` (a LightFM
        ``Dataset`` in this notebook); used to translate internal indices back to
        the original user and item ids.
    user_features, item_features
        Feature matrices forwarded to ``model.predict`` (``None`` if unused).
    n_users : int
        Number of users, in internal index order, to recommend for. Clamped to
        the number of users known to ``dataset``.
    k : int
        Number of items to return per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id``, ``rank`` (0 = best), one row per
        recommended item, ordered by user and then by rank.
    """
    shape = dataset.interactions_shape()
    mappings = dataset.mapping()

    # Invert the id -> internal-index mappings once, instead of rebuilding the
    # full key list for every single recommended item.
    user_label_of = {index: label for label, index in mappings[0].items()}
    item_label_of = {index: label for label, index in mappings[2].items()}

    n_items = shape[1]
    n_users = max(0, min(n_users, shape[0]))  # never ask for users the dataset does not know
    item_indices = np.arange(n_items)

    records = []
    for user_index in range(n_users):
        scores = model.predict(user_index, item_indices,
                               user_features=user_features, item_features=item_features)
        top_items = np.argsort(-scores)[:k]  # highest score first
        user_label = user_label_of[user_index]
        records.extend(
            (user_label, item_label_of[int(item_index)], rank)
            for rank, item_index in enumerate(top_items)
        )
    return pd.DataFrame(records, columns=['user_id', 'item_id', 'rank'])

# Top-10 recommendations from m2 for the first 1000 users of the dataset's user mapping.
%time make_recommendation(m2, ds, user_features=user_features, item_features=item_features, n_users=1000, k=10)

Wall time: 22.9 s


Unnamed: 0,user_id,item_id,rank
0,292,9788937488351,0
1,292,9788956056852,1
2,292,9788970653976,2
3,292,9788965420170,3
4,292,9788917218336,4
5,292,9788952227829,5
6,292,9788965961055,6
7,292,9788949171197,7
8,292,9788991731783,8
9,292,9788954730631,9


## End