In [None]:
import pandas as pd
import pickle
import numpy as np
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import *
from lightfm.data import Dataset

In [49]:
invest = pd.read_excel('invest_data.xlsx').fillna(0)
invest.head(20)

Unnamed: 0,客戶,基金,基金簡稱,風險等級,金額
0,100066,1,野村優質基金-累積類型新臺幣計價,RR4,3000
1,100066,5,野村 e科技基金,RR5,36079
2,100066,7,野村中小基金-累積類型,RR5,40000
3,100066,9,野村全球高股息基金-累積型新臺幣計價,RR4,3000
4,100066,30,野村精選貨幣市場基金,RR1,0
5,100066,37,野村全球生技醫療基金,RR4,9617
6,100066,46,野村新興傘型之大俄羅斯基金,RR5,54273
7,100066,48,野村中國機會基金,RR5,119385
8,100066,49,野村全球美元投資級公司債基金-累積型,RR2,5000
9,100066,51,野村巴西證券投資信託基金,RR5,88041


In [50]:
invest = invest.infer_objects()

In [51]:
invest.dtypes

客戶       int64
基金      object
基金簡稱    object
風險等級    object
金額       int64
dtype: object

In [52]:
fund = invest[['基金','基金簡稱','風險等級']].drop_duplicates('基金','first')

In [53]:
fund_dict = fund.set_index('基金').T.to_dict()

In [66]:
fund_dict[1]

{'基金簡稱': '野村優質基金-累積類型新臺幣計價', '風險等級': 'RR4'}

In [55]:
invest_sum = invest.groupby(by='客戶', as_index=True,sort=False).sum()

In [56]:
invest_sum.rename(columns={'金額':'總金額'},inplace=True)
invest_sum.head()

Unnamed: 0_level_0,總金額
客戶,Unnamed: 1_level_1
100066,601432
100542,3000
100558,12000
100606,12000
100657,5000


In [57]:
invest = pd.merge(invest, invest_sum, on=['客戶'])

In [58]:
invest.head()

Unnamed: 0,客戶,基金,基金簡稱,風險等級,金額,總金額
0,100066,1,野村優質基金-累積類型新臺幣計價,RR4,3000,601432
1,100066,5,野村 e科技基金,RR5,36079,601432
2,100066,7,野村中小基金-累積類型,RR5,40000,601432
3,100066,9,野村全球高股息基金-累積型新臺幣計價,RR4,3000,601432
4,100066,30,野村精選貨幣市場基金,RR1,0,601432


In [59]:
invest['weight'] = invest['金額']/invest['總金額']

In [60]:
invest.head(20)

Unnamed: 0,客戶,基金,基金簡稱,風險等級,金額,總金額,weight
0,100066,1,野村優質基金-累積類型新臺幣計價,RR4,3000,601432,0.004988
1,100066,5,野村 e科技基金,RR5,36079,601432,0.059988
2,100066,7,野村中小基金-累積類型,RR5,40000,601432,0.066508
3,100066,9,野村全球高股息基金-累積型新臺幣計價,RR4,3000,601432,0.004988
4,100066,30,野村精選貨幣市場基金,RR1,0,601432,0.0
5,100066,37,野村全球生技醫療基金,RR4,9617,601432,0.01599
6,100066,46,野村新興傘型之大俄羅斯基金,RR5,54273,601432,0.09024
7,100066,48,野村中國機會基金,RR5,119385,601432,0.198501
8,100066,49,野村全球美元投資級公司債基金-累積型,RR2,5000,601432,0.008313
9,100066,51,野村巴西證券投資信託基金,RR5,88041,601432,0.146386


In [16]:
data = zip(list(invest['客戶']),list(invest['基金']),list(invest['weight']))

In [17]:
dataset = Dataset()
user_id = list(set(invest['客戶']))
item_id = list(set(invest['基金']))
dataset.fit(user_id,item_id)
COO = dataset.build_interactions(data)

In [20]:
id_map = dataset.mapping()

In [21]:
user_map = id_map[0]
internal_item_map = id_map[2]

In [22]:
user_map[123855]

181

In [23]:
item_map={}
for k in internal_item_map.keys():
    v = internal_item_map[k]
    item_map[v]=k

In [24]:
item_map[55]

93

In [65]:
# invest.to_csv('invest_with_weight.csv', index=None, encoding="big5")

In [25]:
from lightfm import LightFM
model = LightFM(loss='warp')
model.fit(COO[0],epochs=100,num_threads=4)

<lightfm.lightfm.LightFM at 0x1e27dad6860>

In [26]:
query_id = 105417

In [27]:
scores = model.predict(user_map[105417], np.arange(83))

In [29]:
def recommendation(model, data, user_ids,user_map,item_map):
    n_users, n_items = data.shape
    for user_id in user_ids:
        known_positives = invest[invest['客戶']==user_id]
        scores = model.predict(user_map[user_id], np.arange(n_items))
        top_items = []
        count = 0
        known_positive_list = list(known_positives['基金'])
        for x in np.argsort(-scores):
            if(item_map[x] not in known_positive_list):
                top_items.append(item_map[x])
                count += 1
                if(count==3):
                    break
        print("User %s" % user_id)
        print("Known positives:")
        print(known_positives[['客戶','基金','基金簡稱','風險等級','金額']].head())
        print("Recommended:")
        for x in top_items[:3]:
            print(x,':',fund_dict[x])

In [30]:
recommendation(model,COO[0],[query_id],user_map,item_map)

User 105417
Known positives:
         客戶  基金              基金簡稱 風險等級    金額
125  105417   1  野村優質基金-累積類型新臺幣計價  RR4  3000
126  105417   7       野村中小基金-累積類型  RR5  3000
127  105417  23          野村積極成長基金  RR4  3000
128  105417  28           野村高科技基金  RR5  3000
129  105417  93     野村日本領先基金-累積類型  RR4  3000
Recommended:
3 : {'基金簡稱': '野村成長基金', '風險等級': 'RR4'}
18 : {'基金簡稱': '野村台灣運籌基金', '風險等級': 'RR4'}
19 : {'基金簡稱': '野村鴻運基金', '風險等級': 'RR4'}


In [45]:
105417 in user_map

True

In [43]:
pickle.dump(model, open("lightFM_model.pkl","wb"), )
pickle.dump(COO, open("COO.pkl","wb"), )
pickle.dump(user_map, open("user_map.pkl","wb"), )
pickle.dump(item_map, open("item_map.pkl","wb"), )

In [67]:
pickle.dump(fund_dict, open("fund_dict.pkl","wb"), )

# Cross validation

In [32]:
(train,test) = random_train_test_split(COO[0],test_percentage=0.1)

<lightfm.lightfm.LightFM at 0x1df097adbe0>

In [33]:
val_model = LightFM(loss='warp', learning_rate=0.001)
val_model.fit(train,epochs=100)
print('Test precision:',precision_at_k(val_model,test,k=10).mean())
print('Train precision:',precision_at_k(val_model,train,k=10).mean())
print('Test auc:',auc_score(val_model,test).mean())
print('Train auc:',auc_score(val_model,train).mean())

Test precision: 0.06954546
Train precision: 0.19354619
Test auc: 0.84218377
Train auc: 0.86500937
