In [1]:
import os
import pickle as pkl
import subprocess

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Location of the libFM executable. Raw string: the Windows path contains
# backslashes (`\s`, `\l`) that are invalid escape sequences in a normal literal.
LIBFM_PATH = r'D:\software\libfm-1.40.windows\libfm.exe'
# NOTE(review): not referenced by any other cell visible in this notebook.
model_path = '../dataset/recsysmode.fm'

In [3]:
# Sanity check: run libFM with no arguments so it prints its version banner and option list.
!$LIBFM_PATH

----------------------------------------------------------------------------
libFM
  Version: 1.40
  Author:  Steffen Rendle, steffen.rendle@uni-konstanz.de
  WWW:     http://www.libfm.org/
  License: Free for academic use. See license.txt.
----------------------------------------------------------------------------
-cache_size     cache size for data storage (only applicable if data is
                in binary format), default=infty
-dim            'k0,k1,k2': k0=use bias, k1=use 1-way interactions,
                k2=dim of 2-way interactions; default=1,1,8
-help           this screen
-init_stdev     stdev for initialization of 2-way factors; default=0.1
-iter           number of iterations; default=100
-learn_rate     learn_rate for SGD; default=0.1
-meta           filename for meta information about data set
-method         learning method (SGD, SGDA, ALS, MCMC); default=MCMC
-out            filename for output
-regular        'r0,r1,r2' for SGD and ALS: r0=bias regularization,
  

## FM模型训练和预测，使用libFM库蒙特卡洛法

In [4]:
def FM(train_file, test_file, classification=True, rank=10, n_iter=150):
    """Train libFM with MCMC on ``train_file`` and return its predictions for ``test_file``.

    The libFM binary at ``LIBFM_PATH`` is launched with an explicit argument
    list (no shell), so paths and the ``-dim`` value reach libFM exactly as
    written here. NOTE(review): the previous shell-magic version wrapped the
    ``-dim`` value in single quotes, which some shells pass through literally;
    compare against old results if they matter.

    Parameters
    ----------
    train_file : str
        Path of the training file passed to ``-train``.
    test_file : str
        Path of the file passed to ``-test``; one prediction is produced per row.
    classification : bool, default True
        ``True`` runs ``-task c``; ``False`` runs ``-task r`` (regression).
    rank : int, default 10
        Dimension of the 2-way interaction factors (third ``-dim`` component).
    n_iter : int, default 150
        Number of iterations passed to ``-iter``.

    Returns
    -------
    numpy.ndarray
        Flattened values read from libFM's output file.

    Raises
    ------
    RuntimeError
        If libFM exits with a non-zero code or does not write the output file.
    """
    task = 'c' if classification else 'r'
    output_file = 'output_.libfm'

    # Drop any result left over from an earlier run so that a failed run can
    # never be mistaken for a fresh set of predictions.
    if os.path.exists(output_file):
        os.remove(output_file)

    command = [
        LIBFM_PATH,
        '-task', task,
        '-method', 'mcmc',
        '-train', str(train_file),
        '-test', str(test_file),
        '-iter', str(n_iter),
        '-dim', '1,1,{}'.format(rank),
        '-out', output_file,
    ]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # keep stderr interleaved with stdout in the log
        universal_newlines=True,
    )
    console_output = completed.stdout.splitlines()
    print(console_output)

    if completed.returncode != 0 or not os.path.exists(output_file):
        raise RuntimeError(
            'libFM did not produce {} (exit code {})'.format(output_file, completed.returncode)
        )

    libfm_pred = pd.read_csv(output_file, header=None).values.flatten()
    return libfm_pred

In [5]:
# Train/test files fed to libFM (presumably feature-scaled, judging by the file names).
train_file = '../dataset/recsys_train_scaling.txt'
test_file = '../dataset/recsys_test_scaling.txt'
# Targets are numeric ratings (the recorded libFM log reports min_target=2, max_target=10),
# so libFM is run as a regression task here.
libfm_predict = FM(train_file, test_file, classification=False)

['----------------------------------------------------------------------------', 'libFM', '  Version: 1.40', '  Author:  Steffen Rendle, steffen.rendle@uni-konstanz.de', '  WWW:     http://www.libfm.org/', '  License: Free for academic use. See license.txt.', '----------------------------------------------------------------------------', 'Loading train...\t', 'has x = 0', 'has xt = 1', 'num_rows=180000\tnum_values=3546761\tnum_features=43772\tmin_target=2\tmax_target=10', 'Loading test... \t', 'has x = 0', 'has xt = 1', 'num_rows=19813\tnum_values=390516\tnum_features=43770\tmin_target=2\tmax_target=10', '#relations: 0', 'Loading meta data...\t', '#Iter=  0\tTrain=2.54765\tTest=2.59464', '#Iter=  1\tTrain=2.44985\tTest=2.54826', '#Iter=  2\tTrain=2.40291\tTest=2.51439', '#Iter=  3\tTrain=2.36168\tTest=2.48695', '#Iter=  4\tTrain=2.33111\tTest=2.46372', '#Iter=  5\tTrain=2.30215\tTest=2.44287', '#Iter=  6\tTrain=2.27456\tTest=2.42418', '#Iter=  7\tTrain=2.24887\tTest=2.40643', '#Iter=  

In [6]:
# NOTE(review): despite "train" in the name, this is the TEST split (same path as `test_file`).
train_file_without_escape_path = '../dataset/recsys_test_scaling.txt'

In [7]:
from sklearn.datasets import load_svmlight_file

def get_test_data(path, n_features=None):
    """Load a svmlight/libsvm-format file and return ``(X, y)``.

    Despite the name, this notebook uses the function for train and test files alike.

    Parameters
    ----------
    path : str
        File to load.
    n_features : int, optional
        Forwarded to ``load_svmlight_file``. ``None`` (the default) infers the
        width from the file, which is the original behaviour. Pass the training
        matrix's column count when the loaded matrix must line up with it: the
        libFM log recorded in this notebook reports 43772 features for the
        train file but 43770 for the test file, so inferred widths can differ.

    Returns
    -------
    tuple
        ``(X, y)``: the feature matrix and target array returned by
        ``load_svmlight_file``.
    """
    X, y = load_svmlight_file(path, n_features=n_features)
    return X, y

In [8]:
# Load the test split (see the path above); `y` holds its true targets, used below for the MSE.
X, y = get_test_data(train_file_without_escape_path)

In [9]:
y

array([ 8.,  8.,  8., ..., 10., 10.,  8.])

In [10]:
libfm_predict_series = pd.Series(libfm_predict)

In [11]:
# Convert each prediction with int(), i.e. truncate toward zero (7.9 -> 7), not round.
libfm_predict_series_int = libfm_predict_series.map(int)

In [12]:
# NOTE(review): this displays every prediction as a Python list; a `.head()` preview would keep the output short.
libfm_predict_series_int.tolist()

[8,
 7,
 7,
 3,
 6,
 7,
 4,
 4,
 8,
 8,
 7,
 4,
 4,
 9,
 6,
 5,
 7,
 8,
 8,
 9,
 8,
 8,
 4,
 9,
 4,
 7,
 8,
 8,
 8,
 5,
 7,
 9,
 7,
 7,
 8,
 8,
 7,
 8,
 8,
 8,
 4,
 7,
 8,
 8,
 4,
 8,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 5,
 7,
 8,
 7,
 5,
 4,
 7,
 6,
 8,
 6,
 7,
 8,
 5,
 7,
 8,
 4,
 6,
 7,
 7,
 5,
 7,
 7,
 7,
 4,
 4,
 8,
 6,
 9,
 7,
 8,
 8,
 7,
 7,
 3,
 6,
 7,
 2,
 4,
 7,
 6,
 8,
 3,
 8,
 8,
 6,
 7,
 8,
 7,
 8,
 8,
 6,
 4,
 5,
 8,
 8,
 3,
 8,
 6,
 6,
 8,
 8,
 9,
 8,
 7,
 7,
 6,
 8,
 4,
 8,
 8,
 8,
 6,
 7,
 8,
 7,
 5,
 6,
 6,
 4,
 7,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 5,
 8,
 5,
 4,
 7,
 6,
 4,
 8,
 8,
 6,
 4,
 6,
 7,
 6,
 8,
 8,
 7,
 8,
 6,
 5,
 6,
 8,
 5,
 8,
 6,
 8,
 6,
 6,
 9,
 4,
 4,
 9,
 6,
 5,
 4,
 6,
 8,
 6,
 9,
 7,
 4,
 6,
 5,
 8,
 7,
 8,
 4,
 5,
 5,
 7,
 6,
 6,
 4,
 6,
 8,
 3,
 7,
 7,
 4,
 6,
 8,
 8,
 8,
 7,
 7,
 6,
 7,
 5,
 8,
 7,
 4,
 5,
 7,
 7,
 3,
 5,
 9,
 8,
 9,
 6,
 8,
 6,
 5,
 7,
 6,
 6,
 6,
 8,
 4,
 8,
 4,
 7,
 5,
 6,
 7,
 7,
 7,
 7,
 6,
 7,
 4,
 4,
 4,
 3,
 7,
 7,
 4,
 7,
 8,
 6,


In [13]:
y

array([ 8.,  8.,  8., ..., 10., 10.,  8.])

## 转换为评分后的测试集MSE

In [14]:
# Test MSE of the integer-truncated predictions against the true targets.
mean_squared_error(y, libfm_predict_series_int.tolist())

4.676727401201232

In [15]:
# Test MSE of the raw (un-truncated) libFM predictions, for comparison with the cell above.
mean_squared_error(y, libfm_predict_series.tolist())

4.384792147533347

In [16]:
y[0:10]

array([ 8.,  8.,  8.,  4., 10., 10.,  4.,  4.,  8., 10.])

In [17]:
libfm_predict_series[100:500]

100    7.34519
101    8.77586
102    8.65679
103    6.59199
104    4.44896
105    5.88468
106    8.01670
107    8.73353
108    3.67003
109    8.73270
110    6.67533
111    6.72056
112    8.27252
113    8.34008
114    9.35346
115    8.23754
116    7.88176
117    7.02376
118    6.62872
119    8.50161
120    4.00111
121    8.87787
122    8.46534
123    8.18763
124    6.36202
125    7.86422
126    8.11483
127    7.13508
128    5.26463
129    6.07317
        ...   
470    6.50967
471    7.42814
472    7.76787
473    7.18996
474    7.04490
475    7.19295
476    8.28238
477    3.87494
478    8.38499
479    8.26623
480    5.77554
481    7.41266
482    7.62928
483    4.75189
484    7.42415
485    7.27891
486    4.51170
487    6.18790
488    6.02756
489    7.73879
490    8.43980
491    6.86265
492    4.76168
493    6.59557
494    8.57539
495    8.32975
496    7.92331
497    3.82961
498    8.57978
499    4.33842
Length: 400, dtype: float64

## 向量转换器

In [18]:
# Load the pickled vectorizer whose `.transform` is applied to feature dicts further down
# (presumably the DictVectorizer fitted at training time — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
v_from_pkl = None
with open('../dataset/dict2vec', 'rb') as f:
    v_from_pkl = pkl.load(f)

## 频度统计字典

In [19]:
# Lookup tables consumed by convert_dataframe_2_dict_list, which compares their values against 2
# (presumably name -> occurrence count, per the "frequency statistics" header — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
actors_dict = None
director_dict = None

with open('../dataset/actors_dict', 'rb') as f:
    actors_dict = pkl.load(f)
with open('../dataset/director_dict', 'rb') as f:
    director_dict = pkl.load(f)

## 与训练样本一样的方式构建预测数据的输入矩阵

In [20]:
def convert_dataframe_2_dict_list(df_main, actors_dict, director_dict, invalid_data_list=None):
    """Turn each row of ``df_main`` into a feature dict.

    Every ``|``-separated token in ``type`` and ``trait``, the ``region`` and
    ``UserId`` values and ``str(MovieId)`` become keys with value 1. Actors and
    directors whose value in the lookup dict is below 2 are merged into the
    ``'other_actor'`` / ``'other_director'`` keys; the rest become keys of
    their own. The numeric columns are copied through under their column name.

    A row that mentions an actor or director missing from the lookup dicts is
    reported with ``print``, recorded in ``invalid_data_list`` and skipped, so
    the returned list can be shorter than ``df_main``.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Must contain the columns ``type``, ``actors``, ``region``, ``UserId``,
        ``MovieId``, ``rat``, ``rmax``, ``rmin``, ``ravg``, ``rcount``,
        ``rsum``, ``rmedian``, ``TIME_DIS``, ``director`` and ``trait``.
    actors_dict, director_dict : dict
        Name -> number lookups (presumably occurrence counts — TODO confirm).
    invalid_data_list : list, optional
        If given, the index labels of skipped rows are appended to it.

    Returns
    -------
    list of dict
        One feature dict per valid row, in row order.
    """
    if invalid_data_list is None:
        invalid_data_list = []

    numeric_columns = ('rat', 'rmax', 'rmin', 'ravg', 'rcount', 'rsum', 'rmedian', 'TIME_DIS')
    data_dict_list = []

    # iterrows() yields (label, row) pairs, so this also works when the index
    # is not 0..n-1 (positional `.iloc[label]` would pick the wrong row or fail).
    for i, row in df_main.iterrows():
        actors = row['actors'].split('|')
        directors = row['director'].split('|')

        # Validate before building anything: names that were never seen cannot be encoded.
        has_unknown_actor = any(s_actor not in actors_dict for s_actor in actors)
        has_unknown_director = any(s_director not in director_dict for s_director in directors)
        if has_unknown_actor or has_unknown_director:
            print('invalid data index is ' + str(i))
            invalid_data_list.append(i)
            continue

        _dict = {}
        # type
        for s_type in row['type'].split('|'):
            _dict[s_type] = 1
        # actors: values below 2 share one bucket
        for s_actor in actors:
            if actors_dict[s_actor] < 2:
                _dict['other_actor'] = 1
            else:
                _dict[s_actor] = 1
        # region
        _dict[row['region']] = 1
        # user and movie ids
        _dict[row['UserId']] = 1
        _dict[str(row['MovieId'])] = 1
        # numeric features are passed through unchanged
        for column in numeric_columns:
            _dict[column] = row[column]
        # director: values below 2 share one bucket
        for s_director in directors:
            if director_dict[s_director] < 2:
                _dict['other_director'] = 1
            else:
                _dict[s_director] = 1
        # trait
        for s_trait in row['trait'].split('|'):
            _dict[s_trait] = 1
        data_dict_list.append(_dict)
    return data_dict_list

In [21]:
df_data = df_data.fillna(0)  # no new data is available, so the recommendation prediction cannot be tried

NameError: name 'df_data' is not defined

In [None]:
# Use the lookup dicts loaded from the pickles above; `actors_list` / `director_list`
# were never defined in this notebook.
dict_list = convert_dataframe_2_dict_list(df_data, actors_dict, director_dict)

In [None]:
dict_list

In [None]:
predict_X = v_from_pkl.transform(dict_list)

In [None]:
predict_file = '../dataset/recsys_predict.txt'

# dump_svmlight_file needs a target vector; these rows are unlabeled, so zeros act as placeholders.
dump_svmlight_file(predict_X, np.zeros(predict_X.shape[0]), predict_file)

In [None]:
# `train_file` holds numeric rating targets (the earlier libFM run on it used regression and
# logged min_target=2, max_target=10), so score the new rows with the same regression task.
libfm_predict_final = FM(train_file, predict_file, classification=False)

In [None]:
libfm_predict_final

## 调用FM打分

In [None]:
# Input files for the logistic-regression model.
# NOTE(review): `train_file_lr_path` is not used by any cell visible below; training data is
# loaded from `train_lr_path` instead.
train_file_lr_path = '../dataset/recsys_train_scaling_lr.txt'
test_file_lr_path = '../dataset/recsys_test_scaling_lr.txt'

In [None]:
train_lr_path = '../dataset/recsys_train_lr.txt'

In [None]:
train_X_lr, train_y = get_test_data(train_lr_path)

In [None]:
# NOTE(review): the test matrix comes from the "*_scaling_lr" file while the training matrix
# above comes from the non-"scaling" file — confirm the two are meant to be paired.
test_X_lr, test_y = get_test_data(test_file_lr_path)

## LR模型fit

In [None]:
# L2-penalised logistic regression; C is the inverse regularisation strength, so 0.01 regularises strongly.
lr = LogisticRegression(C=0.01, penalty='l2')
lr.fit(train_X_lr, train_y)

In [None]:
# Re-reads the same file as `train_X_lr` / `train_y` above; this copy is used to fit the scaler.
train_X_4_scale, train_y_4_scale = get_test_data(train_lr_path)

In [None]:
train_X_4_scale.shape

In [None]:
train_y_4_scale[train_y_4_scale == 1].shape

## LR预测（最终实现参考recsys_core）

In [None]:
# Fit a max-abs scaler on the training matrix so new rows can be scaled the same way.
scaler = preprocessing.MaxAbsScaler()
scaler.fit(train_X_4_scale)

In [None]:
# Vectorise an empty feature dict as a placeholder sample for trying out the prediction path.
temp_one = v_from_pkl.transform({})

In [None]:
temp_one_X = scaler.transform(temp_one)

In [None]:
temp_one_X[temp_one_X != 0]

In [None]:
# NOTE(review): rebinding `predict_X` to its scaled version makes this cell non-idempotent —
# running it twice scales the data twice.
predict_X = scaler.transform(predict_X)

In [None]:
# NOTE(review): this scores the unscaled `temp_one`; the scaled `temp_one_X` computed above is
# not used. `lr` was fit on `train_X_lr` as loaded from file — confirm which input is intended.
lr_predict_final = lr.predict_proba(temp_one)

In [None]:
lr_predict_final