In [5]:
import pandas as pd
pd.set_option('display.max_columns', None)
from pandasticsearch import DataFrame, Select
from elasticsearch import Elasticsearch
from sklearn.linear_model import LogisticRegression
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np

In [2]:
es_ysyp_host = "115.29.34.243:9200"
unsername = 'elastic'
password = 'superzsh123'

es = Elasticsearch(
    [es_ysyp_host],
    http_auth=(unsername, password)
)

In [3]:
# 用户信息
user_info = es.search(index="index_user_info", body={"size": 1000}, timeout='30m')
user_info = Select.from_dict(user_info).to_pandas().drop(['_index', '_type', '_id', '_score'], axis=1).astype(str)
user_features = ['userid', 'ismember', 'os', 'phone', 'sex']
user_info = user_info[user_features]
user_info.isnull().sum() * 100 / len(user_info)

userid      0.0
ismember    0.0
os          0.0
phone       0.0
sex         0.0
dtype: float64

In [4]:
# 商品信息
item_info = es.search(index="syp", body={"size":10}, timeout='30m')
item_info = Select.from_dict(item_info).to_pandas().astype(str)
item_features = ['bbsid', 'memprice', 'gradeid', 'squareid', 'bbstype', 'discount']
item_info = item_info[item_features]
item_info.head()

Unnamed: 0,bbsid,memprice,gradeid,squareid,bbstype,discount
0,163316,8800.0,3,0,1,0.453608
1,163270,32800.0,3,0,1,0.0
2,163301,8200.0,1,24877,1,0.41
3,163271,98800.0,3,0,1,0.27831
4,163303,3999.0,1,24821,1,0.0


In [5]:
# 召回信息
pv_info = es.search(index="index_syp_user_suggest", body={"size":1000}, timeout='30m')
pv_info = Select.from_dict(pv_info).to_pandas().astype(str)
pv_features = ['userid', 'bbsid']
pv_info = pv_info[pv_features]
pv_info

Unnamed: 0,userid,bbsid
0,1098119554,20175756
1,1038111831,20175782
2,1038111831,20175749
3,1038111831,20175745
4,1038111831,20175773
...,...,...
116,1098119704,20175807
117,1098119704,20175812
118,1098119704,20175839
119,1098119704,20175821


In [6]:
pv_info.groupby('userid').count()

Unnamed: 0_level_0,bbsid
userid,Unnamed: 1_level_1
1038111556,8
1038111610,1
1038111778,10
1038111831,10
1098117798,13
1098119406,1
1098119552,8
1098119554,24
1098119569,8
1098119580,21


In [7]:
# 特征拼接
x_train = pd.merge(pd.merge(pv_info, user_info, how='left', on='userid'), item_info, how='left', on='bbsid')
x_train

Unnamed: 0,userid,bbsid,ismember,os,phone,sex,memprice,gradeid,squareid,bbstype,discount
0,1098119554,20175756,0,2,18810927806,0,,,,,
1,1038111831,20175782,0,2,15712893526,2,,,,,
2,1038111831,20175749,0,2,15712893526,2,,,,,
3,1038111831,20175745,0,2,15712893526,2,,,,,
4,1038111831,20175773,0,2,15712893526,2,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
116,1098119704,20175807,0,2,15210115615,1,,,,,
117,1098119704,20175812,0,2,15210115615,1,,,,,
118,1098119704,20175839,0,2,15210115615,1,,,,,
119,1098119704,20175821,0,2,15210115615,1,,,,,


In [8]:
# 生成模拟点击数据（1-点击 / 0-没点击）
# 点击 index_bbs_browse_record 
import random
label = [[random.randint(0,1)] for i in range(121)]

In [9]:
# ETL + 特征工程
#x_train.phone = SimpleImputer(strategy='most_frequent').fit_transform(x_train.phone.values.reshape(-1, 1))
#x_train.phone = x_train.phone.apply(lambda phone:phone[0:3])
#x_train.sex = SimpleImputer(strategy='most_frequent').fit_transform(x_train.sex.values.reshape(-1, 1))
x_train = x_train.fillna('0')
#x_train.userid = x_train.userid.astype('object')
x_feature = pd.get_dummies(x_train)
x_feature.head()

Unnamed: 0,userid_1038111556,userid_1038111610,userid_1038111778,userid_1038111831,userid_1098117798,userid_1098119406,userid_1098119552,userid_1098119554,userid_1098119569,userid_1098119580,userid_1098119649,userid_1098119653,userid_1098119678,userid_1098119703,userid_1098119704,bbsid_20175736,bbsid_20175737,bbsid_20175745,bbsid_20175746,bbsid_20175747,bbsid_20175748,bbsid_20175749,bbsid_20175750,bbsid_20175756,bbsid_20175769,bbsid_20175772,bbsid_20175773,bbsid_20175777,bbsid_20175781,bbsid_20175782,bbsid_20175783,bbsid_20175785,bbsid_20175789,bbsid_20175790,bbsid_20175791,bbsid_20175792,bbsid_20175793,bbsid_20175807,bbsid_20175808,bbsid_20175812,bbsid_20175813,bbsid_20175814,bbsid_20175815,bbsid_20175816,bbsid_20175817,bbsid_20175819,bbsid_20175820,bbsid_20175821,bbsid_20175823,bbsid_20175826,bbsid_20175839,ismember_0,os_0,os_2,phone_0,phone_13131333204,phone_13655556666,phone_15001237089,phone_15188987541,phone_15210115615,phone_15712893526,phone_15922223333,phone_17744474185,phone_18646117093,phone_18810927806,phone_18911741863,sex_0,sex_1,sex_2,memprice_0,gradeid_0,squareid_0,bbstype_0,discount_0
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,1,1,1
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1


In [10]:
pipeline = PMMLPipeline([("classifier", LogisticRegression())])
pipeline.fit(x_feature, label)
sklearn2pmml(pipeline, "ysp_rank_model.demo-0.2.pmml", with_repr = True)

  y = column_or_1d(y, warn=True)
