In [9]:
import pandas as pd
import numpy as np
import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import xlearn as xl

In [10]:
# Read the training and test CSV files (comma-separated) from ./input.
train, test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("./input/train_set.csv", "./input/test_set.csv")
)

### 数据预处理

In [11]:
# Tag every test row with a sentinel label (-1) so that train and test can be
# stacked into one frame for shared preprocessing and separated again later.
test['y'] = -1
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. ignore_index=True produces the same fresh 0..n-1
# index that append(...).reset_index(drop=True) used to give.
data = pd.concat([train, test], ignore_index=True)

# Column groups, named after the preprocessing each group receives below:
# integer label encoding, one-hot encoding, and min-max scaling respectively.
label_features = ['default','housing','loan']
onehot_features = ['job','marital','contact','month','poutcome']
num_features = ['age','balance','day','duration','campaign','pdays','previous']

In [12]:
def education_cut(x):
    """Translate an education level name into an ordinal code.

    'unknown' -> 0, 'primary' -> 1, 'secondary' -> 2, 'tertiary' -> 3.
    Any other value yields None.
    """
    ordered_levels = ('unknown', 'primary', 'secondary', 'tertiary')
    # The position in the tuple is the code; compare with == in the same order
    # as the levels are listed.
    for code, level in enumerate(ordered_levels):
        if x == level:
            return code
    return None
# Replace the education strings with their ordinal codes.
data['education'] = data['education'].map(education_cut)

# Pick the object-dtype columns that are listed in label_features, keeping the
# frame's column order, and integer-encode each with its own LabelEncoder
# fitted on the string-cast values.
object_columns = data.select_dtypes(object).columns
label_col = object_columns[object_columns.isin(label_features)].tolist()
for col in label_col:
    lbl = LabelEncoder()
    encoded = lbl.fit_transform(data[col].astype(str))
    data[col] = encoded

In [13]:
# One-hot encode the object-dtype columns listed in onehot_features
# (this expands each category into indicator columns; it is not a normalization).
object_columns = data.select_dtypes(object).columns
onehot_col = object_columns[object_columns.isin(onehot_features)].tolist()
indicator_frames = [pd.get_dummies(data[col], prefix=col) for col in onehot_col]
# Drop the original categorical columns and append their indicator columns in
# the same order as encoding them one column at a time.
data = pd.concat([data.drop(onehot_col, axis=1)] + indicator_frames, axis=1)

# Rescale the numeric features with min-max scaling.
mms = MinMaxScaler()
scaled_values = mms.fit_transform(data[num_features])
data[num_features] = scaled_values

# Model inputs: every column except the 'ID' and 'y' columns.
feats = [f for f in data.columns if f not in ['ID','y']]
data.head()

Unnamed: 0,ID,age,education,default,balance,housing,loan,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,1,0.324675,3,0,0.075445,1,0,0.266667,0.03865,0.017544,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,2,0.311688,1,0,0.118888,1,0,0.2,0.025509,0.0,0.288991,0.007273,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0.376623,2,0,0.073748,1,1,0.433333,0.01984,0.017544,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,4,0.12987,2,0,0.063779,1,1,0.566667,0.044834,0.017544,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,5,0.311688,2,0,0.099804,1,0,0.666667,0.048183,0.070175,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


### 建模

##### 1，初始化fm模型（使用xlearn自带的FM模型）

In [14]:
# Create the FM (factorization machine) model object provided by xlearn.
fm_model = xl.create_fm()

#####  2，获取训练集和验证集

In [15]:
# Rows whose label is the -1 sentinel are the test rows; every other row is
# labelled training data.
is_train_row = data['y'] != -1
is_test_row = data['y'] == -1

tar = data.loc[is_train_row, feats]
y = data.loc[is_train_row, 'y']
test_x = data.loc[is_test_row, feats]
test_y = data.loc[is_test_row, 'y']

# Hold out 30% of the labelled rows for validation, with a fixed random_state.
train_x, vali_x, train_y, vali_y = train_test_split(
    tar,
    y,
    test_size=0.3,
    random_state=42,
)

#####  3，设置DMatrix格式

In [16]:
# Wrap each features/labels pair in an xlearn DMatrix.
xdm_train = xl.DMatrix(train_x, train_y)
xdm_vali = xl.DMatrix(vali_x, vali_y)
# test_y holds only the -1 placeholder label (rows selected with y == -1).
xdm_test = xl.DMatrix(test_x, test_y)

##### 4，设置模型的格式和参数

In [17]:
# Attach the training and validation DMatrix objects to the model.
fm_model.setTrain(xdm_train)
fm_model.setValidate(xdm_vali)

# Parameters:
#  0. binary classification task
#  1. learning rate: 0.1
#  2. lambda: 0.002
#  3. metric: accuracy
param = {'task':'binary', 'lr':0.1,
         'lambda':0.002, 'metric':'acc'}

#####  5，训练模型，并输出参数

In [18]:
# Train with `param`; the second argument is the model file path, the same path
# that is later passed to predict().
fm_model.fit(param, './output/model_fm.out')

##### 6，设置测试集数据进行预测

In [19]:
# Produce the output as binary-class probabilities
fm_model.setTest(xdm_test)  # Test data
fm_model.setSigmoid()  # Convert output to 0-1

In [20]:
# Run prediction on the test data set above, passing the model file path that
# was given to fit().
res = fm_model.predict("./output/model_fm.out")

# Print the predictions (NumPy abbreviates long arrays with "...").
print(res)

[0.10245499 0.10828665 0.11862546 ... 0.05048832 0.11568101 0.09455083]
