# 模型预测

In [19]:
#导入必要的包
#计算与可视化
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

#评估指标、编码与模型
from  sklearn.metrics import accuracy_score,  auc, confusion_matrix, roc_auc_score, classification_report,cohen_kappa_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import xgboost as xgb
from scipy.optimize import fmin_powell
#from ml_metrics import quadratic_weighted_kappa

#from wf_analyse.analyse import base
import pickle

## 加载数据

In [30]:
# Load the dataset "insurance-test-sample"
 
import wfio
_INPUT = '{"type":15,"uri":"awss3fcd548f44cdd456991721d04d43ec5ad/sd_911a1dde7776496b9c2dd78f6fa00e52"}'
 
# Read the dataset and return it as a DataFrame.
# as_spark: True returns a Spark DataFrame, False returns a pandas DataFrame (default: False).
test = wfio.read_dataframe(_INPUT,as_spark = False)

In [46]:
# Keep only the first 10 rows; every later cell then works on this small sample.
test = test[:10]

In [32]:
def to_lower(str):
    """Return the lower-cased form of the given string."""
    # NOTE: the parameter name shadows the built-in ``str``; it is kept as-is
    # so that any keyword caller keeps working.
    lowered = str.lower()
    return lowered
# Normalise every column name to lower case; later cells address columns by lower-case names.
test.columns = [to_lower(column_name) for column_name in test.columns]

In [33]:
# Remove the personal-information columns in place; they never reach the model.
test.drop(
    columns=["name", "email", "mobile_number", "sex", "id_card", "addr"],
    inplace=True,
)

In [34]:
def eval_wrapper(yhat, y):
    """Score predictions against labels with quadratic-weighted Cohen's kappa.

    The raw predictions are rounded to the nearest integer and clipped to the
    [min(y), max(y)] range of the integer-cast labels before scoring.

    Args:
        yhat: array-like of raw (possibly fractional) predictions.
        y: array-like of true labels; cast to int.

    Returns:
        Whatever ``cohen_kappa_score(predictions, labels, weights='quadratic')``
        returns.
    """
    labels = np.asarray(y).astype(int)
    lowest, highest = labels.min(), labels.max()
    rounded = np.round(np.asarray(yhat))
    predictions = np.clip(rounded, lowest, highest).astype(int)
    return cohen_kappa_score(predictions, labels, weights='quadratic')
    
    
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    """Shift the predictions of one class bin by an offset and score the result.

    ``data`` is a 3-row array: row 0 holds the raw predictions, row 1 the
    offset-adjusted predictions and row 2 the true labels. Every sample whose
    raw prediction truncates to ``sv`` gets ``raw + bin_offset`` written into
    row 1. ``data`` is modified in place.

    Args:
        data: 2-D numpy array laid out as described above.
        bin_offset: value added to the raw predictions of the selected bin.
        sv: integer bin (class) number to adjust.
        scorer: callable ``scorer(predictions, labels)``; defaults to
            ``eval_wrapper``.

    Returns:
        ``scorer(data[1], data[2])`` evaluated after the adjustment.
    """
    raw_predictions = data[0]
    # Row 0 is only read here, so one mask serves both the read and the write.
    in_bin = raw_predictions.astype(int) == sv
    data[1, in_bin] = raw_predictions[in_bin] + bin_offset
    return scorer(data[1], data[2])

def new_target3(row):
    """Classify a row as 'low_end' or 'non_low_end'.

    A row is 'low_end' when at least one of these holds: ``BMI_Wt`` is
    'under_weight', ``Old_Young`` is 'young', or ``Thin_Fat`` is 'thin'.
    The columns are checked in that order and checking stops at the first hit.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexed by column name.

    Returns:
        'low_end' or 'non_low_end'.
    """
    low_end_markers = (
        ('BMI_Wt', 'under_weight'),
        ('Old_Young', 'young'),
        ('Thin_Fat', 'thin'),
    )
    if any(row[column] == marker for column, marker in low_end_markers):
        return 'low_end'
    return 'non_low_end'


def new_target1(row):
    """Classify a row as 'extremely_risky' or 'not_extremely_risky'.

    A row is 'extremely_risky' when at least one of these holds: ``BMI_Wt`` is
    'overweight', ``Old_Young`` is 'old', or ``Thin_Fat`` is 'fat'.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexed by column name.

    Returns:
        'extremely_risky' or 'not_extremely_risky'.
    """
    is_extreme = (
        row['BMI_Wt'] == 'overweight'
        or row['Old_Young'] == 'old'
        or row['Thin_Fat'] == 'fat'
    )
    return 'extremely_risky' if is_extreme else 'not_extremely_risky'

def acc(y, yhat):
    """Compute accuracy after rounding and clipping the predictions.

    Predictions are rounded to the nearest integer and clipped to the
    [min(y), max(y)] range of the integer-cast labels, then compared with the
    labels through ``accuracy_score``.

    Args:
        y: array-like of true labels; cast to int.
        yhat: array-like of raw (possibly fractional) predictions.

    Returns:
        Whatever ``accuracy_score(labels, predictions)`` returns.
    """
    truth = np.asarray(y).astype(int)
    predicted = np.round(np.asarray(yhat))
    predicted = np.clip(predicted, truth.min(), truth.max()).astype(int)
    return accuracy_score(truth, predicted)

In [35]:
# Global settings
# Columns removed from the frame before it is converted to an XGBoost matrix (the label column).
columns_to_drop = ['response']
# NOTE(review): not referenced anywhere else in this notebook; presumably the boosting-round count used at training time — confirm.
xgb_num_rounds = 700
# Number of class bins that receive a per-bin offset in the final prediction cell.
num_classes = 8

In [36]:
# Keep a separate copy of the labels.
test_label = test["response"].copy()
# NOTE: ``all_data`` is only a second name for ``test`` (no copy is made and nothing is
# concatenated here), so every in-place change to ``all_data`` below is also visible via ``test``.
all_data = test

## 数据预处理

In [37]:
# Create new features
# Feature encoding: replace product_info_2 with integer codes.
# NOTE(review): pd.factorize numbers the values by order of first appearance in THIS frame;
# confirm that this matches the encoding the model was trained with.
all_data['product_info_2'] = pd.factorize(all_data['product_info_2'])[0]

print('Eliminate missing values')    
# Fill missing values with -1
all_data.fillna(-1, inplace=True)

# Cast the label to int
all_data['response'] = all_data['response'].astype(int)

Eliminate missing values


In [38]:
def _quartile_bucket(values, labels):
    """Label each value by its position relative to the column's quartiles.

    Args:
        values: numeric pandas Series to discretise.
        labels: three labels, in order, for values <= Q1, values in (Q1, Q3]
            and values > Q3 (Q1/Q3 are the 0.25/0.75 quantiles of ``values``).

    Returns:
        numpy array with one label per element of ``values``. An element that
        satisfies none of the three comparisons (e.g. NaN) is labelled '0'.
    """
    # Compute each quartile once instead of once per comparison.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    conditions = [
        values <= q1,
        (values > q1) & (values <= q3),
        values > q3,
    ]
    # The default is spelled out as a string: newer NumPy versions refuse to
    # combine string choices with np.select's implicit integer default 0
    # (TypeError: no common dtype). '0' is what older versions produced by
    # stringifying that default, so results are unchanged.
    return np.select(conditions, list(labels), default='0')


# BMI discretisation
all_data['BMI_Wt'] = _quartile_bucket(all_data['bmi'], ['under_weight', 'average', 'overweight'])

# Age discretisation
all_data['Old_Young'] = _quartile_bucket(all_data['ins_age'], ['young', 'average', 'old'])

# Height discretisation
all_data['Short_Tall'] = _quartile_bucket(all_data['ht'], ['short', 'average', 'tall'])

# Weight discretisation
all_data['Thin_Fat'] = _quartile_bucket(all_data['wt'], ['thin', 'average', 'fat'])

In [39]:
# Derived feature: row-wise 'extremely_risky' / 'not_extremely_risky' flag computed by new_target1
all_data['extreme_risk'] = all_data.apply(new_target1,axis=1)
# Derived feature: row-wise 'low_end' / 'non_low_end' flag computed by new_target3
all_data['low_end_risk'] = all_data.apply(new_target3,axis=1)

In [40]:
# Replace the string labels of each derived column with integer codes
# (pd.factorize returns (codes, uniques); only the codes are kept).
for derived_column in (
    'BMI_Wt',
    'Old_Young',
    'Short_Tall',
    'Thin_Fat',
    'extreme_risk',
    'low_end_risk',
):
    all_data[derived_column] = pd.factorize(all_data[derived_column])[0]

In [41]:
# Drop the intermediate bucket columns and "empty_name" in place; only the two risk flags are kept.
all_data.drop(
    columns=["BMI_Wt", "Short_Tall", "Thin_Fat", "Old_Young", "empty_name"],
    inplace=True,
)

In [42]:
# Convert to XGBoost's DMatrix format, leaving out the label column(s) listed in columns_to_drop.
xgtest = xgb.DMatrix(
    all_data.drop(columns=columns_to_drop)
)

## 加载模型，进行预测

In [43]:
# Load the trained model from disk. The ``with`` block closes the file handle
# as soon as unpickling finishes (the bare ``open(...)`` call left it open).
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# model files that come from a trusted source.
with open("/home/nbuser/work_e9695a8d7ac541d7a70a0060c591183e/model/xgboost.model", "rb") as model_file:
    model = pickle.load(model_file)



In [44]:
# Predict on the test matrix, limiting the number of trees used via ``ntree_limit``.
# NOTE(review): ``ntree_limit`` was deprecated in later XGBoost releases in favour of
# ``iteration_range`` — confirm against the installed XGBoost version before upgrading.
test_preds = model.predict(xgtest, ntree_limit=model.best_iteration)
#print('Test kappa score is:', eval_wrapper( test_preds, test_label))

In [48]:
# Keep the raw predictions inside (-1, 9) so that truncating them to int gives a bin index in 0..8.
test_preds = np.clip(test_preds, -0.99, 8.99)

# Load the previously saved per-bin offsets (nothing is optimised in this cell).
# The ``with`` block closes the file handle once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load trusted files.
with open("/home/nbuser/work_e9695a8d7ac541d7a70a0060c591183e/model/final_offset.list", "rb") as offsets_file:
    offsets = pickle.load(offsets_file)

# Apply the offsets obtained on the training set to the test predictions.
# Row 0: raw predictions, row 1: offset-adjusted predictions, row 2: true labels.
data = np.vstack((test_preds, test_preds, test['response'].values))
# Row 0 is never modified, so the bin index of every sample is computed once.
bin_index = data[0].astype(int)
for j in range(num_classes):
    in_bin = bin_index == j
    data[1, in_bin] = data[0, in_bin] + offsets[j]

# Final class: adjusted prediction clipped to [1, 8] and rounded to the nearest integer.
final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)
print("10 samples's prediction results:", final_test_preds)

10 samples's prediction results: [5 1 1 6 6 8 7 8 8 8]
