In [1]:
import pandas as pd
import lightgbmmt as lgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import roc_auc_score

这里使用的房屋售价预测数据，y1是销售价，y2是将平均房价>120的设置为1,<120的设置为0   
所以，对于y1是回归任务，对于y2是分类任务

In [2]:
df=pd.read_csv("./data/train.csv")

In [3]:
df["y1"]=df["SalePrice"]
df["y2"]=(df["y1"]/df["GrLivArea"]).apply(lambda x:1 if x>120 else 0)

In [4]:
df=df.drop("Id",axis=1).drop("SalePrice",axis=1)

In [5]:
df=df.fillna(-1)

In [6]:
cate_cols=[col for col in  df.columns if df[col].dtype=='object']

In [7]:
for col in cate_cols:
    df[col]=LabelEncoder().fit_transform(df[col].apply(lambda x:str(x)))

In [8]:
df.head(5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,y1,y2
0,60,3,65.0,8450,1,0,3,3,0,4,...,0,0,0,0,2,2008,8,4,208500,1
1,20,3,80.0,9600,1,0,3,3,0,2,...,0,0,0,0,5,2007,8,4,181500,1
2,60,3,68.0,11250,1,0,0,3,0,4,...,0,0,0,0,9,2008,8,4,223500,1
3,70,3,60.0,9550,1,0,0,3,0,0,...,0,0,0,0,2,2006,8,0,140000,0
4,60,3,84.0,14260,1,0,0,3,0,2,...,0,0,0,0,12,2008,8,4,250000,0


In [9]:
trn_df=df[:1200]
val_df=df[1200:]

In [10]:
trn_df.shape,val_df.shape

((1200, 81), (260, 81))

In [11]:
trn_y=trn_df[["y1","y2"]]
val_y=val_df[["y1","y2"]]

In [12]:
trn_x=trn_df.drop(["y1","y2"],axis=1)
val_x=val_df.drop(["y1","y2"],axis=1)

In [13]:
def rmse(y,pred):
    return np.sqrt(np.sum(np.abs(y-pred)))

只训练y1

In [14]:
param = {
    'num_leaves':48, 
    'objective':'regression',
    'max_depth':6,
    'learning_rate':.03,
    'max_bin':200,
    'lambda_l1':0.1,
    'lambda_l2':0.2,
    'verboses':10,
    'metrics':"rmse",
    'num_threads':4,
    'tree_learner': 'serial2'}

In [15]:
trn_data=lgb.Dataset(trn_x,label=trn_y["y1"],categorical_feature=cate_cols)
val_data=lgb.Dataset(val_x,label=val_y["y1"],categorical_feature=cate_cols)
clf=lgb.train(param,trn_data,verbose_eval=10,
                num_boost_round=200,valid_sets=[trn_data,val_data])
clf.set_num_labels(1)



[10]	training's rmse: 63630.2	valid_1's rmse: 56465.5
[20]	training's rmse: 51283.4	valid_1's rmse: 46026.9
[30]	training's rmse: 42493	valid_1's rmse: 38870.1
[40]	training's rmse: 35905.8	valid_1's rmse: 34372.3
[50]	training's rmse: 30888.4	valid_1's rmse: 31729.8
[60]	training's rmse: 27349.1	valid_1's rmse: 30066.1
[70]	training's rmse: 24871.5	valid_1's rmse: 29238.2
[80]	training's rmse: 23151.4	valid_1's rmse: 28894.8
[90]	training's rmse: 21930.2	valid_1's rmse: 28863.1
[100]	training's rmse: 20991.3	valid_1's rmse: 28699.6
[110]	training's rmse: 20212.4	valid_1's rmse: 28697.9
[120]	training's rmse: 19654.5	valid_1's rmse: 28774.3
[130]	training's rmse: 19130.8	valid_1's rmse: 28726.7
[140]	training's rmse: 18740.7	valid_1's rmse: 28723.6
[150]	training's rmse: 18220.1	valid_1's rmse: 28799.7
[160]	training's rmse: 17830.8	valid_1's rmse: 28811.9
[170]	training's rmse: 17498.7	valid_1's rmse: 28881.8
[180]	training's rmse: 17140.7	valid_1's rmse: 28926.5
[190]	training's rmse

In [16]:
rmse(clf.predict(val_x),val_y["y1"])

inner_predict 260


2042.9441797076763

只训练y2

In [17]:
param = {
    'num_leaves':48, 
    'objective':'binary',
    'max_depth':6,
    'learning_rate':.03,
    'max_bin':200,
    'lambda_l1':0.1,
    'lambda_l2':0.2,
    'verboses':10,
    'metrics':"precision",
    'num_threads':4,
    'tree_learner': 'serial2'}

In [18]:
trn_data=lgb.Dataset(trn_x,label=trn_y["y2"],categorical_feature=cate_cols)
val_data=lgb.Dataset(val_x,label=val_y["y2"],categorical_feature=cate_cols)
clf=lgb.train(param,trn_data,verbose_eval=10,
                num_boost_round=200,valid_sets=[trn_data,val_data])
clf.set_num_labels(1)

In [19]:
roc_auc_score(val_y["y2"],clf.predict(val_x))

inner_predict 260


0.9437337123904288

同时训练y1,y2

In [20]:
num_labels=2
param = {
    'num_leaves':48, 
    'max_depth':6,
    'learning_rate':.03,
    'max_bin':200,
    'lambda_l1':0.1,
    'lambda_l2':0.2,
    'verbose': 5,

    # multitask
    'objective':'custom',           
    'num_labels':num_labels, 
    'tree_learner': 'serial2',
    'num_threads':4}  

In [21]:
import copy
def sigmoid(x2):
    """
    sigmoid函数
    :param x2:
    :return:
    """
    x = copy.deepcopy(x2)
    if type(x) is int:
        x = 20.0 if x > 20.0 else x
        x = -100.0 if x < -100.0 else x
    else:
        # 避免下溢
        x[x > 20.0] = 20.0
        # 避免上溢
        x[x < -100.0] = -100.0
    return 1 / (1 + np.exp(-x))

y1使用mse做损失函数，y2使用交叉熵做损失函数，下面的times表示俩损失函数的权重比值

In [22]:
times=10
def object_func(preds, train_data, ep = 0):
    labels = train_data.get_label()
    labels2 = labels.reshape((num_labels,-1)).transpose()    
    preds2 = preds.reshape((num_labels,-1)).transpose()
    
    #regression
    grad_regress=preds2[:,0]-labels2[:,0]
    hess_regress=grad_regress*0+1
    #binary
    grad_binary=sigmoid(preds2[:,1])-labels2[:,1]
    hess_binary=sigmoid(preds2[:,1])*(1-sigmoid(preds2[:,1]))
    
    #split
    grad=grad_regress+times*grad_binary
    hess=hess_regress+times*hess_binary
    
    #value 
    grad2=np.concatenate([grad_regress,grad_binary])
    hess2=np.concatenate([hess_regress,hess_binary])
    
    return grad, hess, grad2, hess2   

def eval_func(preds, train_data):
    labels = train_data.get_label()
    labels2 = labels.reshape((num_labels,-1)).transpose()
    preds2 = preds.reshape((num_labels,-1)).transpose()
    #regression score
    regress_score = np.mean((labels2[:,0]-preds2[:,0]) ** 2)**0.5
    #binary score
    binary_score=roc_auc_score(labels2[:,1],sigmoid(preds2[:,1]))
    total_score=regress_score+times*binary_score
    return 'score', total_score, False

In [23]:
trn_data=lgb.Dataset(trn_x,label=trn_y.values,categorical_feature=cate_cols)
val_data=lgb.Dataset(val_x,label=val_y.values,categorical_feature=cate_cols)
clf=lgb.train(param,trn_data,verbose_eval=10,fobj=object_func,feval=eval_func,
                num_boost_round=200,valid_sets=[trn_data,val_data])
clf.set_num_labels(num_labels)

[10]	training's score: 148131	valid_1's score: 143248




[20]	training's score: 111177	valid_1's score: 106804
[30]	training's score: 84224.4	valid_1's score: 80491
[40]	training's score: 64618.9	valid_1's score: 61705.5
[50]	training's score: 50545.1	valid_1's score: 48339.6
[60]	training's score: 40474.6	valid_1's score: 39381.7
[70]	training's score: 33415	valid_1's score: 33870.7
[80]	training's score: 28494	valid_1's score: 30526.1
[90]	training's score: 25193.9	valid_1's score: 28606
[100]	training's score: 23045.7	valid_1's score: 27667.7
[110]	training's score: 21557.3	valid_1's score: 27222.8
[120]	training's score: 20439.3	valid_1's score: 26904.9
[130]	training's score: 19641.6	valid_1's score: 26736.5
[140]	training's score: 19072.9	valid_1's score: 26695.7
[150]	training's score: 18536.1	valid_1's score: 26637.3
[160]	training's score: 18027.8	valid_1's score: 26590.2
[170]	training's score: 17660.1	valid_1's score: 26584.7
[180]	training's score: 17230.3	valid_1's score: 26564.3
[190]	training's score: 16896.8	valid_1's score: 

In [24]:
rmse(clf.predict(val_x)[:,0],val_y["y1"])

inner_predict 520


1984.9600648325397

In [25]:
roc_auc_score(val_y["y2"],sigmoid(clf.predict(val_x)[:,1]))

inner_predict 520


0.9333688699360342

可以发现，引入y2可以起到优化y1的作用