In [94]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
import warnings
import time
import sys
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

In [95]:
train = pd.read_csv("./jinnan_round1_train_20181227.csv", encoding = 'gb18030')
test = pd.read_csv("./jinnan_round1_testA_20181227.csv", encoding = 'gb18030')

In [96]:
# 删除类别唯一的特征
for df in [train, test]:
    df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)

In [97]:
# 删除缺失率超过90%的列
good_cols = list(train.columns)
for col in train.columns:
    rate = train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.95:
        good_cols.remove(col)
        print(col,rate)

# 删除异常值
train = train[train['score']>0.87]
        
train = train[good_cols]
good_cols.remove('score')
test  = test[good_cols]

(u'A1', 0.9863896848137536)
(u'A2', 0.9699140401146131)
(u'A3', 0.9570200573065902)
(u'A4', 0.9570200573065902)
(u'B2', 0.9842406876790831)


In [98]:
# 合并数据集
target = train['score']
del train['score']
data = pd.concat([train,test],axis=0,ignore_index=True)
data = data.fillna(-1)

In [99]:
def timeTranSecond(t):
    try:
        t,m,s=t.split(":")
    except:
        if t=='1900/1/9 7:00':
            return 7*3600/3600
        elif t=='1900/1/1 2:30':
            return (2*3600+30*60)/3600
        elif t==-1:
            return -1
        else:
            return 0
    
    try:
        tm = (int(t)*3600+int(m)*60+int(s))/3600
    except:
        return (30*60)/3600
    
    return tm
for f in ['A5','A7','A9','A11','A14','A16','A24','A26','B5','B7']:
    try:
        data[f] = data[f].apply(timeTranSecond)
    except:
        print(f,'应该在前面被删除了！')

def getDuration(se):
    try:
        sh,sm,eh,em=re.findall(r"\d+\.?\d*",se)
    except:
        if se == -1:
            return -1 
        
    try:
        if int(sh)>int(eh):
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600 + 24
        else:
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600
    except:
        if se=='19:-20:05':
            return 1
        elif se=='15:00-1600':
            return 1
    
    return tm
for f in ['A20','A28','B4','B9','B10','B11']:
    data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)
    
    
def getID(s):
    t,m=s.split("_")
    return m

data['ID'] = data.apply(lambda df: getID(df['sample id']), axis=1).astype(int)

In [100]:
cate_columns = [f for f in data.columns if f != 'sample id']
print(cate_columns)

[u'A5', u'A6', u'A7', u'A8', u'A9', u'A10', u'A11', u'A12', u'A14', u'A15', u'A16', u'A17', u'A19', u'A20', u'A21', u'A22', u'A24', u'A25', u'A26', u'A27', u'A28', u'B1', u'B4', u'B5', u'B6', u'B7', u'B8', u'B9', u'B10', u'B11', u'B12', u'B14', 'ID']


In [101]:
#label encoder
for f in cate_columns:
    data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
train = data[:train.shape[0]]
test  = data[train.shape[0]:]

In [102]:
train['target'] = target
train['intTarget'] = pd.cut(train['target'], 5, labels=False)
train = pd.get_dummies(train, columns=['intTarget'])
li = ['intTarget_0.0','intTarget_1.0','intTarget_2.0','intTarget_3.0','intTarget_4.0']
mean_features = []

for f1 in cate_columns:
    rate = train[f1].value_counts(normalize=True, dropna=False).values[0]
    if rate < 0.50:
        for f2 in li:
            col_name = f1+"_"+f2+'_mean'
            mean_features.append(col_name)
            order_label = train.groupby([f1])[f2].mean()
            for df in [train, test]:
                df[col_name] = df[f].map(order_label)

train.drop(li, axis=1, inplace=True)

In [103]:
train.drop(['sample id', 'target'], axis=1, inplace=True)
test = test[train.columns]
X_train = train.values
y_train = target.values
X_test = test.values
print(X_train.shape)
print(X_test.shape)

(1381, 118)
(150, 118)


In [104]:
train.head()

Unnamed: 0,A5,A6,A7,A8,A9,A10,A11,A12,A14,A15,A16,A17,A19,A20,A21,A22,A24,A25,A26,A27,A28,B1,B4,B5,B6,B7,B8,B9,B10,B11,B12,B14,ID,A5_intTarget_0.0_mean,A5_intTarget_1.0_mean,A5_intTarget_2.0_mean,A5_intTarget_3.0_mean,A5_intTarget_4.0_mean,A6_intTarget_0.0_mean,A6_intTarget_1.0_mean,A6_intTarget_2.0_mean,A6_intTarget_3.0_mean,A6_intTarget_4.0_mean,A9_intTarget_0.0_mean,A9_intTarget_1.0_mean,A9_intTarget_2.0_mean,A9_intTarget_3.0_mean,A9_intTarget_4.0_mean,A10_intTarget_0.0_mean,A10_intTarget_1.0_mean,A10_intTarget_2.0_mean,A10_intTarget_3.0_mean,A10_intTarget_4.0_mean,A11_intTarget_0.0_mean,A11_intTarget_1.0_mean,A11_intTarget_2.0_mean,A11_intTarget_3.0_mean,A11_intTarget_4.0_mean,A12_intTarget_0.0_mean,A12_intTarget_1.0_mean,A12_intTarget_2.0_mean,A12_intTarget_3.0_mean,A12_intTarget_4.0_mean,A14_intTarget_0.0_mean,A14_intTarget_1.0_mean,A14_intTarget_2.0_mean,A14_intTarget_3.0_mean,A14_intTarget_4.0_mean,A16_intTarget_0.0_mean,A16_intTarget_1.0_mean,A16_intTarget_2.0_mean,A16_intTarget_3.0_mean,A16_intTarget_4.0_mean,A17_intTarget_0.0_mean,A17_intTarget_1.0_mean,A17_intTarget_2.0_mean,A17_intTarget_3.0_mean,A17_intTarget_4.0_mean,A24_intTarget_0.0_mean,A24_intTarget_1.0_mean,A24_intTarget_2.0_mean,A24_intTarget_3.0_mean,A24_intTarget_4.0_mean,A25_intTarget_0.0_mean,A25_intTarget_1.0_mean,A25_intTarget_2.0_mean,A25_intTarget_3.0_mean,A25_intTarget_4.0_mean,A26_intTarget_0.0_mean,A26_intTarget_1.0_mean,A26_intTarget_2.0_mean,A26_intTarget_3.0_mean,A26_intTarget_4.0_mean,A27_intTarget_0.0_mean,A27_intTarget_1.0_mean,A27_intTarget_2.0_mean,A27_intTarget_3.0_mean,A27_intTarget_4.0_mean,B5_intTarget_0.0_mean,B5_intTarget_1.0_mean,B5_intTarget_2.0_mean,B5_intTarget_3.0_mean,B5_intTarget_4.0_mean,B6_intTarget_0.0_mean,B6_intTarget_1.0_mean,B6_intTarget_2.0_mean,B6_intTarget_3.0_mean,B6_intTarget_4.0_mean,B7_intTarget_0.0_mean,B7_intTarget_1.0_mean,B7_intTarget_2.0_mean,B7_intTarget_3.0_mean,B7_intTarget_4.0_mean,ID_intTarget_0.0_mean,ID_intTarget_1.0_mean,ID_intTarget_2.0_mean,ID_intTarget_3.0_mean,ID_intTarget_4.0_mean
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208333,0.208333,0.458333,0.083333,0.041667,0.157895,0.210526,0.438596,0.105263,0.070175,0.121951,0.341463,0.317073,0.121951,0.04878,0.177743,0.272025,0.387944,0.092736,0.05255,0.121951,0.341463,0.317073,0.121951,0.04878,0.1775,0.265,0.3925,0.095,0.0575,0.121951,0.341463,0.317073,0.121951,0.04878,0.121951,0.341463,0.317073,0.121951,0.04878,0.195402,0.258621,0.425287,0.074713,0.028736,0.176471,0.352941,0.411765,0.058824,0.0,0.130435,0.26087,0.434783,0.0,0.130435,0.047619,0.380952,0.47619,0.095238,0.0,0.086957,0.304348,0.413043,0.043478,0.130435,0.142857,0.190476,0.47619,0.190476,0.0,0.180272,0.238095,0.394558,0.102041,0.071429,0.173913,0.304348,0.434783,0.043478,0.043478,1,0,0,0,0
1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,0,0,0,1,1,0,1,0.130597,0.309701,0.373134,0.093284,0.078358,0.160243,0.292089,0.383367,0.093306,0.064909,0.130769,0.292308,0.407692,0.084615,0.076923,0.107383,0.325503,0.402685,0.097315,0.067114,0.130769,0.296154,0.403846,0.084615,0.076923,0.141593,0.29646,0.40118,0.082596,0.067847,0.130268,0.295019,0.40613,0.084291,0.076628,0.130268,0.295019,0.40613,0.084291,0.076628,0.160294,0.302941,0.388235,0.080882,0.060294,0.128405,0.311284,0.389105,0.085603,0.077821,0.159851,0.284387,0.390335,0.100372,0.055762,0.129771,0.305344,0.396947,0.080153,0.076336,0.151515,0.271132,0.408293,0.095694,0.066986,0.121622,0.310811,0.387387,0.09009,0.081081,0.154088,0.273585,0.410377,0.091195,0.064465,0.173913,0.217391,0.347826,0.130435,0.130435,0,1,0,0,0
2,1,1,0,0,1,2,1,1,1,1,1,1,1,0,0,0,1,2,1,1,1,1,0,1,1,2,0,0,0,1,1,0,2,0.27907,0.186047,0.44186,0.069767,0.023256,0.173913,0.26087,0.434783,0.086957,0.043478,0.214286,0.214286,0.357143,0.166667,0.047619,0.173913,0.258454,0.415459,0.07971,0.062802,0.219512,0.219512,0.341463,0.170732,0.04878,0.211382,0.252033,0.390244,0.097561,0.03252,0.214286,0.214286,0.357143,0.166667,0.047619,0.214286,0.214286,0.357143,0.166667,0.047619,0.121739,0.226087,0.452174,0.121739,0.069565,0.19697,0.242424,0.439394,0.090909,0.015152,0.189873,0.291139,0.43038,0.037975,0.050633,0.266667,0.222222,0.355556,0.133333,0.022222,0.167273,0.294545,0.385455,0.087273,0.050909,0.152091,0.304183,0.418251,0.076046,0.041825,0.166667,0.5,0.333333,0.0,0.0,0.142857,0.308756,0.368664,0.092166,0.078341,0,0,1,0,0
3,2,0,0,0,2,0,2,0,2,0,2,0,1,0,0,1,2,3,2,2,1,2,0,2,0,3,0,0,0,0,0,0,3,0.182857,0.285714,0.348571,0.102857,0.068571,0.151832,0.293194,0.356021,0.104712,0.062827,0.177215,0.291139,0.341772,0.107595,0.075949,0.136364,0.318182,0.363636,0.136364,0.045455,0.175,0.2875,0.35,0.10625,0.075,0.142857,0.228571,0.471429,0.107143,0.05,0.173913,0.285714,0.354037,0.10559,0.074534,0.173913,0.285714,0.354037,0.10559,0.074534,0.227723,0.277228,0.356436,0.069307,0.049505,0.178344,0.267516,0.363057,0.10828,0.076433,0.168605,0.275194,0.393411,0.096899,0.050388,0.176101,0.264151,0.371069,0.113208,0.075472,0.165049,0.237864,0.441748,0.101942,0.038835,0.174863,0.278689,0.382514,0.092896,0.071038,0.128205,0.307692,0.307692,0.076923,0.153846,0.272727,0.181818,0.363636,0.181818,0.0,0,1,0,0,0
4,3,1,0,0,3,1,3,1,3,1,3,1,1,1,0,0,3,1,3,1,1,1,0,3,1,4,0,0,0,1,1,1,4,0.193548,0.354839,0.322581,0.129032,0.0,0.3125,0.125,0.375,0.0625,0.0625,0.09375,0.3125,0.4375,0.09375,0.0625,,,,,,0.09375,0.3125,0.4375,0.09375,0.0625,0.117647,0.470588,0.294118,0.117647,0.0,0.09375,0.3125,0.4375,0.09375,0.0625,0.09375,0.3125,0.4375,0.09375,0.0625,0.11215,0.299065,0.336449,0.17757,0.056075,0.134021,0.302405,0.42268,0.079038,0.054983,0.145161,0.290323,0.38172,0.086022,0.091398,0.236842,0.263158,0.368421,0.078947,0.052632,0.166667,0.296296,0.37037,0.12963,0.018519,0.107143,0.25,0.464286,0.107143,0.071429,0.153846,0.307692,0.538462,0.0,0.0,0.16568,0.272189,0.384615,0.100592,0.076923,0,0,0,0,1


In [105]:
def myFeval(preds, xgbtrain):
    label = xgbtrain.get_label()
    score = mean_squared_error(label,preds)*0.5
    return 'myFeval',score

In [106]:
param = {'num_leaves': 120,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target)))

fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.00025499	valid_1's l2: 0.000270749
[400]	training's l2: 0.000197672	valid_1's l2: 0.000225229
[600]	training's l2: 0.000170057	valid_1's l2: 0.000213595
[800]	training's l2: 0.000153779	valid_1's l2: 0.000209849
[1000]	training's l2: 0.000142993	valid_1's l2: 0.000208749
[1200]	training's l2: 0.000134856	valid_1's l2: 0.000207857
[1400]	training's l2: 0.000128503	valid_1's l2: 0.000207811
[1600]	training's l2: 0.000123659	valid_1's l2: 0.000207162
[1800]	training's l2: 0.000119661	valid_1's l2: 0.00020693
Early stopping, best iteration is:
[1715]	training's l2: 0.000121303	valid_1's l2: 0.000206845
fold n°2
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000244175	valid_1's l2: 0.000271907
[400]	training's l2: 0.000189001	valid_1's l2: 0.000253665
[600]	training's l2: 0.000161586	valid_1's l2: 0.00024844
[800]	training's l2: 0.000145991	valid_1's l2: 0.00

In [107]:
##### xgb
xgb_params = {'eta': 0.005, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 
          'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True, 'nthread': 4}

folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])

    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000, feval = myFeval, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
    
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))

fold n°1
[0]	train-rmse:0.422934	valid_data-rmse:0.423824	train-myFeval:0.089436	valid_data-myFeval:0.089813
Multiple eval metrics have been passed: 'valid_data-myFeval' will be used for early stopping.

Will train until valid_data-myFeval hasn't improved in 200 rounds.
[100]	train-rmse:0.257099	valid_data-rmse:0.258054	train-myFeval:0.03305	valid_data-myFeval:0.033296
[200]	train-rmse:0.156783	valid_data-rmse:0.157707	train-myFeval:0.012291	valid_data-myFeval:0.012436
[300]	train-rmse:0.096245	valid_data-rmse:0.097064	train-myFeval:0.004632	valid_data-myFeval:0.004711
[400]	train-rmse:0.059876	valid_data-rmse:0.060683	train-myFeval:0.001793	valid_data-myFeval:0.001841
[500]	train-rmse:0.038168	valid_data-rmse:0.039171	train-myFeval:0.000728	valid_data-myFeval:0.000767
[600]	train-rmse:0.02526	valid_data-rmse:0.026852	train-myFeval:0.000319	valid_data-myFeval:0.000361
[700]	train-rmse:0.017518	valid_data-rmse:0.02007	train-myFeval:0.000153	valid_data-myFeval:0.000201
[800]	train-rmse:0

[900]	train-rmse:0.010154	valid_data-rmse:0.01514	train-myFeval:5.2e-05	valid_data-myFeval:0.000115
[1000]	train-rmse:0.008497	valid_data-rmse:0.014324	train-myFeval:3.6e-05	valid_data-myFeval:0.000103
[1100]	train-rmse:0.007417	valid_data-rmse:0.013922	train-myFeval:2.8e-05	valid_data-myFeval:9.7e-05
[1200]	train-rmse:0.006625	valid_data-rmse:0.013765	train-myFeval:2.2e-05	valid_data-myFeval:9.5e-05
[1300]	train-rmse:0.005995	valid_data-rmse:0.01368	train-myFeval:1.8e-05	valid_data-myFeval:9.4e-05
[1400]	train-rmse:0.005446	valid_data-rmse:0.013657	train-myFeval:1.5e-05	valid_data-myFeval:9.3e-05
[1500]	train-rmse:0.004971	valid_data-rmse:0.013653	train-myFeval:1.2e-05	valid_data-myFeval:9.3e-05
Stopping. Best iteration:
[1305]	train-rmse:0.005961	valid_data-rmse:0.013673	train-myFeval:1.8e-05	valid_data-myFeval:9.3e-05

CV score: 0.00020256


In [108]:
# 将lgb和xgb的结果进行stacking
train_stack = np.vstack([oof_lgb,oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
    
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)
    
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10
    
mean_squared_error(target.values, oof_stack)

fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9


0.0001989710003463426

In [109]:
sub_df = pd.read_csv('submission.csv', header=None)
sub_df[1] = predictions
sub_df[1] = sub_df[1].apply(lambda x:round(x, 3))
sub_df.to_csv("ly_submission.csv", index=False, header=None)