In [3]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
import warnings
import time
import sys
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

In [5]:
train = pd.read_csv('datasets/jinnan/jinnan_round1_train_20181227.csv', encoding = 'gb18030')
test  = pd.read_csv('datasets/jinnan/jinnan_round1_testA_20181227.csv', encoding = 'gb18030')

In [6]:
# 删除类别唯一的特征
for df in [train, test]:
    df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)

In [7]:
# 删除缺失率超过90%的列
good_cols = list(train.columns)
for col in train.columns:
    rate = train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.9:
        good_cols.remove(col)

# 删除异常值
train = train[train['收率']>0.87]
        
train = train[good_cols]
good_cols.remove('收率')
test  = test[good_cols]

In [8]:
# 合并数据集
target = train['收率']
del train['收率']
data = pd.concat([train,test],axis=0,ignore_index=True)
data = data.fillna(-1)

In [9]:
def timeTranSecond(t):
    try:
        t,m,s=t.split(":")
    except:
        if t=='1900/1/9 7:00':
            return 7*3600/3600
        elif t=='1900/1/1 2:30':
            return (2*3600+30*60)/3600
        elif t==-1:
            return -1
        else:
            return 0
    
    try:
        tm = (int(t)*3600+int(m)*60+int(s))/3600
    except:
        return (30*60)/3600
    
    return tm
for f in ['A5','A7','A9','A11','A14','A16','A24','A26','B5','B7']:
    data[f] = data[f].apply(timeTranSecond)

def getDuration(se):
    try:
        sh,sm,eh,em=re.findall(r"\d+\.?\d*",se)
    except:
        if se == -1:
            return -1 
        
    try:
        if int(sh)>int(eh):
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600 + 24
        else:
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600
    except:
        if se=='19:-20:05':
            return 1
        elif se=='15:00-1600':
            return 1
    
    return tm
for f in ['A20','A28','B4','B9','B10','B11']:
    data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)

In [10]:
cate_columns = [f for f in data.columns if f != '样本id']

In [11]:
#label encoder
for f in cate_columns:
    data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
train = data[:train.shape[0]]
test  = data[train.shape[0]:]

构造每个特征对于异常值的概率

In [12]:
'''
train['target'] = target
train['outliers'] = 0
train.loc[train['target'] <= 0.87, 'outliers'] = 1
train['outliers'].value_counts()
for f in cate_columns:
    colname = f+'_outliers_mean'
    order_label = train.groupby([f])['outliers'].mean()
    for df in [train, test]:
        df[colname] = df[f].map(order_label)
'''

"\ntrain['target'] = target\ntrain['outliers'] = 0\ntrain.loc[train['target'] <= 0.87, 'outliers'] = 1\ntrain['outliers'].value_counts()\nfor f in cate_columns:\n    colname = f+'_outliers_mean'\n    order_label = train.groupby([f])['outliers'].mean()\n    for df in [train, test]:\n        df[colname] = df[f].map(order_label)\n"

添加新特征，将收率进行分箱，然后构造每个特征中的类别对应不同收率的均值

In [13]:
train['target'] = target
train['intTarget'] = pd.cut(train['target'], 5, labels=False)
train = pd.get_dummies(train, columns=['intTarget'])
li = ['intTarget_0.0','intTarget_1.0','intTarget_2.0','intTarget_3.0','intTarget_4.0']
mean_features = []

In [14]:
train.head()

Unnamed: 0,样本id,A5,A6,A7,A8,A9,A10,A11,A12,A14,A15,A16,A17,A19,A20,A21,A22,A24,A25,A26,A27,A28,B1,B4,B5,B6,B7,B8,B9,B10,B11,B12,B14,target,intTarget_0.0,intTarget_1.0,intTarget_2.0,intTarget_3.0,intTarget_4.0
0,sample_1528,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.879,1,0,0,0,0
1,sample_1698,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,0,0,0,1,1,0,0.902,0,1,0,0,0
2,sample_639,1,1,0,0,1,2,1,1,1,1,1,1,1,0,0,0,1,2,1,1,1,1,0,1,1,2,0,0,0,1,1,0,0.936,0,0,1,0,0
3,sample_483,2,0,0,0,2,0,2,0,2,0,2,0,1,0,0,1,2,3,2,2,1,2,0,2,0,3,0,0,0,0,0,0,0.902,0,1,0,0,0
4,sample_617,3,1,0,0,3,1,3,1,3,1,3,1,1,1,0,0,3,1,3,1,1,1,0,3,1,4,0,0,0,1,1,1,0.983,0,0,0,0,1


In [15]:
for f1 in cate_columns:
    for f2 in li:
        col_name = f1+"_"+f2+'_mean'
        mean_features.append(col_name)
        order_label = train.groupby([f1])[f2].mean()
        for df in [train, test]:
            df[col_name] = df[f].map(order_label)

train.drop(li, axis=1, inplace=True)

In [16]:
train.drop(['样本id','target'], axis=1, inplace=True)
test = test[train.columns]
X_train = train.values
y_train = target.values
X_test = test.values

In [17]:
param = {'num_leaves': 120,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target)))

fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000213649	valid_1's l2: 0.000238709
[400]	training's l2: 0.000165397	valid_1's l2: 0.000194135
[600]	training's l2: 0.00014877	valid_1's l2: 0.000186247
[800]	training's l2: 0.000139663	valid_1's l2: 0.000183224
[1000]	training's l2: 0.000134053	valid_1's l2: 0.00018144
[1200]	training's l2: 0.000129792	valid_1's l2: 0.000180621
[1400]	training's l2: 0.000126568	valid_1's l2: 0.000180056
[1600]	training's l2: 0.00012393	valid_1's l2: 0.000179733
[1800]	training's l2: 0.000121882	valid_1's l2: 0.000179494
[2000]	training's l2: 0.000119955	valid_1's l2: 0.000179393
Early stopping, best iteration is:
[2033]	training's l2: 0.000119666	valid_1's l2: 0.000179346
fold n°2
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000212428	valid_1's l2: 0.000234753
[400]	training's l2: 0.000163155	valid_1's l2: 0.000208565
[600]	training's l2: 0.000146908	valid_1's l2: 0.0

In [18]:
##### xgb
xgb_params = {'eta': 0.005, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 
          'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True, 'nthread': 4}

folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])

    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
    
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))

fold n°1
[0]	train-rmse:0.422932	valid_data-rmse:0.423818
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.
[100]	train-rmse:0.256987	valid_data-rmse:0.257703
[200]	train-rmse:0.156566	valid_data-rmse:0.157217
[300]	train-rmse:0.095961	valid_data-rmse:0.096496
[400]	train-rmse:0.059487	valid_data-rmse:0.059991
[500]	train-rmse:0.037744	valid_data-rmse:0.038446
[600]	train-rmse:0.024976	valid_data-rmse:0.026087
[700]	train-rmse:0.017557	valid_data-rmse:0.019294
[800]	train-rmse:0.013337	valid_data-rmse:0.015854
[900]	train-rmse:0.010958	valid_data-rmse:0.014198
[1000]	train-rmse:0.009588	valid_data-rmse:0.013447
[1100]	train-rmse:0.00878	valid_data-rmse:0.0131
[1200]	train-rmse:0.008221	valid_data-rmse:0.012923
[1300]	train-rmse:0.007836	valid_data-rmse:0.012845
[1400]	train-rmse:0.007532	valid_data-rmse:0.012809
[1500]	train-rmse:0.007282	valid_data-rmse:0.012808
[1600]	train-rmse:

In [19]:
# 将lgb和xgb的结果进行stacking
train_stack = np.vstack([oof_lgb,oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
    
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)
    
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10
    
mean_squared_error(target.values, oof_stack)

fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9


0.0001727489889676719

In [21]:
sub_df = pd.read_csv('datasets/jinnan/jinnan_round1_submit_20181227.csv', header=None)
sub_df[1] = predictions
sub_df[1] = sub_df[1].apply(lambda x:round(x, 3))
sub_df.to_csv("datasets/jinnan/results.csv", index=False, header=None)