In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import HuberRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

In [8]:
# Read the first-round training and testing tables from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('first_round_training_data.csv', 'first_round_testing_data.csv')
)

def quality_encoder(x):
    """Translate a quality label string into its integer class code.

    'Excellent' -> 0, 'Good' -> 1, 'Pass' -> 2, 'Fail' -> 3.

    Raises:
        KeyError: if ``x`` is not one of the four known labels.
    """
    codes = dict(zip(('Excellent', 'Good', 'Pass', 'Fail'), range(4)))
    return codes[x]
# Integer class code for every training row, derived from its Quality_label string.
train['label'] = train['Quality_label'].map(quality_encoder)

In [9]:
# Offline-validation groups, contiguous construction: 50 shifted variants that
# cut the row index into consecutive runs of 50 rows, numbered modulo 120.
# NOTE(review): relies on train.index being the default 0..n-1 integer index.
for group in range(50):
    train['group_%s'%group] = (train.index+group)//50%120

# Offline-validation groups, random construction: for each seed, a shuffled
# 120-fold split; every row receives the number of the fold that holds it out.
for group in range(50,500):
    name = 'group_%s'%group
    # Fill a standalone array and assign it by column name. Writing through
    # train.iloc[..., -1] only reaches the right column when `name` has just
    # been appended; when the column already exists (the cell is re-run) the
    # write would land in whichever column happens to be last instead.
    fold_ids = np.zeros(len(train), dtype=np.int64)
    kfold = KFold(n_splits=120, shuffle=True, random_state=group)
    for i, (train_index, valid_index) in enumerate(kfold.split(train)):
        fold_ids[valid_index] = i
    train[name] = fold_ids


# One-hot indicator columns (0/1) for each quality class, in label-code order.
for code, quality in enumerate(['Excellent', 'Good', 'Pass', 'Fail']):
    train['label_%s'%quality] = 1*(train['label'] == code)

In [10]:
# K-fold cross-validation of a single CatBoost multiclass model.

features = ['Parameter5','Parameter6','Parameter7','Parameter8','Parameter9','Parameter10']
prob_cols = ['prob_Excellent','prob_Good','prob_Pass','prob_Fail']
label_cols = ['label_Excellent','label_Good','label_Pass','label_Fail']

kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(train)
result = []
for train_index,valid_index in split:

    X_train=train.iloc[train_index,:].copy()
    X_valid=train.iloc[valid_index,:].copy()

    model = CatBoostClassifier(iterations=100,depth=7,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
    model.fit(X_train.loc[:,features].values,X_train.label.values)

    # Predict on plain arrays, the same representation the model was fitted on.
    valid_values = X_valid.loc[:,features].values

    # One probability column per class.
    # NOTE(review): assumes predict_proba columns follow label codes 0..3,
    # i.e. every class occurs in each training fold -- confirm.
    proba = model.predict_proba(valid_values)
    for col, values in zip(prob_cols, proba.T):
        X_valid[col] = values
    # Flatten so a column-shaped prediction array is stored as a plain 1-D column.
    X_valid['label_pred'] = np.ravel(model.predict(valid_values))

    result.append(X_valid)
result=pd.concat(result)

# MAE evaluation: within each of the 500 groupings, average the predicted
# probabilities and the one-hot labels per group, take the mean absolute
# difference between them, and convert it to a score 1/(1+10*MAE).
mean = []
for group in range(500):
    # Columns are selected with a list: tuple-style selection
    # (groupby(...)['a','b']) is rejected by current pandas.
    result_mae = result.groupby('group_%s'%group,as_index=False)[prob_cols+label_cols].mean()
    a = np.abs(result_mae.loc[:,prob_cols].values
               - result_mae.loc[:,label_cols].values).mean()
    mean.append(1/(1+10*a))
np.mean(mean)

0:	learn: 1.3367231	total: 37.9ms	remaining: 3.75s
99:	learn: 1.0060527	total: 3.3s	remaining: 0us
0:	learn: 1.3410821	total: 32.3ms	remaining: 3.19s
99:	learn: 1.0321815	total: 2.95s	remaining: 0us
0:	learn: 1.3403853	total: 29.8ms	remaining: 2.95s
99:	learn: 1.0165788	total: 2.85s	remaining: 0us
0:	learn: 1.3465247	total: 30.7ms	remaining: 3.04s
99:	learn: 1.0324442	total: 2.88s	remaining: 0us
0:	learn: 1.3430928	total: 30.8ms	remaining: 3.05s
99:	learn: 1.0140480	total: 2.88s	remaining: 0us


In [12]:
# Blending: average the out-of-fold class probabilities of two CatBoost
# multiclass models that differ only in tree depth.
features = ['Parameter5','Parameter6','Parameter7','Parameter8','Parameter9','Parameter10']
quality_names = ['Excellent','Good','Pass','Fail']
prob_cols = ['prob_%s'%quality for quality in quality_names]
label_cols = ['label_%s'%quality for quality in quality_names]
# (output column prefix, tree depth) for each base model, in training order.
base_models = [('prob1', 7), ('prob2', 10)]

kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(train)
result = []
for train_index,valid_index in split:

    X_train=train.iloc[train_index,:].copy()
    X_valid=train.iloc[valid_index,:].copy()

    # Plain arrays for both fit and predict, so the model sees one input format.
    train_values = X_train.loc[:,features].values
    valid_values = X_valid.loc[:,features].values

    for prefix, depth in base_models:
        model = CatBoostClassifier(iterations=100,depth=depth,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
        model.fit(train_values,X_train.label.values)
        # NOTE(review): assumes predict_proba columns follow label codes 0..3 -- confirm.
        proba = model.predict_proba(valid_values)
        for quality, values in zip(quality_names, proba.T):
            X_valid['%s_%s'%(prefix, quality)] = values

    result.append(X_valid)
result=pd.concat(result)

# Equal-weight blend of the two models' probabilities, class by class.
for quality in quality_names:
    result['prob_%s'%quality] = (0.5*result['prob1_%s'%quality].values
                                 + 0.5*result['prob2_%s'%quality].values)

# MAE evaluation: within each of the 500 groupings, average the blended
# probabilities and the one-hot labels per group, take the mean absolute
# difference between them, and convert it to a score 1/(1+10*MAE).
mean = []
for group in range(500):
    # Columns are selected with a list: tuple-style selection
    # (groupby(...)['a','b']) is rejected by current pandas.
    result_mae = result.groupby('group_%s'%group,as_index=False)[prob_cols+label_cols].mean()
    a = np.abs(result_mae.loc[:,prob_cols].values
               - result_mae.loc[:,label_cols].values).mean()
    mean.append(1/(1+10*a))
np.mean(mean)

0:	learn: 1.3367231	total: 34.5ms	remaining: 3.41s
99:	learn: 1.0060527	total: 3.22s	remaining: 0us
0:	learn: 1.3358802	total: 75ms	remaining: 7.43s
99:	learn: 0.9559350	total: 6.94s	remaining: 0us
0:	learn: 1.3410821	total: 28.6ms	remaining: 2.83s
99:	learn: 1.0321815	total: 2.88s	remaining: 0us
0:	learn: 1.3408880	total: 48.4ms	remaining: 4.79s
99:	learn: 0.9862171	total: 6.91s	remaining: 0us
0:	learn: 1.3403853	total: 32.4ms	remaining: 3.2s
99:	learn: 1.0165788	total: 3.09s	remaining: 0us
0:	learn: 1.3379317	total: 138ms	remaining: 13.6s
99:	learn: 0.9619551	total: 8.6s	remaining: 0us
0:	learn: 1.3465247	total: 31.9ms	remaining: 3.15s
99:	learn: 1.0324442	total: 2.87s	remaining: 0us
0:	learn: 1.3458274	total: 70.9ms	remaining: 7.01s
99:	learn: 0.9843060	total: 6.82s	remaining: 0us
0:	learn: 1.3430928	total: 34.2ms	remaining: 3.38s
99:	learn: 1.0140480	total: 2.85s	remaining: 0us
0:	learn: 1.3419605	total: 72.7ms	remaining: 7.2s
99:	learn: 0.9668975	total: 6.99s	remaining: 0us


0.6947700301738983

In [13]:
# Stacking: out-of-fold probabilities from two first-level CatBoost models
# become the input features of a second-level CatBoost model.
features = ['Parameter5','Parameter6','Parameter7','Parameter8','Parameter9','Parameter10']
quality_names = ['Excellent','Good','Pass','Fail']
# (output column prefix, tree depth) of each first-level model, in training order.
first_level = [('prob1', 7), ('prob2', 10)]

kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(train)
result = []
for train_index,valid_index in split:

    X_train=train.iloc[train_index,:].copy()
    X_valid=train.iloc[valid_index,:].copy()

    for prefix, depth in first_level:
        model = CatBoostClassifier(iterations=100,depth=depth,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
        model.fit(X_train.loc[:,features].values,X_train.label.values)
        # Store the held-out class probabilities under <prefix>_<quality>.
        proba = model.predict_proba(X_valid.loc[:,features])
        for quality, column_values in zip(quality_names, proba.T):
            X_valid['%s_%s'%(prefix, quality)] = column_values

    result.append(X_valid)
result=pd.concat(result)

# Second-level cross-validation on the eight first-level probability columns.
features = ['%s_%s'%(prefix, quality) for prefix, _ in first_level for quality in quality_names]
kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(result)
result2 = []
for train_index,valid_index in split:

    X_train=result.iloc[train_index,:].copy()
    X_valid=result.iloc[valid_index,:].copy()

    # Second-level model: a shallower CatBoost classifier.
    model = CatBoostClassifier(iterations=100,depth=3,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
    model.fit(X_train.loc[:,features].values,X_train.label.values)

    proba = model.predict_proba(X_valid.loc[:,features])
    for quality, column_values in zip(quality_names, proba.T):
        X_valid['prob_%s'%quality] = column_values

    result2.append(X_valid)
result2=pd.concat(result2)

0:	learn: 1.3367231	total: 30.3ms	remaining: 3s
99:	learn: 1.0060527	total: 3.21s	remaining: 0us
0:	learn: 1.3358802	total: 78.7ms	remaining: 7.79s
99:	learn: 0.9559350	total: 7.03s	remaining: 0us
0:	learn: 1.3410821	total: 28ms	remaining: 2.77s
99:	learn: 1.0321815	total: 2.87s	remaining: 0us
0:	learn: 1.3408880	total: 47.8ms	remaining: 4.73s
99:	learn: 0.9862171	total: 6.89s	remaining: 0us
0:	learn: 1.3403853	total: 31.2ms	remaining: 3.09s
99:	learn: 1.0165788	total: 2.87s	remaining: 0us
0:	learn: 1.3379317	total: 68.9ms	remaining: 6.82s
99:	learn: 0.9619551	total: 6.98s	remaining: 0us
0:	learn: 1.3465247	total: 32.3ms	remaining: 3.2s
99:	learn: 1.0324442	total: 2.96s	remaining: 0us
0:	learn: 1.3458274	total: 70ms	remaining: 6.93s
99:	learn: 0.9843060	total: 6.88s	remaining: 0us
0:	learn: 1.3430928	total: 29.1ms	remaining: 2.88s
99:	learn: 1.0140480	total: 2.88s	remaining: 0us
0:	learn: 1.3419605	total: 69.4ms	remaining: 6.87s
99:	learn: 0.9668975	total: 7.84s	remaining: 0us
0:	learn

In [15]:
# MAE evaluation of the stacked predictions: within each of the 500 groupings,
# average the predicted probabilities and the one-hot labels per group, take
# the mean absolute difference between them, and convert it to a score
# 1/(1+10*MAE). The cell displays the mean score over all groupings.
prob_cols = ['prob_Excellent','prob_Good','prob_Pass','prob_Fail']
label_cols = ['label_Excellent','label_Good','label_Pass','label_Fail']
mean = []
for group in range(500):
    # Columns are selected with a list: tuple-style selection
    # (groupby(...)['a','b']) is rejected by current pandas.
    result2_mae = result2.groupby('group_%s'%group,as_index=False)[prob_cols+label_cols].mean()
    a = np.abs(result2_mae.loc[:,prob_cols].values
               - result2_mae.loc[:,label_cols].values).mean()
    mean.append(1/(1+10*a))
np.mean(mean)

0.6945368950332439