In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import HuberRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

In [20]:
# Load the first-round training and testing tables from CSV files addressed
# relative to the notebook's working directory.
train = pd.read_csv('first_round_training_data.csv')
test = pd.read_csv('first_round_testing_data.csv')

def quality_encoder(x):
    """Return the integer code for a quality grade.

    'Excellent' -> 0, 'Good' -> 1, 'Pass' -> 2, 'Fail' -> 3.
    Any other value raises KeyError.
    """
    grades = ('Excellent', 'Good', 'Pass', 'Fail')
    code_by_grade = {grade: code for code, grade in enumerate(grades)}
    return code_by_grade[x]
# Integer class label derived from the textual Quality_label column.
train['label'] = train['Quality_label'].apply(quality_encoder)

In [21]:
# Plain hold-out validation: fit on the first 5000 rows, score on the remainder.

features = ['Parameter5','Parameter6','Parameter7','Parameter8','Parameter9','Parameter10']
prob_cols = ['prob_Excellent','prob_Good','prob_Pass','prob_Fail']

X_train = train.iloc[:5000,:].copy()
X_valid = train.iloc[5000:,:].copy()

model = CatBoostClassifier(iterations=100,depth=7,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
model.fit(X_train.loc[:,features].values,X_train.label.values)

# Score with the same input type that was used for fitting (a bare ndarray),
# so fit and predict never disagree about feature names.
valid_matrix = X_valid.loc[:,features].values

# Create the probability columns first so the .loc block assignment below
# writes into columns that already exist.
for col in prob_cols:
    X_valid[col] = 0.0
X_valid.loc[:,prob_cols] = model.predict_proba(valid_matrix)

# predict() on a MultiClass model can return an (n, 1) array; flatten it so
# label_pred is always a plain 1-D column.
X_valid['label_pred'] = np.asarray(model.predict(valid_matrix)).ravel()

# Hold-out accuracy.
(X_valid['label_pred'] == X_valid['label']).mean()

0:	learn: 1.3400205	total: 43.3ms	remaining: 4.29s
99:	learn: 1.0118615	total: 4.05s	remaining: 0us


0.479

In [22]:
# K-fold cross-validation: every training row receives an out-of-fold prediction.

features = ['Parameter5','Parameter6','Parameter7','Parameter8','Parameter9','Parameter10']
prob_cols = ['prob_Excellent','prob_Good','prob_Pass','prob_Fail']

kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(train)
result = []
for train_index,valid_index in split:

    X_train=train.iloc[train_index,:].copy()
    X_valid=train.iloc[valid_index,:].copy()

    model = CatBoostClassifier(iterations=100,depth=7,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
    model.fit(X_train.loc[:,features].values,X_train.label.values)

    # Score with the same input type that was used for fitting (a bare
    # ndarray), so fit and predict never disagree about feature names.
    valid_matrix = X_valid.loc[:,features].values

    # Create the probability columns first so the .loc block assignment
    # below writes into columns that already exist.
    for col in prob_cols:
        X_valid[col] = 0.0
    X_valid.loc[:,prob_cols] = model.predict_proba(valid_matrix)

    # predict() on a MultiClass model can return an (n, 1) array; flatten it
    # so label_pred is always a plain 1-D column.
    X_valid['label_pred'] = np.asarray(model.predict(valid_matrix)).ravel()

    result.append(X_valid)
# Stack the per-fold validation frames back into one frame.
result=pd.concat(result)

# Out-of-fold accuracy over all rows.
(result['label_pred'] == result['label']).mean()

# Author's note: this is where you can see why the first four features are not used.

0:	learn: 1.3810680	total: 74ms	remaining: 2m 27s
100:	learn: 1.1589573	total: 6.87s	remaining: 2m 9s
200:	learn: 1.1040708	total: 13.2s	remaining: 1m 57s
300:	learn: 1.0818434	total: 19.7s	remaining: 1m 51s
400:	learn: 1.0687852	total: 25.8s	remaining: 1m 42s
500:	learn: 1.0596792	total: 33.3s	remaining: 1m 39s
600:	learn: 1.0523142	total: 40.9s	remaining: 1m 35s
700:	learn: 1.0455993	total: 47.5s	remaining: 1m 28s
800:	learn: 1.0401737	total: 55.1s	remaining: 1m 22s
900:	learn: 1.0350806	total: 1m 1s	remaining: 1m 14s
1000:	learn: 1.0301295	total: 1m 6s	remaining: 1m 6s
1100:	learn: 1.0248692	total: 1m 12s	remaining: 59.5s
1200:	learn: 1.0195575	total: 1m 18s	remaining: 52.5s
1300:	learn: 1.0147610	total: 1m 25s	remaining: 45.7s
1400:	learn: 1.0098228	total: 1m 31s	remaining: 39s
1500:	learn: 1.0054533	total: 1m 37s	remaining: 32.3s
1600:	learn: 1.0014004	total: 1m 43s	remaining: 25.8s
1700:	learn: 0.9976450	total: 1m 49s	remaining: 19.2s
1800:	learn: 0.9935811	total: 1m 55s	remainin

0.51

In [27]:
# Blending: average the out-of-fold class probabilities of two CatBoost models
# that differ only in tree depth.
features = ['Parameter5','Parameter6','Parameter7','Parameter8','Parameter9','Parameter10']
class_names = ['Excellent','Good','Pass','Fail']
# (probability-column prefix, tree depth) for each base model.
base_models = [('prob1', 7), ('prob2', 10)]

kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(train)
result = []
for train_index,valid_index in split:

    X_train=train.iloc[train_index,:].copy()
    X_valid=train.iloc[valid_index,:].copy()

    # Fit and score on the same input type (bare ndarrays), so fit and
    # predict never disagree about feature names.
    train_matrix = X_train.loc[:,features].values
    valid_matrix = X_valid.loc[:,features].values

    for prefix, depth in base_models:
        model = CatBoostClassifier(iterations=100,depth=depth,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
        model.fit(train_matrix,X_train.label.values)

        # Create the probability columns first so the .loc block assignment
        # below writes into columns that already exist.
        model_cols = [prefix + '_' + name for name in class_names]
        for col in model_cols:
            X_valid[col] = 0.0
        X_valid.loc[:,model_cols] = model.predict_proba(valid_matrix)

    result.append(X_valid)
# Stack the per-fold validation frames back into one frame.
result=pd.concat(result)

prob_cols = ['prob_' + name for name in class_names]
prob1_cols = ['prob1_' + name for name in class_names]
prob2_cols = ['prob2_' + name for name in class_names]

for col in prob_cols:
    result[col] = 0.0
# Equal-weight blend of the two models' probabilities.
result.loc[:,prob_cols] = 0.5*result.loc[:,prob1_cols].values+0.5*result.loc[:,prob2_cols].values

# Predicted label = position of the largest blended probability, which matches
# the 0..3 label encoding through the order of class_names.
result['label_pred'] = np.argmax(result.loc[:,prob_cols].values,axis=1)
(result['label_pred'] == result['label']).mean()

0:	learn: 1.3367231	total: 46.1ms	remaining: 4.57s
99:	learn: 1.0060527	total: 4.32s	remaining: 0us
0:	learn: 1.3358802	total: 127ms	remaining: 12.6s
99:	learn: 0.9559350	total: 7.25s	remaining: 0us
0:	learn: 1.3410821	total: 29.7ms	remaining: 2.94s
99:	learn: 1.0321815	total: 2.98s	remaining: 0us
0:	learn: 1.3408880	total: 52.7ms	remaining: 5.22s
99:	learn: 0.9862171	total: 6.98s	remaining: 0us
0:	learn: 1.3403853	total: 29.8ms	remaining: 2.95s
99:	learn: 1.0165788	total: 2.93s	remaining: 0us
0:	learn: 1.3379317	total: 74ms	remaining: 7.33s
99:	learn: 0.9619551	total: 7.08s	remaining: 0us
0:	learn: 1.3465247	total: 30.2ms	remaining: 2.99s
99:	learn: 1.0324442	total: 3.04s	remaining: 0us
0:	learn: 1.3458274	total: 83.7ms	remaining: 8.28s
99:	learn: 0.9843060	total: 9.44s	remaining: 0us
0:	learn: 1.3430928	total: 35.5ms	remaining: 3.51s
99:	learn: 1.0140480	total: 3.81s	remaining: 0us
0:	learn: 1.3419605	total: 142ms	remaining: 14.1s
99:	learn: 0.9668975	total: 9.02s	remaining: 0us


0.5268333333333334

In [34]:
# Re-blend the stored out-of-fold probabilities with weight 0 on the prob1_*
# columns and weight 1 on the prob2_* columns, then re-score.
blend_cols = ['prob_Excellent','prob_Good','prob_Pass','prob_Fail']
first_probs = result.loc[:,['prob1_Excellent','prob1_Good','prob1_Pass','prob1_Fail']].values
second_probs = result.loc[:,['prob2_Excellent','prob2_Good','prob2_Pass','prob2_Fail']].values
result.loc[:,blend_cols] = 0*first_probs+1*second_probs

blended = result.loc[:,blend_cols].values
result['label_pred'] = np.argmax(blended,axis=1)
(result['label_pred'] == result['label']).mean()

0.5266666666666666

In [39]:
# Stacking: level 1 produces out-of-fold probabilities from two CatBoost models;
# level 2 fits another CatBoost model on those probabilities.
features = ['Parameter5','Parameter6','Parameter7','Parameter8','Parameter9','Parameter10']
class_names = ['Excellent','Good','Pass','Fail']
# (probability-column prefix, tree depth) for each level-1 model.
base_models = [('prob1', 7), ('prob2', 10)]

kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(train)
result = []
for train_index,valid_index in split:

    X_train=train.iloc[train_index,:].copy()
    X_valid=train.iloc[valid_index,:].copy()

    # Fit and score on the same input type (bare ndarrays), so fit and
    # predict never disagree about feature names.
    train_matrix = X_train.loc[:,features].values
    valid_matrix = X_valid.loc[:,features].values

    for prefix, depth in base_models:
        model = CatBoostClassifier(iterations=100,depth=depth,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
        model.fit(train_matrix,X_train.label.values)

        # Create the probability columns first so the .loc block assignment
        # below writes into columns that already exist.
        model_cols = [prefix + '_' + name for name in class_names]
        for col in model_cols:
            X_valid[col] = 0.0
        X_valid.loc[:,model_cols] = model.predict_proba(valid_matrix)

    result.append(X_valid)
# Stack the per-fold validation frames back into one frame.
result=pd.concat(result)

# Second-level cross-validation over the level-1 out-of-fold probabilities.
# NOTE(review): each row's level-1 features come from models that were trained
# on rows belonging to other level-2 folds, so this estimate may be slightly
# optimistic - confirm before relying on small differences.
features = [prefix + '_' + name for prefix, _ in base_models for name in class_names]
prob_cols = ['prob_' + name for name in class_names]
kfold=KFold(n_splits=5, shuffle=False)
split=kfold.split(result)
result2 = []
for train_index,valid_index in split:

    X_train=result.iloc[train_index,:].copy()
    X_valid=result.iloc[valid_index,:].copy()

    # Second-level model (shallower trees than the level-1 models).
    model = CatBoostClassifier(iterations=100,depth=3,learning_rate=0.1,verbose=100,random_state=0,loss_function='MultiClass')
    model.fit(X_train.loc[:,features].values,X_train.label.values)

    # Create the probability columns first, then fill them; score on an
    # ndarray to match the input type used for fitting.
    for col in prob_cols:
        X_valid[col] = 0.0
    X_valid.loc[:,prob_cols] = model.predict_proba(X_valid.loc[:,features].values)

    result2.append(X_valid)
# Stack the per-fold level-2 validation frames back into one frame.
result2=pd.concat(result2)

0:	learn: 1.3367231	total: 32.8ms	remaining: 3.24s
99:	learn: 1.0060527	total: 3.46s	remaining: 0us
0:	learn: 1.3358802	total: 75ms	remaining: 7.43s
99:	learn: 0.9559350	total: 6.96s	remaining: 0us
0:	learn: 1.3410821	total: 29.8ms	remaining: 2.95s
99:	learn: 1.0321815	total: 2.96s	remaining: 0us
0:	learn: 1.3408880	total: 48.6ms	remaining: 4.81s
99:	learn: 0.9862171	total: 7.04s	remaining: 0us
0:	learn: 1.3403853	total: 30.6ms	remaining: 3.03s
99:	learn: 1.0165788	total: 2.97s	remaining: 0us
0:	learn: 1.3379317	total: 70.7ms	remaining: 7s
99:	learn: 0.9619551	total: 7.12s	remaining: 0us
0:	learn: 1.3465247	total: 30ms	remaining: 2.97s
99:	learn: 1.0324442	total: 3.02s	remaining: 0us
0:	learn: 1.3458274	total: 72ms	remaining: 7.13s
99:	learn: 0.9843060	total: 7.07s	remaining: 0us
0:	learn: 1.3430928	total: 30.3ms	remaining: 3s
99:	learn: 1.0140480	total: 3s	remaining: 0us
0:	learn: 1.3419605	total: 76.7ms	remaining: 7.59s
99:	learn: 0.9668975	total: 7.21s	remaining: 0us
0:	learn: 1.332

In [40]:
# Score the stacked model: predicted label is the position of the largest
# level-2 probability among the four class columns.
stacked_probs = result2.loc[:,['prob_Excellent','prob_Good','prob_Pass','prob_Fail']].values
result2['label_pred'] = np.argmax(stacked_probs,axis=1)
(result2['label_pred'] == result2['label']).mean()

0.5296666666666666