In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
# catboost安装方式：命令行输入 pip install -i https://pypi.tuna.tsinghua.edu.cn/simple catboost

In [2]:
# Load the first-round training and testing data.
# Forward slashes are used instead of Windows-only backslashes so the same
# relative paths resolve on Windows, Linux and macOS.
train = pd.read_csv('./data/first_round_training_data.csv')
test = pd.read_csv('./data/first_round_testing_data.csv')

In [3]:
# Feature list: the columns Parameter5 through Parameter10 fed to the model.
features = [f"Parameter{number}" for number in range(5, 11)]

In [4]:
# Label conversion: quality name -> integer class code.
_QUALITY_TO_CODE = {'Excellent': 0, 'Good': 1, 'Pass': 2, 'Fail': 3}


def quality_encoder(x):
    """Return the integer class code for a quality label.

    'Excellent' -> 0, 'Good' -> 1, 'Pass' -> 2, 'Fail' -> 3.
    Any other value raises KeyError.
    """
    return _QUALITY_TO_CODE[x]

# Integer class code per row, followed by one 0/1 indicator column per class.
train['label'] = train['Quality_label'].apply(quality_encoder)
train['label_Excellent'] = train['label'].eq(0).astype(int)
train['label_Good'] = train['label'].eq(1).astype(int)
train['label_Pass'] = train['label'].eq(2).astype(int)
train['label_Fail'] = train['label'].eq(3).astype(int)

In [5]:
# Local multi-class accuracy check: fit on the first 5000 rows of `train`
# and score on the remaining rows.
local_train = train.iloc[:5000].copy()
local_valid = train.iloc[5000:].copy()

model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=2000,
    depth=8,
    learning_rate=0.01,
    random_state=666,
    verbose=100,
)
model.fit(local_train[features], local_train['label'])

local_valid['prediction'] = model.predict(local_valid[features])
# Fraction of hold-out rows whose predicted class equals the true label.
local_valid['label'].eq(local_valid['prediction']).mean()

0:	learn: 1.3814611	total: 146ms	remaining: 4m 52s
100:	learn: 1.1609463	total: 6.37s	remaining: 1m 59s
200:	learn: 1.1042828	total: 12.3s	remaining: 1m 50s
300:	learn: 1.0812268	total: 18s	remaining: 1m 41s
400:	learn: 1.0687255	total: 23.8s	remaining: 1m 34s
500:	learn: 1.0594367	total: 29.3s	remaining: 1m 27s
600:	learn: 1.0521968	total: 35.1s	remaining: 1m 21s
700:	learn: 1.0462422	total: 40.5s	remaining: 1m 15s
800:	learn: 1.0408338	total: 45.7s	remaining: 1m 8s
900:	learn: 1.0360825	total: 51.1s	remaining: 1m 2s
1000:	learn: 1.0310389	total: 56.8s	remaining: 56.6s
1100:	learn: 1.0257770	total: 1m 2s	remaining: 51.1s
1200:	learn: 1.0207589	total: 1m 8s	remaining: 45.6s
1300:	learn: 1.0157969	total: 1m 14s	remaining: 40.2s
1400:	learn: 1.0113287	total: 1m 20s	remaining: 34.6s
1500:	learn: 1.0075482	total: 1m 27s	remaining: 29s
1600:	learn: 1.0034624	total: 1m 33s	remaining: 23.3s
1700:	learn: 0.9998904	total: 1m 39s	remaining: 17.5s
1800:	learn: 0.9964848	total: 1m 45s	remaining: 1

0.485

In [6]:
# Online submission: refit on all of `train`, then score `test`.
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=2000,
    depth=8,
    learning_rate=0.01,
    random_state=903,
    verbose=100,
)

model.fit(train[features], train['label'])

test['prediction'] = model.predict(test[features])

# Create the four probability columns as floats, then fill them positionally
# from predict_proba.
# NOTE(review): assumes predict_proba columns follow label codes 0..3
# (Excellent, Good, Pass, Fail) -- confirm against model.classes_.
prob_columns = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
for prob_column in prob_columns:
    test[prob_column] = 0.0
test.loc[:, prob_columns] = model.predict_proba(test[features])

0:	learn: 1.3817637	total: 91.4ms	remaining: 3m 2s
100:	learn: 1.1648519	total: 6.31s	remaining: 1m 58s
200:	learn: 1.1100865	total: 12.5s	remaining: 1m 52s
300:	learn: 1.0893340	total: 18.5s	remaining: 1m 44s
400:	learn: 1.0773263	total: 24.2s	remaining: 1m 36s
500:	learn: 1.0688930	total: 30s	remaining: 1m 29s
600:	learn: 1.0622544	total: 35.9s	remaining: 1m 23s
700:	learn: 1.0567301	total: 41.8s	remaining: 1m 17s
800:	learn: 1.0516514	total: 47.7s	remaining: 1m 11s
900:	learn: 1.0472675	total: 53.4s	remaining: 1m 5s
1000:	learn: 1.0429465	total: 59.3s	remaining: 59.2s
1100:	learn: 1.0384945	total: 1m 5s	remaining: 53.3s
1200:	learn: 1.0333279	total: 1m 11s	remaining: 47.5s
1300:	learn: 1.0287419	total: 1m 17s	remaining: 41.8s
1400:	learn: 1.0247204	total: 1m 24s	remaining: 36.1s
1500:	learn: 1.0203804	total: 1m 30s	remaining: 30.2s
1600:	learn: 1.0164020	total: 1m 37s	remaining: 24.2s
1700:	learn: 1.0127281	total: 1m 43s	remaining: 18.2s
1800:	learn: 1.0092687	total: 1m 50s	remainin

In [7]:
# Build the submission file: mean predicted class probabilities per Group.
# The columns are selected with a list (double brackets); selecting several
# columns from a GroupBy with a bare tuple was deprecated and raises in pandas >= 2.0.
prediction = test.groupby(['Group'], as_index=False)[
    ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
].mean()
# Rename to the column headers used in the submission CSV.
prediction.columns = ['Group', 'Excellent ratio', 'Good ratio', 'Pass ratio', 'Fail ratio']
prediction.to_csv('baseline.csv', index=False)