In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
# To install catboost, run on the command line: pip install -i https://pypi.tuna.tsinghua.edu.cn/simple catboost

In [3]:
# Load the training and testing data.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas backslash separators only work on Windows.
train = pd.read_csv('./data/first_round_training_data.csv')
test = pd.read_csv('./data/first_round_testing_data.csv')

In [4]:
# Feature list: the columns Parameter5 through Parameter10
features = ["Parameter{}".format(index) for index in range(5, 11)]

In [5]:
# Label conversion
def quality_encoder(x):
    """Return the integer class id for a quality label string.

    'Excellent' -> 0, 'Good' -> 1, 'Pass' -> 2, 'Fail' -> 3.
    Any other value raises KeyError.
    """
    class_ids = dict(Excellent=0, Good=1, Pass=2, Fail=3)
    return class_ids[x]

# Integer class id per row, derived from the Quality_label column
train['label'] = train['Quality_label'].apply(quality_encoder)
# One 0/1 indicator column per class id
train['label_Excellent'] = train['label'].eq(0) * 1
train['label_Good'] = train['label'].eq(1) * 1
train['label_Pass'] = train['label'].eq(2) * 1
train['label_Fail'] = train['label'].eq(3) * 1

In [6]:
# Local multi-class accuracy evaluation: fit on the first 5000 rows of
# `train` and score on the remaining rows.
local_train = train.iloc[:5000].copy()
local_valid = train.iloc[5000:].copy()

model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=2000,
    learning_rate=0.01,
    depth=8,
    random_state=666,
    verbose=100,
)
model.fit(local_train[features], local_train['label'])

local_valid['prediction'] = model.predict(local_valid[features])
# Fraction of held-out rows whose predicted class equals the true label
local_valid['label'].eq(local_valid['prediction']).mean()

0:	learn: 1.3814611	total: 137ms	remaining: 4m 34s
100:	learn: 1.1609463	total: 6.18s	remaining: 1m 56s
200:	learn: 1.1042828	total: 12.1s	remaining: 1m 48s
300:	learn: 1.0812268	total: 17.8s	remaining: 1m 40s
400:	learn: 1.0687255	total: 23.6s	remaining: 1m 34s
500:	learn: 1.0594367	total: 29.1s	remaining: 1m 27s
600:	learn: 1.0521968	total: 34.8s	remaining: 1m 21s
700:	learn: 1.0462422	total: 40.3s	remaining: 1m 14s
800:	learn: 1.0408338	total: 45.7s	remaining: 1m 8s
900:	learn: 1.0360825	total: 51.2s	remaining: 1m 2s
1000:	learn: 1.0310389	total: 57s	remaining: 56.9s
1100:	learn: 1.0257770	total: 1m 2s	remaining: 51.4s
1200:	learn: 1.0207589	total: 1m 9s	remaining: 46s
1300:	learn: 1.0157969	total: 1m 15s	remaining: 40.6s
1400:	learn: 1.0113287	total: 1m 21s	remaining: 34.9s
1500:	learn: 1.0075482	total: 1m 27s	remaining: 29.2s
1600:	learn: 1.0034624	total: 1m 34s	remaining: 23.5s
1700:	learn: 0.9998904	total: 1m 40s	remaining: 17.7s
1800:	learn: 0.9964848	total: 1m 46s	remaining: 1

0.485

In [7]:
# Online submission: refit on the whole of `train`, then score `test`
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=2000,
    learning_rate=0.01,
    depth=8,
    random_state=903,
    verbose=100,
)
model.fit(train[features], train['label'])

test['prediction'] = model.predict(test[features])
# Create the four probability columns as float placeholders, then overwrite
# them with the per-class probabilities returned by predict_proba.
test['prob_Excellent'] = test['prob_Good'] = test['prob_Pass'] = test['prob_Fail'] = 0.0
test.loc[:, ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']] = model.predict_proba(
    test[features]
)

0:	learn: 1.3817637	total: 68.7ms	remaining: 2m 17s
100:	learn: 1.1648519	total: 6.33s	remaining: 1m 58s
200:	learn: 1.1100865	total: 12.6s	remaining: 1m 53s
300:	learn: 1.0893340	total: 18.7s	remaining: 1m 45s
400:	learn: 1.0773263	total: 24.4s	remaining: 1m 37s
500:	learn: 1.0688930	total: 30.4s	remaining: 1m 30s
600:	learn: 1.0622544	total: 36.2s	remaining: 1m 24s
700:	learn: 1.0567301	total: 42.2s	remaining: 1m 18s
800:	learn: 1.0516514	total: 48.2s	remaining: 1m 12s
900:	learn: 1.0472675	total: 54s	remaining: 1m 5s
1000:	learn: 1.0429465	total: 59.8s	remaining: 59.7s
1100:	learn: 1.0384945	total: 1m 5s	remaining: 53.8s
1200:	learn: 1.0333279	total: 1m 12s	remaining: 48s
1300:	learn: 1.0287419	total: 1m 18s	remaining: 42.3s
1400:	learn: 1.0247204	total: 1m 25s	remaining: 36.4s
1500:	learn: 1.0203804	total: 1m 31s	remaining: 30.4s
1600:	learn: 1.0164020	total: 1m 38s	remaining: 24.4s
1700:	learn: 1.0127281	total: 1m 44s	remaining: 18.4s
1800:	learn: 1.0092687	total: 1m 50s	remaining

In [9]:
# Build the submission file: mean predicted class probability per Group.
# The probability columns are selected with a list (double brackets).
# Selecting them with a bare tuple of keys, as in
# groupby(...)['a', 'b'], is deprecated in pandas 1.x and raises in pandas 2.x.
prediction = test.groupby(['Group'], as_index=False)[
    ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
].mean()
# Rename to the column headers written to the submission CSV
prediction.columns = ['Group','Excellent ratio','Good ratio','Pass ratio','Fail ratio']
prediction.to_csv('baseline.csv',index=False)

In [13]:
# Preview the first five rows of the test frame
test.iloc[:5]

Unnamed: 0,Group,Parameter1,Parameter10,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,prediction,prob_Excellent,prob_Good,prob_Pass,prob_Fail
0,0,0.235279,0.19568,0.351488,0.186814,0.086456,1.43406,1.230409,0.038483,2.931083,2.005852,0.0,0.42852,0.277435,0.254325,0.039719
1,0,0.043799,0.523976,32.590688,0.886285,29.70249,0.027434,0.016852,0.600827,0.118077,887.611257,3.0,0.012315,0.152226,0.055159,0.7803
2,0,21.466737,0.523976,11.847792,79499.554454,13544.605343,0.027434,0.016852,0.600827,0.118077,887.611257,3.0,0.012315,0.152226,0.055159,0.7803
3,0,0.703795,0.010192,0.01808,317.015599,38.087562,0.428092,0.314162,0.038483,17.850021,0.05185,1.0,0.102019,0.526633,0.126028,0.245321
4,0,10.44796,0.010192,15.983891,0.996049,16.71569,0.477823,0.464037,0.038483,17.850021,0.05185,1.0,0.095198,0.530673,0.13415,0.239979


In [21]:
# Show the first and last five rows of the submission table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([prediction.head(), prediction.tail()])

Unnamed: 0,Group,Excellent ratio,Good ratio,Pass ratio,Fail ratio
0,0,0.21907,0.336087,0.249561,0.195282
1,1,0.181191,0.260221,0.376934,0.181654
2,2,0.213087,0.303198,0.306363,0.177352
3,3,0.241189,0.300243,0.352785,0.105783
4,4,0.18097,0.377722,0.254252,0.187056
115,115,0.174227,0.247881,0.423375,0.154517
116,116,0.091107,0.111225,0.725299,0.072369
117,117,0.19282,0.272478,0.355537,0.179164
118,118,0.087955,0.101799,0.758373,0.051873
119,119,0.187666,0.274639,0.387277,0.150417
