In [7]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
# To install catboost, run on the command line: pip install -i https://pypi.tuna.tsinghua.edu.cn/simple catboost

In [8]:
# Load the training and testing data sets from CSV files (paths are relative to the working directory)
train = pd.read_csv(filepath_or_buffer='first_round_training_data.csv')
test = pd.read_csv(filepath_or_buffer='first_round_testing_data.csv')

In [9]:
# Feature list: the model input columns, "Parameter5" through "Parameter10"
features = ["Parameter" + str(param_no) for param_no in range(5, 11)]

In [10]:
# Label conversion
def quality_encoder(x):
    """Translate a quality grade name into its integer class id.

    Mapping: 'Excellent' -> 0, 'Good' -> 1, 'Pass' -> 2, 'Fail' -> 3.

    Raises:
        KeyError: if ``x`` is not one of the four grade names.
    """
    grade_to_id = dict(zip(('Excellent', 'Good', 'Pass', 'Fail'), range(4)))
    return grade_to_id[x]

# Integer class label (via quality_encoder), followed by one 0/1 indicator column per grade.
train['label'] = train.Quality_label.apply(quality_encoder)
for grade_id, grade_name in enumerate(['Excellent', 'Good', 'Pass', 'Fail']):
    # 1*(boolean Series) converts True/False into 1/0
    train['label_' + grade_name] = 1*(train['label'] == grade_id)

In [11]:
# Local evaluation of multi-class accuracy
# Hold-out split by row position: the first 5000 rows fit the model, the remaining rows validate it.
local_train = train.iloc[:5000,:].copy()
local_valid = train.iloc[5000:,:].copy()

model = CatBoostClassifier(iterations=2000,depth=8,learning_rate=0.01,verbose=100,loss_function='MultiClass',random_state=666)
model.fit(local_train.loc[:,features],local_train.label)

# MultiClass predict() can return a column-shaped (n, 1) array; flatten it so the
# stored column is guaranteed to be 1-D before it is compared with the labels.
local_valid['prediction'] = np.ravel(model.predict(local_valid.loc[:,features]))
# Accuracy: share of validation rows whose predicted class id equals the true class id
(local_valid.label == local_valid.prediction).mean()

0:	learn: 1.3809616	total: 105ms	remaining: 3m 30s
100:	learn: 1.1594414	total: 8.92s	remaining: 2m 47s
200:	learn: 1.1032685	total: 17.4s	remaining: 2m 35s
300:	learn: 1.0809452	total: 25.6s	remaining: 2m 24s
400:	learn: 1.0684502	total: 33.5s	remaining: 2m 13s
500:	learn: 1.0592936	total: 41.3s	remaining: 2m 3s
600:	learn: 1.0526293	total: 48.6s	remaining: 1m 53s
700:	learn: 1.0468903	total: 56.5s	remaining: 1m 44s
800:	learn: 1.0417044	total: 1m 3s	remaining: 1m 35s
900:	learn: 1.0376457	total: 1m 11s	remaining: 1m 26s
1000:	learn: 1.0330360	total: 1m 18s	remaining: 1m 18s
1100:	learn: 1.0281466	total: 1m 26s	remaining: 1m 10s
1200:	learn: 1.0231503	total: 1m 36s	remaining: 1m 3s
1300:	learn: 1.0188333	total: 1m 45s	remaining: 56.8s
1400:	learn: 1.0144731	total: 1m 55s	remaining: 49.5s
1500:	learn: 1.0102373	total: 2m 6s	remaining: 41.9s
1600:	learn: 1.0065442	total: 2m 16s	remaining: 34s
1700:	learn: 1.0025974	total: 2m 25s	remaining: 25.6s
1800:	learn: 0.9990192	total: 2m 34s	rema

0.481

In [12]:
# Online submission
# Refit on the full training set, using the same hyper-parameters as the local evaluation run.
model = CatBoostClassifier(iterations=2000,depth=8,learning_rate=0.01,verbose=100,loss_function='MultiClass',random_state=666)

model.fit(train.loc[:,features],train.label)

# MultiClass predict() can return a column-shaped (n, 1) array; flatten it so the
# stored column is guaranteed to be 1-D.
test['prediction'] = np.ravel(model.predict(test.loc[:,features]))

# Create the probability columns as floats first, then fill all four in one assignment.
# NOTE(review): assumes predict_proba's columns follow class ids 0..3
# (Excellent, Good, Pass, Fail) — confirm against model.classes_.
prob_columns = ['prob_Excellent','prob_Good','prob_Pass','prob_Fail']
for prob_column in prob_columns:
    test[prob_column] = 0.0
test.loc[:,prob_columns] = model.predict_proba(test.loc[:,features])

0:	learn: 1.3809997	total: 93.8ms	remaining: 3m 7s
100:	learn: 1.1658482	total: 8.76s	remaining: 2m 44s
200:	learn: 1.1109862	total: 17.6s	remaining: 2m 37s
300:	learn: 1.0892558	total: 26s	remaining: 2m 26s
400:	learn: 1.0772519	total: 34.5s	remaining: 2m 17s
500:	learn: 1.0693838	total: 42.4s	remaining: 2m 6s
600:	learn: 1.0626996	total: 50.6s	remaining: 1m 57s
700:	learn: 1.0565535	total: 58.7s	remaining: 1m 48s
800:	learn: 1.0517824	total: 1m 7s	remaining: 1m 40s
900:	learn: 1.0472620	total: 1m 14s	remaining: 1m 31s
1000:	learn: 1.0434407	total: 1m 22s	remaining: 1m 22s
1100:	learn: 1.0389269	total: 1m 31s	remaining: 1m 14s
1200:	learn: 1.0346725	total: 1m 39s	remaining: 1m 6s
1300:	learn: 1.0299140	total: 1m 48s	remaining: 58.4s
1400:	learn: 1.0262837	total: 1m 57s	remaining: 50.3s
1500:	learn: 1.0220511	total: 2m 6s	remaining: 42s
1600:	learn: 1.0182999	total: 2m 15s	remaining: 33.7s
1700:	learn: 1.0148734	total: 2m 24s	remaining: 25.3s
1800:	learn: 1.0117304	total: 2m 33s	remain

In [13]:
# Build the submission file: average the per-row class probabilities within each Group.
# The probability columns are selected with a list (double brackets); selecting them with
# bare comma-separated keys passes a tuple, which recent pandas versions reject.
prediction = test.groupby(['Group'],as_index=False)[['prob_Excellent','prob_Good','prob_Pass','prob_Fail']].mean()
# as_index=False keeps 'Group' as the first column, so five names are assigned here
prediction.columns = ['Group','Excellent ratio','Good ratio','Pass ratio','Fail ratio']
prediction.to_csv('baseline.csv',index=False)