In [102]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from tqdm import tqdm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
import math
import gc

pd.set_option('display.max_columns', 100)

In [17]:
train = pd.read_csv('.\\data\\first_round_training_data.csv')
test = pd.read_csv('.\\data\\first_round_testing_data.csv')
submit_example = pd.read_csv('.\\data\\submit_example.csv')

print('shape of train:', train.shape)
print('shape of test:', test.shape)
print('shape of submit:', submit_example.shape)

shape of train: (6000, 21)
shape of test: (6000, 11)
shape of submit: (120, 5)


In [18]:
def label_map(x):
    labels = {'Excellent':1, 'Good':2, 'Pass':3, 'Fail':4}
    return labels[x]

train['label'] = train.Quality_label.apply(label_map)
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label,label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass,3
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail,4
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail,4
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail,4
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail,4


In [59]:
features = ['Parameter5', 'Parameter6', 'Parameter7', 'Parameter8', 
            'Parameter9', 'Parameter10']
full = pd.concat([train[features], test[features]], ignore_index=True)

In [63]:
full.shape

(12000, 12)

In [61]:
for feature in features:
    lr = LabelEncoder()
    full[feature+'_le'] = lr.fit_transform(full[feature])

In [62]:
full.head()

Unnamed: 0,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter5_le,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le
0,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,25,24,12,1,5,24
1,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24
2,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,33,30,12,1,5,24
3,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24
4,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24


In [68]:
def woe_preprocess(full, train, col, label):
    # col: Parameter5_le
    # label: 1/2/3/4
    label_cnt = train.label.value_counts()
    total_cnt = label_cnt.sum()
    label_cnt = train[col].value_counts()
    woe_map = {}
    for cat in label_cnt.index:
        cnt_cat_label = train[(train[col]==cat)&(train.label==label)].shape[0]
        cnt_cat_not_label = train[(train[col]==cat)&(train.label!=label)].shape[0]
        woe_map[cat] = math.log((cnt_cat_not_label + 0.5) / (cnt_cat_label + 0.5))
    full[col+'_woe_'+str(label)] = full[col].apply(lambda x: woe_map.get(x))
    # test = pd.merge(test, train[[col.split('_')[0], col+'_woe_'+str(label)]], on=[col.split('_')[0]], how='left')
    return full

In [69]:
for feature in tqdm(features):
    for label in [1,2,3,4]:
        # print(feature, label)
        full = woe_preprocess(full, train, feature+'_le', label)

100%|█████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.28it/s]


In [71]:
full.head(5).append(full.tail(5))

Unnamed: 0,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter5_le,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le,Parameter5_le_woe_1,Parameter5_le_woe_2,Parameter5_le_woe_3,Parameter5_le_woe_4,Parameter6_le_woe_1,Parameter6_le_woe_2,Parameter6_le_woe_3,Parameter6_le_woe_4,Parameter7_le_woe_1,Parameter7_le_woe_2,Parameter7_le_woe_3,Parameter7_le_woe_4,Parameter8_le_woe_1,Parameter8_le_woe_2,Parameter8_le_woe_3,Parameter8_le_woe_4,Parameter9_le_woe_1,Parameter9_le_woe_2,Parameter9_le_woe_3,Parameter9_le_woe_4,Parameter10_le_woe_1,Parameter10_le_woe_2,Parameter10_le_woe_3,Parameter10_le_woe_4
0,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,25,24,12,1,5,24,1.650681,0.705886,0.705886,1.418383,0.796331,1.062894,1.209838,1.209838,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
1,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24,0.998529,0.635989,2.036882,0.635989,0.619039,1.098612,0.200671,2.944439,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
2,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,33,30,12,1,5,24,0.998529,0.635989,2.036882,0.635989,1.512588,0.887303,1.17412,0.756326,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
3,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24,0.998529,0.635989,2.036882,0.635989,0.619039,1.098612,0.200671,2.944439,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
4,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24,0.998529,0.635989,2.036882,0.635989,0.619039,1.098612,0.200671,2.944439,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
11995,0.000218,0.000414,2286.523413,0.035407,0.593081,51.944717,19,22,12,1,5,37,2.564949,1.299283,1.299283,-0.587787,2.944439,1.098612,-0.200671,1.098612,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.665008,0.52702,0.592504,1.911719
11996,3.095123,1.817391,0.600827,17.850021,6.783967,0.19568,110,67,5,15,7,19,1.340385,0.637797,0.591098,2.18546,1.396406,0.965359,0.493658,1.734601,1.557634,1.731954,-0.513229,2.918661,1.867745,2.693125,-1.275543,3.310543,1.405343,0.292987,1.405343,1.483838,1.118613,1.286665,0.341592,1.928961
11997,0.74163,1.495371,0.600827,17.850021,0.05185,0.073078,96,66,5,15,3,15,0.865418,0.888259,0.865418,1.984376,1.590569,1.380371,-0.145827,2.236515,1.557634,1.731954,-0.513229,2.918661,1.867745,2.693125,-1.275543,3.310543,1.836732,1.228989,-0.016232,1.879048,0.783249,1.046714,0.605674,2.482967
11998,3.454681,3.262468,0.600827,17.850021,6.783967,0.027291,111,70,5,15,7,12,1.44977,1.163635,0.252252,1.828841,1.810109,1.481057,-0.315081,2.192274,1.557634,1.731954,-0.513229,2.918661,1.867745,2.693125,-1.275543,3.310543,1.405343,0.292987,1.405343,1.483838,0.97538,0.369217,1.371479,2.014154
11999,1.031282,0.833011,0.038483,2.931083,2.005852,0.073078,99,63,3,10,6,15,1.39083,1.30236,0.13062,1.929449,0.87068,0.579609,1.078662,2.244042,1.932864,1.267411,-0.155926,2.03613,1.098612,1.098612,-0.336472,2.397895,2.586689,1.098612,1.734601,-0.200671,0.783249,1.046714,0.605674,2.482967


In [78]:
# test部分woe的缺失值用对应部分的mean表示
for col in full.columns:
    mean = full[col].mean()
    full[col].fillna(mean, inplace=True)

In [86]:
le_features = ['Parameter5_le', 'Parameter6_le', 'Parameter7_le', 'Parameter8_le', 'Parameter9_le', 'Parameter10_le']
trainX = full[:train.shape[0]]
trainX.drop(le_features, axis=1, inplace=True)

testX = full[train.shape[0]:]
testX.drop(le_features, axis=1, inplace=True)

print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


shape of trainX: (6000, 30)
shape of testX: (6000, 30)


In [104]:
full.shape

(12000, 36)

In [117]:
test_full = full[train.shape[0]:]
test_full.head()

Unnamed: 0,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter5_le,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le,Parameter5_le_woe_1,Parameter5_le_woe_2,Parameter5_le_woe_3,Parameter5_le_woe_4,Parameter6_le_woe_1,Parameter6_le_woe_2,Parameter6_le_woe_3,Parameter6_le_woe_4,Parameter7_le_woe_1,Parameter7_le_woe_2,Parameter7_le_woe_3,Parameter7_le_woe_4,Parameter8_le_woe_1,Parameter8_le_woe_2,Parameter8_le_woe_3,Parameter8_le_woe_4,Parameter9_le_woe_1,Parameter9_le_woe_2,Parameter9_le_woe_3,Parameter9_le_woe_4,Parameter10_le_woe_1,Parameter10_le_woe_2,Parameter10_le_woe_3,Parameter10_le_woe_4
6000,1.43406,1.230409,0.038483,2.931083,2.005852,0.19568,102,65,3,10,6,19,1.148623,0.955511,0.349184,2.438997,0.968347,1.237968,0.470686,1.99482,1.932864,1.267411,-0.155926,2.03613,1.098612,1.098612,-0.336472,2.397895,2.586689,1.098612,1.734601,-0.200671,1.118613,1.286665,0.341592,1.928961
6001,0.027434,0.016852,0.600827,0.118077,887.611257,0.523976,65,41,5,4,12,22,2.833213,0.0,0.451985,1.609438,3.100092,1.008664,1.291984,0.034486,1.557634,1.731954,-0.513229,2.918661,2.90658,0.987827,0.725334,0.589684,1.098612,1.098612,1.098612,-1.098612,2.120264,1.128135,-0.389465,2.793208
6002,0.027434,0.016852,0.600827,0.118077,887.611257,0.523976,65,41,5,4,12,22,2.833213,0.0,0.451985,1.609438,3.100092,1.008664,1.291984,0.034486,1.557634,1.731954,-0.513229,2.918661,2.90658,0.987827,0.725334,0.589684,1.098612,1.098612,1.098612,-1.098612,2.120264,1.128135,-0.389465,2.793208
6003,0.428092,0.314162,0.038483,17.850021,0.05185,0.010192,91,58,3,15,3,8,0.992129,0.992129,0.680877,1.820747,1.129149,0.535143,1.129149,1.708109,1.932864,1.267411,-0.155926,2.03613,1.867745,2.693125,-1.275543,3.310543,1.836732,1.228989,-0.016232,1.879048,2.833213,1.148623,2.14352,-0.451985
6004,0.477823,0.464037,0.038483,17.850021,0.05185,0.010192,92,60,3,15,3,8,1.021651,0.430036,1.082987,2.197225,1.18756,0.852479,0.652656,1.850028,1.932864,1.267411,-0.155926,2.03613,1.867745,2.693125,-1.275543,3.310543,1.836732,1.228989,-0.016232,1.879048,2.833213,1.148623,2.14352,-0.451985


In [136]:
np.mean(full.Parameter5_le_woe_4.value_counts().index)

1.372512343770417

In [135]:
full.Parameter5_le_woe_4.mean()

1.962605517197862

### PCA降维

In [137]:
pca = PCA(n_components=10)
full_pca = pca.fit_transform(full)

trainX = full_pca[:train.shape[0]]
testX = full_pca[train.shape[0]:]

print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

shape of trainX: (6000, 10)
shape of testX: (6000, 10)


In [141]:
model = CatBoostClassifier(iterations=5000, learning_rate=0.01, 
                           depth=10, loss_function='MultiClass', task_type='GPU',
                           random_state=925*48, verbose=200, early_stopping_rounds=100)
model.fit(trainX, train.label)
predY = model.predict(testX)



0:	learn: 1.3808649	total: 92.1ms	remaining: 30m 41s
200:	learn: 1.0370697	total: 8.57s	remaining: 14m 4s
400:	learn: 0.9697907	total: 17s	remaining: 13m 51s
600:	learn: 0.9304499	total: 25.9s	remaining: 13m 54s
800:	learn: 0.8997545	total: 34.5s	remaining: 13m 46s
1000:	learn: 0.8716698	total: 43s	remaining: 13m 36s
1200:	learn: 0.8458390	total: 51.5s	remaining: 13m 25s
1400:	learn: 0.8226207	total: 59.9s	remaining: 13m 15s
1600:	learn: 0.8012358	total: 1m 8s	remaining: 13m 7s
1800:	learn: 0.7823035	total: 1m 16s	remaining: 12m 57s
2000:	learn: 0.7648421	total: 1m 25s	remaining: 12m 47s
2200:	learn: 0.7484110	total: 1m 33s	remaining: 12m 37s
2400:	learn: 0.7335299	total: 1m 42s	remaining: 12m 27s
2600:	learn: 0.7194675	total: 1m 50s	remaining: 12m 18s
2800:	learn: 0.7066277	total: 1m 58s	remaining: 12m 9s
3000:	learn: 0.6947908	total: 2m 7s	remaining: 12m
3200:	learn: 0.6832568	total: 2m 15s	remaining: 11m 51s
3400:	learn: 0.6723248	total: 2m 24s	remaining: 11m 43s
3600:	learn: 0.6620

In [142]:
test['pred'] = predY
test['prob_Excellent'] = 0.0
test['prob_Good'] = 0.0
test['prob_Pass'] = 0.0
test['prob_Fail'] = 0.0
test.loc[:, ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']] = model.predict_proba(testX)
test.head(10)

Unnamed: 0,Group,Parameter1,Parameter10,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,pred,prob_Excellent,prob_Good,prob_Pass,prob_Fail
0,0,0.235279,0.19568,0.351488,0.186814,0.086456,1.43406,1.230409,0.038483,2.931083,2.005852,2.0,0.236484,0.709537,0.05023,0.003749
1,0,0.043799,0.523976,32.590688,0.886285,29.70249,0.027434,0.016852,0.600827,0.118077,887.611257,4.0,0.000461,0.04998,0.040283,0.909276
2,0,21.466737,0.523976,11.847792,79499.554454,13544.605343,0.027434,0.016852,0.600827,0.118077,887.611257,4.0,0.000461,0.04998,0.040283,0.909276
3,0,0.703795,0.010192,0.01808,317.015599,38.087562,0.428092,0.314162,0.038483,17.850021,0.05185,2.0,0.01148,0.619887,0.051458,0.317174
4,0,10.44796,0.010192,15.983891,0.996049,16.71569,0.477823,0.464037,0.038483,17.850021,0.05185,2.0,0.304285,0.502574,0.036332,0.156809
5,0,733.43992,0.010192,99.698641,3.202776,31.368042,0.477823,0.464037,0.038483,17.850021,0.05185,2.0,0.304285,0.502574,0.036332,0.156809
6,0,0.149962,0.010192,2.910066,2.187656,331.744593,0.533331,0.464037,0.038483,17.850021,0.05185,2.0,0.034474,0.753438,0.054089,0.157999
7,0,0.060635,0.010192,0.016492,0.007373,7183.436876,0.533331,0.464037,0.038483,17.850021,0.05185,2.0,0.034474,0.753438,0.054089,0.157999
8,0,971.284623,0.010192,29.954312,3.639672,0.136383,0.477823,0.381815,0.038483,17.850021,0.05185,2.0,0.014488,0.747721,0.021478,0.216313
9,0,1372.664549,0.010192,2.103451,0.531944,5.802723,0.477823,0.464037,0.038483,17.850021,0.05185,2.0,0.304285,0.502574,0.036332,0.156809


In [143]:
result = test.groupby(['Group'], as_index=False)['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail'].mean()
result.columns = ['Group','Excellent ratio','Good ratio','Pass ratio','Fail ratio']
result.to_csv('submission19.csv', index=False)

In [91]:
CatBoostClassifier?

In [90]:
train.label.value_counts()

3    2417
2    1584
1    1107
4     892
Name: label, dtype: int64

In [139]:
1 / (1 + 10 * 0.06258054507206764)

0.6150797437397731