In [28]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import pandas_profiling as ppf
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import os
import gc

# Notebook-wide settings: display up to 50 columns per frame and silence all
# Python warnings.
# NOTE(review): the blanket "ignore" filter also hides pandas
# SettingWithCopyWarning / deprecation warnings raised by later cells.
pd.options.display.max_columns = 50
warnings.simplefilter('ignore')

In [2]:
# Load the competition files from the local "data" directory. Paths are built
# with os.path.join so the notebook also runs on non-Windows systems (the
# previous '.\\data\\...' literals only resolve on Windows).
DATA_DIR = 'data'
train = pd.read_csv(os.path.join(DATA_DIR, 'first_round_training_data.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'first_round_testing_data.csv'))
submit_example = pd.read_csv(os.path.join(DATA_DIR, 'submit_example.csv'))

print('shape of train:', train.shape)
print('shape of test:', test.shape)
print('shape of submit:', submit_example.shape)

shape of train: (6000, 21)
shape of test: (6000, 11)
shape of submit: (120, 5)


In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail


In [4]:
# First and last five rows of the test frame. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same stacked preview.
pd.concat([test.head(), test.tail()])

Unnamed: 0,Group,Parameter1,Parameter10,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9
0,0,0.235279,0.19568,0.351488,0.186814,0.086456,1.43406,1.230409,0.038483,2.931083,2.005852
1,0,0.043799,0.523976,32.590688,0.886285,29.70249,0.027434,0.016852,0.600827,0.118077,887.611257
2,0,21.466737,0.523976,11.847792,79499.554454,13544.605343,0.027434,0.016852,0.600827,0.118077,887.611257
3,0,0.703795,0.010192,0.01808,317.015599,38.087562,0.428092,0.314162,0.038483,17.850021,0.05185
4,0,10.44796,0.010192,15.983891,0.996049,16.71569,0.477823,0.464037,0.038483,17.850021,0.05185
5995,119,765.876721,51.944717,0.053836,0.258969,0.125856,0.000218,0.000414,2286.523413,0.035407,0.593081
5996,119,0.071211,0.19568,0.924208,284.265495,15.66877,3.095123,1.817391,0.600827,17.850021,6.783967
5997,119,0.001922,0.073078,7.829744,16.138304,0.000376,0.74163,1.495371,0.600827,17.850021,0.05185
5998,119,5.4e-05,0.027291,0.672029,0.002134,0.149019,3.454681,3.262468,0.600827,17.850021,6.783967
5999,119,1.026527,0.073078,0.116152,2.923321,610.091923,1.031282,0.833011,0.038483,2.931083,2.005852


In [5]:
# Preview the expected submission layout (one row per Group with four ratio columns).
submit_example.head()

Unnamed: 0,Group,Excellent ratio,Good ratio,Pass ratio,Fail ratio
0,0,0.4,0.2,0.2,0.2
1,1,0.4,0.2,0.2,0.2
2,2,0.4,0.2,0.2,0.2
3,3,0.4,0.2,0.2,0.2
4,4,0.4,0.2,0.2,0.2


In [6]:
# List the distinct target classes present in the training data.
train.Quality_label.unique()

array(['Pass', 'Fail', 'Good', 'Excellent'], dtype=object)

In [7]:
# All 20 training columns: Parameter1..Parameter10 followed by Attribute1..Attribute10.
features = (['Parameter%d' % i for i in range(1, 11)]
            + ['Attribute%d' % i for i in range(1, 11)])
# The ten Parameter columns in the (lexicographic) order used by the test file.
test_features = sorted(features[:10])

### 特征类别观察

In [8]:
# Print the number of distinct (non-null) values of every parameter column to
# see which ones behave like categorical features.
for feature in test_features:
    print(feature, train[feature].nunique())

Parameter1 6000
Parameter10 41
Parameter2 6000
Parameter3 6000
Parameter4 6000
Parameter5 132
Parameter6 80
Parameter7 14
Parameter8 23
Parameter9 16


In [9]:
def label_map(x):
    """Translate a quality label string into its integer class id.

    'Excellent' -> 1, 'Good' -> 2, 'Pass' -> 3, 'Fail' -> 4.
    Raises KeyError for any other value.
    """
    names = ('Excellent', 'Good', 'Pass', 'Fail')
    return dict(zip(names, range(1, 5)))[x]

# Store the integer class id next to the textual quality label.
train['label'] = train['Quality_label'].map(label_map)
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label,label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass,3
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail,4
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail,4
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail,4
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail,4


In [None]:
# Build a profiling report of the training frame and write it to train_EDA.html.
pfr = ppf.ProfileReport(train)
pfr.to_file('train_EDA.html')

### 异常点观察

In [None]:
# Scatter of Parameter1 per quality label, used to look for outliers.
# Original note: drop rows with train.Parameter1 > 12.
# NOTE(review): the filter cell that follows keeps Parameter1 <= 200000, so the
# figure in the note does not match the code (12 is close to ln(200000); the
# note may refer to a log scale) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter1'])

In [None]:
# Keep only rows with Parameter1 <= 200000 (outlier cut-off, presumably read
# off the scatter plot above - confirm).
train = train.loc[train['Parameter1'] <= 200000]

In [None]:
# Scatter of Parameter2 per quality label, used to look for outliers.
# Original note: drop rows with train.Parameter2 > 14.
# NOTE(review): the filter cell that follows keeps Parameter2 <= 1000000, so the
# figure in the note does not match the code (14 is close to ln(1000000); the
# note may refer to a log scale) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter2'])

In [None]:
# Keep only rows with Parameter2 <= 1000000 (outlier cut-off, presumably read
# off the scatter plot above - confirm).
train = train.loc[train['Parameter2'] <= 1000000]

In [None]:
# Scatter of Parameter3 per quality label, used to look for outliers.
# Original note: drop rows with train.Parameter3 > 12.
# NOTE(review): the filter cell that follows keeps Parameter3 <= 200000, so the
# figure in the note does not match the code (12 is close to ln(200000); the
# note may refer to a log scale) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter3'])

In [None]:
# Keep only rows with Parameter3 <= 200000 (outlier cut-off, presumably read
# off the scatter plot above - confirm).
train = train.loc[train['Parameter3'] <= 200000]

In [None]:
# Scatter of Parameter4 per quality label, used to look for outliers.
plt.scatter(x=train['Quality_label'], y=train['Parameter4'])

In [None]:
# Keep only rows with Parameter4 <= 80000 (outlier cut-off, presumably read
# off the scatter plot above - confirm).
train = train.loc[train['Parameter4'] <= 80000]

In [None]:
# Scatter of Parameter5 per quality label, used to look for outliers.
# Original note: drop rows with train.Parameter5 > 4.
# NOTE(review): the filter cell that follows keeps Parameter5 <= 60, so the
# figure in the note does not match the code (4 is close to ln(60); the note
# may refer to a log scale) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter5'])

In [None]:
# Keep only rows with Parameter5 <= 60 (outlier cut-off, presumably read off
# the scatter plot above - confirm).
train = train.loc[train['Parameter5'] <= 60]

In [None]:
# Scatter of Parameter6 per quality label, used to look for outliers.
plt.scatter(x=train['Quality_label'], y=train['Parameter6'])

In [None]:
# Scatter of Parameter7 per quality label, used to look for outliers.
plt.scatter(x=train['Quality_label'], y=train['Parameter7'])

In [None]:
# Keep only rows with Parameter7 <= 30000 (outlier cut-off, presumably read
# off the scatter plot above - confirm).
train = train.loc[train['Parameter7'] <= 30000]

In [None]:
# Scatter of Parameter8 per quality label, used to look for outliers.
# Original note: drop rows with train.Parameter8 > 8.
# NOTE(review): the filter cell that follows keeps Parameter8 <= 5000, so the
# figure in the note does not match the code (possibly a log-scale value) -
# confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter8'])

In [None]:
# Keep only rows with Parameter8 <= 5000 (outlier cut-off, presumably read off
# the scatter plot above - confirm).
train = train.loc[train['Parameter8'] <= 5000]

In [None]:
# Scatter of Parameter9 per quality label, used to look for outliers.
# Original note: drop rows with train.Parameter9 > 12.
# NOTE(review): the filter cell that follows keeps Parameter9 <= 1e8, so the
# figure in the note does not match the code - confirm which is intended.
plt.scatter(x=train['Quality_label'], y=train['Parameter9'])

In [None]:
# Keep only rows with Parameter9 <= 1e8 (outlier cut-off, presumably read off
# the scatter plot above - confirm).
train = train.loc[train['Parameter9'] <= 1e8]

In [None]:
# Scatter of Parameter10 per quality label, used to look for outliers.
plt.scatter(x=train['Quality_label'], y=train['Parameter10'])

In [None]:
# Row/column count of the training frame after the outlier filters above.
train.shape

In [None]:
# Pairwise correlation of the parameter columns that also exist in the test
# set, with the colour scale clipped to [-0.8, 0.8].
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    train[test_features].corr(),
    vmin=-0.8,
    vmax=0.8,
    linewidths=0.2,
    annot=True,
    ax=ax,
)

### 构建数据组合

In [10]:
# Parameter columns kept for modelling (Parameter5 is left out).
selected_features = ['Parameter%d' % i for i in (1, 2, 3, 4, 6, 7, 8, 9, 10)]
# Parameter6..Parameter10: the columns that get label / target encoded below.
features = selected_features[4:]
# Stack train and test rows (train first) under a fresh 0..n-1 index so both
# parts are encoded consistently.
full = pd.concat(
    [frame[selected_features] for frame in (train, test)],
    ignore_index=True,
)

#### Label Encoding

In [11]:
# Add an integer-coded "<feature>_le" column for each selected feature; a fresh
# encoder is fitted per column on the combined train+test values.
for feature in features:
    le_feature = feature + '_le'
    encoder = LabelEncoder()
    full[le_feature] = encoder.fit_transform(full[feature])

In [12]:
# Preview the combined frame with the new "_le" columns.
full.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le
0,0.00166,0.591013,147.608373,38.186345,0.000612,2286.523413,0.035407,0.593081,1.010385,24,12,1,5,24
1,1.601749,0.015052,0.035864,51.130326,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24
2,0.098039,69.233685,0.08092,0.112265,0.001972,2286.523413,0.035407,0.593081,1.010385,30,12,1,5,24
3,18.18186,0.047325,0.018061,1.098102,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24
4,0.012085,0.008749,0.005509,524.327396,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24


In [13]:
# Split the combined frame back into its train / test parts by position.
# .copy() gives each part its own data, so the columns added later are not
# written into a slice of `full` (SettingWithCopyWarning, hidden by the global
# warnings filter).
n_train = train.shape[0]
trainX = full.iloc[:n_train].copy()
# Assign labels by position, not by index: `full` was built with
# ignore_index=True, whereas `train` keeps its original index once rows have
# been filtered out, so an index-aligned assignment would misplace labels or
# leave NaN.
trainX['label'] = train['label'].values
testX = full.iloc[n_train:].copy()

print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

shape of trainX: (6000, 15)
shape of testX: (6000, 14)


#### Target Encoding

In [18]:
def parameter_ctr(train, test, col, label):
    """Add a per-category label-rate ("ctr") feature for one class.

    For every category of ``col + '_le'`` the share of ``train`` rows whose
    ``'label'`` equals ``label`` is computed and written to a new column
    ``col + '_ctr_' + str(label)`` on both frames.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``col + '_le'`` and ``'label'``; modified in place.
    test : pandas.DataFrame
        Must contain ``col + '_le'``; modified in place.
    col : str
        Base name of the label-encoded categorical feature.
    label : object
        Class value whose rate is encoded.

    Returns
    -------
    tuple
        ``(train, test)`` - the same two frames, each with the new column.

    Notes
    -----
    Categories that occur only in ``test`` have no training statistics and
    receive 0.0. A missing column now raises ``KeyError`` instead of being
    silently turned into an all-zero feature.

    NOTE(review): the rates are computed from the very rows they are written
    back to, so each training row's feature includes its own label (target
    leakage); consider out-of-fold encoding.
    """
    le_col = col + '_le'
    new_col = col + '_ctr_' + str(label)

    # Share of rows carrying `label` within each category of the train set.
    rate = (train['label'] == label).groupby(train[le_col]).mean()

    train[new_col] = train[le_col].map(rate)
    # Categories unseen in train map to NaN; encode them as 0.0.
    test[new_col] = test[le_col].map(rate).fillna(0.0)
    return train, test

# Build one ctr column per (feature, class) pair: 5 features x 4 classes.
features = ['Parameter6', 'Parameter7', 'Parameter8',
            'Parameter9', 'Parameter10']
labels = [1, 2, 3, 4]
for feature, label in [(f, l) for f in features for l in labels]:
    print(feature, label)
    trainX, testX = parameter_ctr(trainX, testX, feature, label)

Parameter6 1
Parameter6 2
Parameter6 3
Parameter6 4
Parameter7 1
Parameter7 2
Parameter7 3
Parameter7 4
Parameter8 1
Parameter8 2
Parameter8 3
Parameter8 4
Parameter9 1
Parameter9 2
Parameter9 3
Parameter9 4
Parameter10 1
Parameter10 2
Parameter10 3
Parameter10 4


In [21]:
# Shapes after the ctr columns (5 features x 4 labels) have been added.
print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

shape of trainX: (6000, 35)
shape of testX: (6000, 34)


In [22]:
# Preview the training features including the new ctr columns.
trainX.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le,label,Parameter6_ctr_1,Parameter6_ctr_2,Parameter6_ctr_3,Parameter6_ctr_4,Parameter7_ctr_1,Parameter7_ctr_2,Parameter7_ctr_3,Parameter7_ctr_4,Parameter8_ctr_1,Parameter8_ctr_2,Parameter8_ctr_3,Parameter8_ctr_4,Parameter9_ctr_1,Parameter9_ctr_2,Parameter9_ctr_3,Parameter9_ctr_4,Parameter10_ctr_1,Parameter10_ctr_2,Parameter10_ctr_3,Parameter10_ctr_4
0,0.00166,0.591013,147.608373,38.186345,0.000612,2286.523413,0.035407,0.593081,1.010385,24,12,1,5,24,3,0.0,0.222222,0.555556,0.222222,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
1,1.601749,0.015052,0.035864,51.130326,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24,4,0.141509,0.386792,0.216981,0.254717,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
2,0.098039,69.233685,0.08092,0.112265,0.001972,2286.523413,0.035407,0.593081,1.010385,30,12,1,5,24,4,0.163636,0.345455,0.236364,0.254545,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
3,18.18186,0.047325,0.018061,1.098102,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24,4,0.141509,0.386792,0.216981,0.254717,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
4,0.012085,0.008749,0.005509,524.327396,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24,4,0.141509,0.386792,0.216981,0.254717,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895


In [24]:
# Remove the raw Parameter1-4 columns and the intermediate "_le" codes from
# both frames (in place); only Parameter6-10 and the ctr columns remain.
drop_cols = ['Parameter1', 'Parameter2', 'Parameter3', 'Parameter4',
             'Parameter6_le', 'Parameter7_le', 'Parameter8_le',
             'Parameter9_le', 'Parameter10_le']
for frame in (trainX, testX):
    frame.drop(columns=drop_cols, inplace=True)

#### Frequency Encoding

In [None]:
# Frequency encoding for Parameter6: each value is replaced by the share of
# rows in `full` that carry the same value.
full['Parameter6_fre'] = LabelEncoder().fit_transform(full.Parameter6)

# Build the code -> relative-frequency table once. It used to be rebuilt inside
# the function for every row, which made the cell quadratic in the row count.
parameter6_ct = full.Parameter6_fre.value_counts() / full.shape[0]

def parameter6_fre(x):
    """Return the relative frequency of the label-encoded Parameter6 code `x`."""
    return parameter6_ct[x]

full['Parameter6_fre'] = full.Parameter6_fre.map(parameter6_ct)

In [None]:
# Frequency encoding for Parameter7: each value is replaced by the share of
# rows in `full` that carry the same value.
full['Parameter7_fre'] = LabelEncoder().fit_transform(full.Parameter7)

# Build the code -> relative-frequency table once. It used to be rebuilt inside
# the function for every row, which made the cell quadratic in the row count.
parameter7_ct = full.Parameter7_fre.value_counts() / full.shape[0]

def parameter7_fre(x):
    """Return the relative frequency of the label-encoded Parameter7 code `x`."""
    return parameter7_ct[x]

full['Parameter7_fre'] = full.Parameter7_fre.map(parameter7_ct)

In [None]:
# Frequency encoding for Parameter8: each value is replaced by the share of
# rows in `full` that carry the same value.
full['Parameter8_fre'] = LabelEncoder().fit_transform(full.Parameter8)

# Build the code -> relative-frequency table once. It used to be rebuilt inside
# the function for every row, which made the cell quadratic in the row count.
parameter8_ct = full.Parameter8_fre.value_counts() / full.shape[0]

def parameter8_fre(x):
    """Return the relative frequency of the label-encoded Parameter8 code `x`."""
    return parameter8_ct[x]

full['Parameter8_fre'] = full.Parameter8_fre.map(parameter8_ct)

In [None]:
# Frequency encoding for Parameter9: each value is replaced by the share of
# rows in `full` that carry the same value.
full['Parameter9_fre'] = LabelEncoder().fit_transform(full.Parameter9)

# Build the code -> relative-frequency table once. It used to be rebuilt inside
# the function for every row, which made the cell quadratic in the row count.
parameter9_ct = full.Parameter9_fre.value_counts() / full.shape[0]

def parameter9_fre(x):
    """Return the relative frequency of the label-encoded Parameter9 code `x`."""
    return parameter9_ct[x]

full['Parameter9_fre'] = full.Parameter9_fre.map(parameter9_ct)

In [None]:
# Frequency encoding for Parameter10: each value is replaced by the share of
# rows in `full` that carry the same value.
full['Parameter10_fre'] = LabelEncoder().fit_transform(full.Parameter10)

# Build the code -> relative-frequency table once. It used to be rebuilt inside
# the function for every row, which made the cell quadratic in the row count.
parameter10_ct = full.Parameter10_fre.value_counts() / full.shape[0]

def parameter10_fre(x):
    """Return the relative frequency of the label-encoded Parameter10 code `x`."""
    return parameter10_ct[x]

full['Parameter10_fre'] = full.Parameter10_fre.map(parameter10_ct)

In [None]:
# Work on a copy so the frequency-encoded frame `full` itself stays untouched.
full2 = full.copy()

In [None]:
# Preview the copied frame.
full2.head()

#### 取log

In [23]:
# Replace Parameter6..Parameter10 by their natural logarithm in both frames.
# np.log yields -inf for 0 and NaN for negative inputs.
features = ['Parameter6', 'Parameter7', 'Parameter8',
            'Parameter9', 'Parameter10']
for frame in (trainX, testX):
    for feature in features:
        frame[feature] = np.log(frame[feature])

In [None]:
# Preview `full2` (note: the log transform above is applied to trainX/testX, not to full2).
full2.head()

#### 数据分割

In [25]:
# Separate the target from the feature matrix: pop() removes the 'label'
# column from trainX in place and returns it.
trainY = trainX.pop('label')

print('shape of trainX:', trainX.shape)
print('shape of trainY:', trainY.shape)
print('shape of testX:', testX.shape)

shape of trainX: (6000, 25)
shape of trainY: (6000,)
shape of testX: (6000, 25)


In [None]:
# Alternative feature set: split the frequency-encoded frame `full2` by
# position (train rows come first) and take the target straight from `train`.
n_train = train.shape[0]
trainX = full2.iloc[:n_train]
trainY = train['label']
testX = full2.iloc[n_train:]

print('shape of trainX:', trainX.shape)
print('shape of trainY:', trainY.shape)
print('shape of testX:', testX.shape)

#### 划分训练集和验证集

In [26]:
# Hold out 20% of the labelled rows for validation (X_test / y_test are a
# validation split here, not the competition test set).
X_train, X_test, y_train, y_test = train_test_split(
    trainX,
    trainY,
    test_size=0.2,
    random_state=42,
)
X_train.shape

(4800, 25)

In [27]:
# Fit CatBoost on the training split and predict the held-out validation rows.
model = CatBoostClassifier(
    iterations=2000,
    depth=8,
    learning_rate=0.01,
    verbose=100,  # log every 100th iteration
    loss_function='MultiClass',
    random_state=42,
)
model.fit(X_train, y_train)

predY = model.predict(X_test)

0:	learn: 1.3811603	total: 189ms	remaining: 6m 17s
100:	learn: 1.1526625	total: 6.75s	remaining: 2m 6s
200:	learn: 1.0923219	total: 13.2s	remaining: 1m 57s
300:	learn: 1.0676606	total: 19.3s	remaining: 1m 48s
400:	learn: 1.0530586	total: 25.4s	remaining: 1m 41s
500:	learn: 1.0424796	total: 31.6s	remaining: 1m 34s
600:	learn: 1.0336861	total: 37.6s	remaining: 1m 27s
700:	learn: 1.0264888	total: 43.7s	remaining: 1m 20s
800:	learn: 1.0201364	total: 49.9s	remaining: 1m 14s
900:	learn: 1.0147765	total: 55.8s	remaining: 1m 8s
1000:	learn: 1.0098751	total: 1m 1s	remaining: 1m 1s
1100:	learn: 1.0029885	total: 1m 7s	remaining: 55s
1200:	learn: 0.9972456	total: 1m 13s	remaining: 49s
1300:	learn: 0.9917597	total: 1m 20s	remaining: 43.1s
1400:	learn: 0.9868271	total: 1m 26s	remaining: 37.2s
1500:	learn: 0.9819779	total: 1m 33s	remaining: 31.1s
1600:	learn: 0.9771667	total: 1m 40s	remaining: 25s
1700:	learn: 0.9728254	total: 1m 46s	remaining: 18.8s
1800:	learn: 0.9692523	total: 1m 53s	remaining: 12

In [None]:
# Grid search over tree depth and L2 regularisation with 5-fold CV on the full
# labelled set: 5 x 6 = 30 candidates, i.e. 150 fits of 2000 iterations each.
model = CatBoostClassifier(
    iterations=2000,
    depth=8,
    learning_rate=0.01,
    verbose=100,
    loss_function='MultiClass',
    random_state=42,
)

param = {
    'depth': [6, 8, 10, 12, 14],
    'l2_leaf_reg': [0.01, 0.1, 1, 5, 10, 20],
}
cv = GridSearchCV(model, param_grid=param, verbose=1, cv=5, scoring='accuracy')
cv.fit(trainX, trainY)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 1.3808613	total: 54.8ms	remaining: 1m 49s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


100:	learn: 1.1475246	total: 2.5s	remaining: 47s
200:	learn: 1.0879528	total: 4.82s	remaining: 43.1s
300:	learn: 1.0613635	total: 7.13s	remaining: 40.2s
400:	learn: 1.0443432	total: 9.39s	remaining: 37.4s
500:	learn: 1.0313075	total: 11.7s	remaining: 35.1s
600:	learn: 1.0213707	total: 14.1s	remaining: 32.8s
700:	learn: 1.0132601	total: 16.5s	remaining: 30.6s
800:	learn: 1.0058946	total: 18.8s	remaining: 28.1s
900:	learn: 0.9990488	total: 21s	remaining: 25.6s
1000:	learn: 0.9931558	total: 23.3s	remaining: 23.2s
1100:	learn: 0.9855714	total: 25.5s	remaining: 20.9s
1200:	learn: 0.9777662	total: 27.9s	remaining: 18.6s
1300:	learn: 0.9714455	total: 30.2s	remaining: 16.2s
1400:	learn: 0.9649116	total: 32.5s	remaining: 13.9s
1500:	learn: 0.9594167	total: 34.7s	remaining: 11.5s
1600:	learn: 0.9543895	total: 36.9s	remaining: 9.2s
1700:	learn: 0.9498524	total: 39.2s	remaining: 6.88s
1800:	learn: 0.9455463	total: 41.4s	remaining: 4.58s
1900:	learn: 0.9414242	total: 43.8s	remaining: 2.28s
1999:	le

1200:	learn: 0.9930174	total: 27.4s	remaining: 18.2s
1300:	learn: 0.9856585	total: 29.7s	remaining: 15.9s
1400:	learn: 0.9800436	total: 31.9s	remaining: 13.6s
1500:	learn: 0.9747940	total: 34.2s	remaining: 11.4s
1600:	learn: 0.9701764	total: 36.6s	remaining: 9.12s
1700:	learn: 0.9654687	total: 38.9s	remaining: 6.84s
1800:	learn: 0.9612305	total: 41.2s	remaining: 4.56s
1900:	learn: 0.9568769	total: 43.6s	remaining: 2.27s
1999:	learn: 0.9527597	total: 46s	remaining: 0us
0:	learn: 1.3815103	total: 47.6ms	remaining: 1m 35s
100:	learn: 1.1598185	total: 2.47s	remaining: 46.4s
200:	learn: 1.1011434	total: 4.83s	remaining: 43.2s
300:	learn: 1.0759592	total: 7.14s	remaining: 40.3s
400:	learn: 1.0595892	total: 9.48s	remaining: 37.8s
500:	learn: 1.0480176	total: 11.7s	remaining: 35.1s
600:	learn: 1.0375728	total: 14s	remaining: 32.5s
700:	learn: 1.0289739	total: 16.3s	remaining: 30.3s
800:	learn: 1.0216896	total: 18.6s	remaining: 27.8s
900:	learn: 1.0158742	total: 20.8s	remaining: 25.4s
1000:	lea

200:	learn: 1.1026616	total: 4.61s	remaining: 41.3s
300:	learn: 1.0791080	total: 6.83s	remaining: 38.5s
400:	learn: 1.0648529	total: 9.06s	remaining: 36.1s
500:	learn: 1.0555213	total: 11.2s	remaining: 33.6s
600:	learn: 1.0474303	total: 13.4s	remaining: 31.3s
700:	learn: 1.0407818	total: 15.7s	remaining: 29s
800:	learn: 1.0346172	total: 17.9s	remaining: 26.8s
900:	learn: 1.0291598	total: 20s	remaining: 24.4s
1000:	learn: 1.0236355	total: 22.3s	remaining: 22.2s
1100:	learn: 1.0180457	total: 24.5s	remaining: 20s
1200:	learn: 1.0122504	total: 26.8s	remaining: 17.8s
1300:	learn: 1.0069277	total: 29s	remaining: 15.6s
1400:	learn: 1.0022299	total: 31.3s	remaining: 13.4s
1500:	learn: 0.9983790	total: 33.5s	remaining: 11.2s
1600:	learn: 0.9943297	total: 35.7s	remaining: 8.91s
1700:	learn: 0.9906344	total: 38s	remaining: 6.68s
1800:	learn: 0.9866620	total: 40.3s	remaining: 4.45s
1900:	learn: 0.9834786	total: 42.5s	remaining: 2.21s
1999:	learn: 0.9800663	total: 44.8s	remaining: 0us
0:	learn: 1.3

1300:	learn: 1.0278702	total: 30.4s	remaining: 16.3s
1400:	learn: 1.0239284	total: 32.6s	remaining: 14s
1500:	learn: 1.0199842	total: 34.9s	remaining: 11.6s
1600:	learn: 1.0169044	total: 37.1s	remaining: 9.25s
1700:	learn: 1.0138692	total: 39.3s	remaining: 6.91s
1800:	learn: 1.0106600	total: 41.5s	remaining: 4.59s
1900:	learn: 1.0078902	total: 43.8s	remaining: 2.28s
1999:	learn: 1.0051645	total: 46.1s	remaining: 0us
0:	learn: 1.3820611	total: 45ms	remaining: 1m 29s
100:	learn: 1.1747930	total: 2.42s	remaining: 45.6s
200:	learn: 1.1200023	total: 4.64s	remaining: 41.5s
300:	learn: 1.0966933	total: 6.84s	remaining: 38.6s
400:	learn: 1.0829298	total: 9.02s	remaining: 36s
500:	learn: 1.0737843	total: 11.2s	remaining: 33.5s
600:	learn: 1.0669404	total: 13.3s	remaining: 31s
700:	learn: 1.0605726	total: 15.5s	remaining: 28.8s
800:	learn: 1.0551299	total: 17.7s	remaining: 26.5s
900:	learn: 1.0501471	total: 19.9s	remaining: 24.2s
1000:	learn: 1.0459434	total: 22s	remaining: 22s
1100:	learn: 1.04

300:	learn: 1.0366932	total: 19.1s	remaining: 1m 47s
400:	learn: 1.0169793	total: 25.2s	remaining: 1m 40s
500:	learn: 1.0026858	total: 31.3s	remaining: 1m 33s
600:	learn: 0.9908983	total: 37.3s	remaining: 1m 26s
700:	learn: 0.9813076	total: 43.3s	remaining: 1m 20s
800:	learn: 0.9726029	total: 49.2s	remaining: 1m 13s
900:	learn: 0.9660475	total: 55.1s	remaining: 1m 7s
1000:	learn: 0.9597826	total: 1m	remaining: 1m
1100:	learn: 0.9523615	total: 1m 6s	remaining: 54.2s
1200:	learn: 0.9459637	total: 1m 12s	remaining: 48.2s
1300:	learn: 0.9384653	total: 1m 19s	remaining: 42.5s
1400:	learn: 0.9312280	total: 1m 25s	remaining: 36.5s
1500:	learn: 0.9257711	total: 1m 32s	remaining: 30.7s
1600:	learn: 0.9196847	total: 1m 38s	remaining: 24.6s
1700:	learn: 0.9142000	total: 1m 45s	remaining: 18.5s
1800:	learn: 0.9088122	total: 1m 51s	remaining: 12.4s
1900:	learn: 0.9045925	total: 1m 58s	remaining: 6.18s
1999:	learn: 0.9010397	total: 2m 5s	remaining: 0us
0:	learn: 1.3810433	total: 102ms	remaining: 3m 

1200:	learn: 0.9643445	total: 1m 13s	remaining: 49s
1300:	learn: 0.9570078	total: 1m 20s	remaining: 43.1s
1400:	learn: 0.9516929	total: 1m 26s	remaining: 36.9s
1500:	learn: 0.9459493	total: 1m 33s	remaining: 31s
1600:	learn: 0.9401688	total: 1m 39s	remaining: 24.9s
1700:	learn: 0.9340950	total: 1m 46s	remaining: 18.7s
1800:	learn: 0.9288097	total: 1m 53s	remaining: 12.5s
1900:	learn: 0.9244215	total: 2m	remaining: 6.25s
1999:	learn: 0.9206645	total: 2m 6s	remaining: 0us
0:	learn: 1.3813066	total: 95.2ms	remaining: 3m 10s
100:	learn: 1.1447797	total: 6.71s	remaining: 2m 6s
200:	learn: 1.0819228	total: 13.2s	remaining: 1m 57s
300:	learn: 1.0541617	total: 19.6s	remaining: 1m 50s
400:	learn: 1.0361831	total: 26s	remaining: 1m 43s
500:	learn: 1.0218055	total: 32.4s	remaining: 1m 36s
600:	learn: 1.0090580	total: 38.7s	remaining: 1m 30s
700:	learn: 1.0006420	total: 44.4s	remaining: 1m 22s
800:	learn: 0.9933211	total: 50.2s	remaining: 1m 15s
900:	learn: 0.9874047	total: 55.8s	remaining: 1m 8s


1999:	learn: 0.9439053	total: 2m 10s	remaining: 0us
0:	learn: 1.3810168	total: 111ms	remaining: 3m 41s
100:	learn: 1.1498362	total: 6.91s	remaining: 2m 10s
200:	learn: 1.0891164	total: 13.5s	remaining: 2m
300:	learn: 1.0631207	total: 20s	remaining: 1m 52s
400:	learn: 1.0480929	total: 26.2s	remaining: 1m 44s
500:	learn: 1.0371274	total: 32.3s	remaining: 1m 36s
600:	learn: 1.0277727	total: 38.6s	remaining: 1m 29s
700:	learn: 1.0196583	total: 44.5s	remaining: 1m 22s
800:	learn: 1.0136634	total: 50.3s	remaining: 1m 15s
900:	learn: 1.0079276	total: 56.3s	remaining: 1m 8s
1000:	learn: 1.0030538	total: 1m 2s	remaining: 1m 1s
1100:	learn: 0.9973937	total: 1m 8s	remaining: 55.8s
1200:	learn: 0.9912363	total: 1m 14s	remaining: 49.7s
1300:	learn: 0.9849106	total: 1m 21s	remaining: 43.7s
1400:	learn: 0.9794733	total: 1m 28s	remaining: 37.7s
1500:	learn: 0.9745008	total: 1m 34s	remaining: 31.5s
1600:	learn: 0.9702938	total: 1m 41s	remaining: 25.3s
1700:	learn: 0.9663537	total: 1m 48s	remaining: 19s

800:	learn: 1.0347998	total: 54s	remaining: 1m 20s
900:	learn: 1.0301080	total: 1m	remaining: 1m 13s
1000:	learn: 1.0255220	total: 1m 6s	remaining: 1m 6s
1100:	learn: 1.0202818	total: 1m 13s	remaining: 1m
1200:	learn: 1.0157811	total: 1m 20s	remaining: 53.7s
1300:	learn: 1.0110639	total: 1m 27s	remaining: 47.1s
1400:	learn: 1.0062935	total: 1m 34s	remaining: 40.5s
1500:	learn: 1.0022284	total: 1m 41s	remaining: 33.9s
1600:	learn: 0.9981518	total: 1m 49s	remaining: 27.2s
1700:	learn: 0.9944060	total: 1m 56s	remaining: 20.4s
1800:	learn: 0.9913311	total: 2m 3s	remaining: 13.6s
1900:	learn: 0.9879538	total: 2m 10s	remaining: 6.8s
1999:	learn: 0.9848753	total: 2m 17s	remaining: 0us
0:	learn: 1.3817732	total: 97.9ms	remaining: 3m 15s
100:	learn: 1.1695265	total: 7.04s	remaining: 2m 12s
200:	learn: 1.1095832	total: 13.9s	remaining: 2m 4s
300:	learn: 1.0841988	total: 20.8s	remaining: 1m 57s
400:	learn: 1.0694153	total: 27.3s	remaining: 1m 48s
500:	learn: 1.0586779	total: 33.9s	remaining: 1m 4

1600:	learn: 1.0183308	total: 1m 51s	remaining: 27.8s
1700:	learn: 1.0148176	total: 1m 58s	remaining: 20.9s
1800:	learn: 1.0121604	total: 2m 6s	remaining: 13.9s
1900:	learn: 1.0092652	total: 2m 13s	remaining: 6.95s
1999:	learn: 1.0062995	total: 2m 20s	remaining: 0us
0:	learn: 1.3800887	total: 285ms	remaining: 9m 29s
100:	learn: 1.1196366	total: 23.2s	remaining: 7m 15s
200:	learn: 1.0467654	total: 45.2s	remaining: 6m 44s
300:	learn: 1.0146031	total: 1m 4s	remaining: 6m 6s
400:	learn: 0.9927118	total: 1m 23s	remaining: 5m 33s
500:	learn: 0.9789926	total: 1m 42s	remaining: 5m 5s
600:	learn: 0.9646209	total: 2m 1s	remaining: 4m 43s
700:	learn: 0.9548644	total: 2m 19s	remaining: 4m 19s
800:	learn: 0.9466654	total: 2m 36s	remaining: 3m 54s
900:	learn: 0.9403134	total: 2m 53s	remaining: 3m 31s
1000:	learn: 0.9352305	total: 3m 6s	remaining: 3m 5s
1100:	learn: 0.9291466	total: 3m 21s	remaining: 2m 44s
1200:	learn: 0.9226480	total: 3m 39s	remaining: 2m 26s
1300:	learn: 0.9145208	total: 4m	remain

200:	learn: 1.0611308	total: 45.3s	remaining: 6m 45s
300:	learn: 1.0297832	total: 1m 6s	remaining: 6m 15s
400:	learn: 1.0111031	total: 1m 25s	remaining: 5m 42s
500:	learn: 0.9934318	total: 1m 47s	remaining: 5m 23s
600:	learn: 0.9813643	total: 2m 8s	remaining: 4m 58s
700:	learn: 0.9726267	total: 2m 25s	remaining: 4m 29s
800:	learn: 0.9647790	total: 2m 43s	remaining: 4m 5s
900:	learn: 0.9577213	total: 3m 2s	remaining: 3m 42s
1000:	learn: 0.9517717	total: 3m 19s	remaining: 3m 18s
1100:	learn: 0.9452245	total: 3m 35s	remaining: 2m 56s
1200:	learn: 0.9391343	total: 3m 54s	remaining: 2m 35s
1300:	learn: 0.9322362	total: 4m 14s	remaining: 2m 16s
1400:	learn: 0.9252911	total: 4m 36s	remaining: 1m 58s
1500:	learn: 0.9192324	total: 4m 56s	remaining: 1m 38s
1600:	learn: 0.9126414	total: 5m 20s	remaining: 1m 19s
1700:	learn: 0.9072627	total: 5m 43s	remaining: 1m
1800:	learn: 0.9029062	total: 6m 6s	remaining: 40.5s
1900:	learn: 0.8987440	total: 6m 29s	remaining: 20.3s
1999:	learn: 0.8939828	total: 

800:	learn: 0.9862285	total: 2m 46s	remaining: 4m 9s
900:	learn: 0.9806169	total: 3m 4s	remaining: 3m 44s
1000:	learn: 0.9759371	total: 3m 19s	remaining: 3m 19s
1100:	learn: 0.9700923	total: 3m 36s	remaining: 2m 57s
1200:	learn: 0.9639732	total: 3m 57s	remaining: 2m 38s
1300:	learn: 0.9572109	total: 4m 20s	remaining: 2m 20s
1400:	learn: 0.9514633	total: 4m 44s	remaining: 2m 1s
1500:	learn: 0.9456928	total: 5m 8s	remaining: 1m 42s
1600:	learn: 0.9401922	total: 5m 32s	remaining: 1m 22s
1700:	learn: 0.9353906	total: 5m 57s	remaining: 1m 2s
1800:	learn: 0.9316374	total: 6m 21s	remaining: 42.1s
1900:	learn: 0.9280391	total: 6m 45s	remaining: 21.1s
1999:	learn: 0.9244614	total: 7m 8s	remaining: 0us
0:	learn: 1.3809133	total: 277ms	remaining: 9m 14s
100:	learn: 1.1426531	total: 25s	remaining: 7m 50s
200:	learn: 1.0775734	total: 47.8s	remaining: 7m 7s
300:	learn: 1.0499921	total: 1m 9s	remaining: 6m 32s
400:	learn: 1.0338628	total: 1m 30s	remaining: 5m 59s
500:	learn: 1.0219407	total: 1m 49s	r

In [33]:
# Leftover interactive help lookup removed: the trailing-`?` form is
# IPython-only syntax and interrupts an unattended run. Call
# help(GridSearchCV) when the documentation is needed.

In [None]:
# Baseline LightGBM with default settings, scored on the validation split.
model = lgb.LGBMClassifier().fit(X_train, y_train)
predY = model.predict(X_test)

In [29]:
# Validation accuracy. Both sides are flattened before comparing: the previous
# (n, 1) reshape only matched predictions that are themselves (n, 1); against
# 1-D predictions of shape (n,) (e.g. from the LightGBM cell above) the
# comparison broadcasts to an n x n matrix and the result is wrong.
np.mean(np.asarray(y_test).ravel() == np.asarray(predY).ravel())

array([0.53416667])

### 模型构建

#### LogisticRegression

In [None]:
# Logistic regression with default settings, trained on all labelled rows;
# class predictions and class probabilities for the competition test set.
lr = LogisticRegression().fit(trainX, trainY)
predY = lr.predict(testX)
proba = lr.predict_proba(testX)

#### AdaBoostClassifier

In [None]:
# AdaBoost with default settings, trained on all labelled rows.
ada = AdaBoostClassifier().fit(trainX, trainY)
predY = ada.predict(testX)

#### xgboost

In [None]:
# XGBoost with default settings, trained on all labelled rows.
# Current xgboost releases require class labels 0..n_classes-1, so the 1..4
# labels are shifted down for fitting and the predictions shifted back up.
xgbc = xgb.XGBClassifier()
xgbc.fit(trainX, trainY - 1)
predY = xgbc.predict(testX) + 1

#### lightgbm

In [None]:
# LightGBM with default settings, trained on all labelled rows.
lgbc = lgb.LGBMClassifier().fit(trainX, trainY)
predY = lgbc.predict(testX)

#### catboost

In [None]:
# Final CatBoost model, trained on all labelled rows with the same settings as
# the validation run.
model = CatBoostClassifier(
    iterations=2000,
    depth=8,
    learning_rate=0.01,
    verbose=100,
    loss_function='MultiClass',
    random_state=42,
)
model.fit(trainX, trainY)
predY = model.predict(testX)

_______

In [None]:
# Class probabilities of `model` for every test row.
proba = model.predict_proba(testX)
proba

In [None]:
# Attach the predicted class and the four class probabilities to the test frame.
# Assumes predict_proba orders its columns by label 1..4, i.e.
# Excellent, Good, Pass, Fail - TODO confirm against model.classes_.
test['pred'] = predY
class_probs = model.predict_proba(testX)
for position, column in enumerate(['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']):
    test[column] = class_probs[:, position]
test.head(10)

In [None]:
# Average the class probabilities per Group and write the submission file.
# The probability columns are selected with a list: indexing a groupby with a
# bare tuple of names was removed in pandas 2.0.
result = test.groupby(['Group'], as_index=False)[
    ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
].mean()
result.columns = ['Group','Excellent ratio','Good ratio','Pass ratio','Fail ratio']
result.to_csv('submission14.csv', index=False)

In [None]:
# First and last five rows of the submission. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same stacked preview.
pd.concat([result.head(), result.tail()])

_________