In [28]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import pandas_profiling as ppf
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import os
import gc

# Notebook-wide settings: render up to 50 DataFrame columns and silence every
# warning (note that this also hides pandas / scikit-learn deprecation notices).
pd.options.display.max_columns = 50
warnings.simplefilter('ignore')

In [2]:
# Build the input paths with os.path.join so the notebook is not tied to
# Windows path separators (the original literals were '.\\data\\<file>').
DATA_DIR = os.path.join('.', 'data')

train = pd.read_csv(os.path.join(DATA_DIR, 'first_round_training_data.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'first_round_testing_data.csv'))
submit_example = pd.read_csv(os.path.join(DATA_DIR, 'submit_example.csv'))

# Sanity check: (rows, columns) of each loaded frame.
print('shape of train:', train.shape)
print('shape of test:', test.shape)
print('shape of submit:', submit_example.shape)

shape of train: (6000, 21)
shape of test: (6000, 11)
shape of submit: (120, 5)


In [3]:
# Preview the first five rows of the training set.
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail


In [4]:
# Show the first and last five test rows in one table. pd.concat is used
# because DataFrame.append is deprecated and no longer exists in pandas 2.x.
pd.concat([test.head(), test.tail()])

Unnamed: 0,Group,Parameter1,Parameter10,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9
0,0,0.235279,0.19568,0.351488,0.186814,0.086456,1.43406,1.230409,0.038483,2.931083,2.005852
1,0,0.043799,0.523976,32.590688,0.886285,29.70249,0.027434,0.016852,0.600827,0.118077,887.611257
2,0,21.466737,0.523976,11.847792,79499.554454,13544.605343,0.027434,0.016852,0.600827,0.118077,887.611257
3,0,0.703795,0.010192,0.01808,317.015599,38.087562,0.428092,0.314162,0.038483,17.850021,0.05185
4,0,10.44796,0.010192,15.983891,0.996049,16.71569,0.477823,0.464037,0.038483,17.850021,0.05185
5995,119,765.876721,51.944717,0.053836,0.258969,0.125856,0.000218,0.000414,2286.523413,0.035407,0.593081
5996,119,0.071211,0.19568,0.924208,284.265495,15.66877,3.095123,1.817391,0.600827,17.850021,6.783967
5997,119,0.001922,0.073078,7.829744,16.138304,0.000376,0.74163,1.495371,0.600827,17.850021,0.05185
5998,119,5.4e-05,0.027291,0.672029,0.002134,0.149019,3.454681,3.262468,0.600827,17.850021,6.783967
5999,119,1.026527,0.073078,0.116152,2.923321,610.091923,1.031282,0.833011,0.038483,2.931083,2.005852


In [5]:
# Preview the expected submission layout: one row per Group with four ratio columns.
submit_example.head()

Unnamed: 0,Group,Excellent ratio,Good ratio,Pass ratio,Fail ratio
0,0,0.4,0.2,0.2,0.2
1,1,0.4,0.2,0.2,0.2
2,2,0.4,0.2,0.2,0.2
3,3,0.4,0.2,0.2,0.2
4,4,0.4,0.2,0.2,0.2


In [6]:
# List the distinct target classes present in the training data.
train.Quality_label.unique()

array(['Pass', 'Fail', 'Good', 'Excellent'], dtype=object)

In [7]:
# All 20 training columns: Parameter1..Parameter10 followed by Attribute1..Attribute10.
features = ['{}{}'.format(prefix, number)
            for prefix in ('Parameter', 'Attribute')
            for number in range(1, 11)]
# The ten Parameter columns in the order the test file lists them (plain
# string sort, so 'Parameter10' comes directly after 'Parameter1').
test_features = sorted('Parameter{}'.format(number) for number in range(1, 11))

### 特征类别观察

In [8]:
# Print the number of distinct (non-missing) values of each test-side
# parameter in the training data, to tell continuous from categorical columns.
for feature in test_features:
    distinct_count = train[feature].nunique()
    print(feature, distinct_count)

Parameter1 6000
Parameter10 41
Parameter2 6000
Parameter3 6000
Parameter4 6000
Parameter5 132
Parameter6 80
Parameter7 14
Parameter8 23
Parameter9 16


In [9]:
def label_map(x):
    """Translate a quality label into its integer code.

    'Excellent' -> 1, 'Good' -> 2, 'Pass' -> 3, 'Fail' -> 4.
    Raises KeyError for any other value.
    """
    ordered_names = ('Excellent', 'Good', 'Pass', 'Fail')
    codes = dict(zip(ordered_names, range(1, 5)))
    return codes[x]

# Add the integer-coded target next to the original string label.
train['label'] = train['Quality_label'].map(label_map)
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label,label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass,3
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail,4
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail,4
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail,4
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail,4


In [None]:
# Build a pandas-profiling report for the training frame and save it as HTML.
pfr = ppf.ProfileReport(train)
pfr.to_file('train_EDA.html')

### 异常点观察

In [None]:
# Original note: drop the rows where train.Parameter1 > 12.
# NOTE(review): the filter applied afterwards uses 200000, so the 12 is
# presumably on a natural-log scale (ln 200000 is about 12.2) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter1'])

In [None]:
# Keep only the rows whose Parameter1 is at most 200000.
# The row index is not reset, so train.index has gaps after this filter.
train = train.loc[train['Parameter1'] <= 200000]

In [None]:
# Original note: drop the rows where train.Parameter2 > 14.
# NOTE(review): the filter applied afterwards uses 1000000, so the 14 is
# presumably on a natural-log scale (ln 1000000 is about 13.8) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter2'])

In [None]:
# Keep only the rows whose Parameter2 is at most 1000000 (index is not reset).
train = train.loc[train['Parameter2'] <= 1000000]

In [None]:
# Original note: drop the rows where train.Parameter3 > 12.
# NOTE(review): the filter applied afterwards uses 200000, so the 12 is
# presumably on a natural-log scale (ln 200000 is about 12.2) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter3'])

In [None]:
# Keep only the rows whose Parameter3 is at most 200000 (index is not reset).
train = train.loc[train['Parameter3'] <= 200000]

In [None]:
# Parameter4 against the quality label, to eyeball extreme values.
plt.scatter(x=train['Quality_label'], y=train['Parameter4'])

In [None]:
# Keep only the rows whose Parameter4 is at most 80000 (index is not reset).
train = train.loc[train['Parameter4'] <= 80000]

In [None]:
# Original note: drop the rows where train.Parameter5 > 4.
# NOTE(review): the filter applied afterwards uses 60, so the 4 is
# presumably on a natural-log scale (ln 60 is about 4.1) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter5'])

In [None]:
# Keep only the rows whose Parameter5 is at most 60 (index is not reset).
train = train.loc[train['Parameter5'] <= 60]

In [None]:
# Parameter6 against the quality label, to eyeball extreme values.
plt.scatter(x=train['Quality_label'], y=train['Parameter6'])

In [None]:
# Parameter7 against the quality label, to eyeball extreme values.
plt.scatter(x=train['Quality_label'], y=train['Parameter7'])

In [None]:
# Keep only the rows whose Parameter7 is at most 30000 (index is not reset).
train = train.loc[train['Parameter7'] <= 30000]

In [None]:
# Original note: drop the rows where train.Parameter8 > 8.
# NOTE(review): the filter applied afterwards uses 5000, so the 8 is
# presumably on a natural-log scale (ln 5000 is about 8.5) - confirm.
plt.scatter(x=train['Quality_label'], y=train['Parameter8'])

In [None]:
# Keep only the rows whose Parameter8 is at most 5000 (index is not reset).
train = train.loc[train['Parameter8'] <= 5000]

In [None]:
# Original note: drop the rows where train.Parameter9 > 12.
# NOTE(review): the filter applied afterwards uses 1e8 (ln 1e8 is about 18.4),
# which does not match the 12 in this note on either scale - confirm which
# threshold is intended.
plt.scatter(x=train['Quality_label'], y=train['Parameter9'])

In [None]:
# Keep only the rows whose Parameter9 is at most 1e8 (index is not reset).
train = train.loc[train['Parameter9'] <= 1e8]

In [None]:
# Parameter10 against the quality label, to eyeball extreme values.
plt.scatter(x=train['Quality_label'], y=train['Parameter10'])

In [None]:
# Rows and columns left in `train` after the outlier filters above.
train.shape

In [None]:
# Pairwise correlations between the parameters that are also available in the
# test file; the colour scale is clipped to [-0.8, 0.8] and cells are annotated.
fig, ax = plt.subplots(figsize=(8, 6))
corr_matrix = train[test_features].corr()
sns.heatmap(corr_matrix, vmin=-0.8, vmax=0.8, linewidths=0.2, annot=True, ax=ax)

### 构建数据组合

In [10]:
# Columns carried into the combined frame: every Parameter except Parameter5.
selected_features = ['Parameter{}'.format(number)
                     for number in (1, 2, 3, 4, 6, 7, 8, 9, 10)]
# Columns that the encoding cells below treat as categorical.
# NOTE: this rebinds `features`, replacing the 20-column list defined earlier.
features = ['Parameter{}'.format(number) for number in range(6, 11)]
# Stack train rows on top of test rows under a fresh 0..N-1 index.
stacked_parts = [train[selected_features], test[selected_features]]
full = pd.concat(stacked_parts, ignore_index=True)

#### Label Encoding

In [11]:
# For every column listed in `features`, store an integer code per distinct
# value in '<column>_le'. The encoder is fitted on `full`, i.e. on the train
# and test rows together.
for feature in features:
    le_feature = '{}_le'.format(feature)
    encoder = LabelEncoder()
    full[le_feature] = encoder.fit_transform(full[feature])

In [12]:
# Preview the combined frame with the new '_le' code columns.
full.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le
0,0.00166,0.591013,147.608373,38.186345,0.000612,2286.523413,0.035407,0.593081,1.010385,24,12,1,5,24
1,1.601749,0.015052,0.035864,51.130326,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24
2,0.098039,69.233685,0.08092,0.112265,0.001972,2286.523413,0.035407,0.593081,1.010385,30,12,1,5,24
3,18.18186,0.047325,0.018061,1.098102,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24
4,0.012085,0.008749,0.005509,524.327396,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24


In [13]:
# Split the combined frame back into its train and test parts. Each part is
# copied so the columns added later are written to independent frames rather
# than to slices of `full` (the usual trigger of SettingWithCopyWarning).
n_train = train.shape[0]
trainX = full.iloc[:n_train].copy()
# Attach the labels by position. `full` was built with ignore_index=True, so an
# index-aligned assignment would give rows the wrong label (or NaN) whenever
# rows were filtered out of `train` without resetting its index.
trainX['label'] = train['label'].values
testX = full.iloc[n_train:].copy()

print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

shape of trainX: (6000, 15)
shape of testX: (6000, 14)


#### Target Encoding

In [18]:
def parameter_ctr(train, test, col, label):
    """Add a per-category class-rate ("CTR") column to both frames.

    For every value of the encoded column ``col + '_le'`` the function computes
    the share of ``train`` rows carrying that value whose ``'label'`` equals
    ``label``, and writes it to a new column ``col + '_ctr_' + str(label)``.

    Parameters
    ----------
    train : DataFrame with the columns ``col + '_le'`` and ``'label'``.
    test : DataFrame with the column ``col + '_le'``.
    col : base name of the categorical feature (without the ``'_le'`` suffix).
    label : class value whose rate is computed.

    Returns
    -------
    (train, test) : the same two frames, each with the new column added in place.

    Raises
    ------
    KeyError if a required column is missing (the original bare ``except``
    silently turned such mistakes into a rate of 0.0).

    NOTE: the rates are estimated on all of ``train``, including the row being
    encoded, so the train-side values contain target information.
    """
    key = col + '_le'
    new_col = col + '_ctr_' + str(label)

    # Mean of a boolean indicator per category == matching rows / rows in category.
    rate_by_category = (train['label'] == label).groupby(train[key]).mean()

    train[new_col] = train[key].map(rate_by_category)
    # Categories that occur only in test have no training rows; they get 0.0.
    test[new_col] = test[key].map(rate_by_category).fillna(0.0)
    return train, test

# Add one class-rate column per (categorical parameter, class code) pair.
features = ['Parameter{}'.format(number) for number in range(6, 11)]
labels = list(range(1, 5))
for feature in features:
    for label in labels:
        # Progress trace: which pair is being encoded.
        print(feature, label)
        trainX, testX = parameter_ctr(trainX, testX, feature, label)

Parameter6 1
Parameter6 2
Parameter6 3
Parameter6 4
Parameter7 1
Parameter7 2
Parameter7 3
Parameter7 4
Parameter8 1
Parameter8 2
Parameter8 3
Parameter8 4
Parameter9 1
Parameter9 2
Parameter9 3
Parameter9 4
Parameter10 1
Parameter10 2
Parameter10 3
Parameter10 4


In [21]:
# Re-check both shapes after the class-rate columns were added.
print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

shape of trainX: (6000, 35)
shape of testX: (6000, 34)


In [22]:
# Preview the training features together with the new '_ctr_' columns.
trainX.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le,label,Parameter6_ctr_1,Parameter6_ctr_2,Parameter6_ctr_3,Parameter6_ctr_4,Parameter7_ctr_1,Parameter7_ctr_2,Parameter7_ctr_3,Parameter7_ctr_4,Parameter8_ctr_1,Parameter8_ctr_2,Parameter8_ctr_3,Parameter8_ctr_4,Parameter9_ctr_1,Parameter9_ctr_2,Parameter9_ctr_3,Parameter9_ctr_4,Parameter10_ctr_1,Parameter10_ctr_2,Parameter10_ctr_3,Parameter10_ctr_4
0,0.00166,0.591013,147.608373,38.186345,0.000612,2286.523413,0.035407,0.593081,1.010385,24,12,1,5,24,3,0.0,0.222222,0.555556,0.222222,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
1,1.601749,0.015052,0.035864,51.130326,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24,4,0.141509,0.386792,0.216981,0.254717,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
2,0.098039,69.233685,0.08092,0.112265,0.001972,2286.523413,0.035407,0.593081,1.010385,30,12,1,5,24,4,0.163636,0.345455,0.236364,0.254545,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
3,18.18186,0.047325,0.018061,1.098102,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24,4,0.141509,0.386792,0.216981,0.254717,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895
4,0.012085,0.008749,0.005509,524.327396,0.002397,2286.523413,0.035407,0.593081,1.010385,31,12,1,5,24,4,0.141509,0.386792,0.216981,0.254717,0.12,0.328571,0.342857,0.208571,0.115016,0.222843,0.510783,0.151358,0.13731,0.226262,0.504059,0.132369,0.175439,0.251462,0.415205,0.157895


In [24]:
# Columns removed before modelling: the raw Parameter1-4 values and the
# integer '_le' codes of Parameter6-10.
drop_cols = (['Parameter{}'.format(number) for number in range(1, 5)]
             + ['Parameter{}_le'.format(number) for number in range(6, 11)])
trainX = trainX.drop(columns=drop_cols)
testX = testX.drop(columns=drop_cols)

#### Frequency Encoding

In [None]:
# Step 1: integer-code Parameter6 (stored temporarily in 'Parameter6_fre').
full['Parameter6_fre'] = LabelEncoder().fit_transform(full.Parameter6)

# Step 2: share of rows per code, computed once. The original helper rebuilt
# this table on every call, i.e. once for every row of `full`.
parameter6_share = full.Parameter6_fre.value_counts() / full.shape[0]

def parameter6_fre(x):
    """Return the fraction of rows in `full` whose Parameter6 code equals `x`."""
    return parameter6_share[x]

# Step 3: replace every code by its relative frequency.
full['Parameter6_fre'] = full.Parameter6_fre.apply(parameter6_fre)

In [None]:
# Step 1: integer-code Parameter7 (stored temporarily in 'Parameter7_fre').
full['Parameter7_fre'] = LabelEncoder().fit_transform(full.Parameter7)

# Step 2: share of rows per code, computed once. The original helper rebuilt
# this table on every call, i.e. once for every row of `full`.
parameter7_share = full.Parameter7_fre.value_counts() / full.shape[0]

def parameter7_fre(x):
    """Return the fraction of rows in `full` whose Parameter7 code equals `x`."""
    return parameter7_share[x]

# Step 3: replace every code by its relative frequency.
full['Parameter7_fre'] = full.Parameter7_fre.apply(parameter7_fre)

In [None]:
# Step 1: integer-code Parameter8 (stored temporarily in 'Parameter8_fre').
full['Parameter8_fre'] = LabelEncoder().fit_transform(full.Parameter8)

# Step 2: share of rows per code, computed once. The original helper rebuilt
# this table on every call, i.e. once for every row of `full`.
parameter8_share = full.Parameter8_fre.value_counts() / full.shape[0]

def parameter8_fre(x):
    """Return the fraction of rows in `full` whose Parameter8 code equals `x`."""
    return parameter8_share[x]

# Step 3: replace every code by its relative frequency.
full['Parameter8_fre'] = full.Parameter8_fre.apply(parameter8_fre)

In [None]:
# Step 1: integer-code Parameter9 (stored temporarily in 'Parameter9_fre').
full['Parameter9_fre'] = LabelEncoder().fit_transform(full.Parameter9)

# Step 2: share of rows per code, computed once. The original helper rebuilt
# this table on every call, i.e. once for every row of `full`.
parameter9_share = full.Parameter9_fre.value_counts() / full.shape[0]

def parameter9_fre(x):
    """Return the fraction of rows in `full` whose Parameter9 code equals `x`."""
    return parameter9_share[x]

# Step 3: replace every code by its relative frequency.
full['Parameter9_fre'] = full.Parameter9_fre.apply(parameter9_fre)

In [None]:
# Step 1: integer-code Parameter10 (stored temporarily in 'Parameter10_fre').
full['Parameter10_fre'] = LabelEncoder().fit_transform(full.Parameter10)

# Step 2: share of rows per code, computed once. The original helper rebuilt
# this table on every call, i.e. once for every row of `full`.
parameter10_share = full.Parameter10_fre.value_counts() / full.shape[0]

def parameter10_fre(x):
    """Return the fraction of rows in `full` whose Parameter10 code equals `x`."""
    return parameter10_share[x]

# Step 3: replace every code by its relative frequency.
full['Parameter10_fre'] = full.Parameter10_fre.apply(parameter10_fre)

In [None]:
# Work on an independent copy so later changes to full2 do not touch `full`.
full2 = full.copy()

In [None]:
# Preview the copy, including the '_fre' frequency columns.
full2.head()

#### 取log

In [23]:
# Replace Parameter6-10 by their natural logarithm in both feature frames.
# Running this cell twice applies the logarithm twice.
features = ['Parameter{}'.format(number) for number in range(6, 11)]
for frame in (trainX, testX):
    for feature in features:
        frame[feature] = np.log(frame[feature])

In [None]:
# Preview full2. Note: the log transform in the cell above is applied to
# trainX / testX, not to full2.
full2.head()

#### 数据分割

In [25]:
# Move the target out of the feature frame: pop() returns the 'label' column
# and removes it from trainX in the same step.
trainY = trainX.pop('label')

print('shape of trainX:', trainX.shape)
print('shape of trainY:', trainY.shape)
print('shape of testX:', testX.shape)

shape of trainX: (6000, 25)
shape of trainY: (6000,)
shape of testX: (6000, 25)


In [None]:
# Alternative feature set: take the train / test rows from full2 instead.
# NOTE: this rebinds trainX, trainY and testX from the cell above.
n_train = train.shape[0]
trainX = full2.iloc[:n_train]
trainY = train['label']
testX = full2.iloc[n_train:]

print('shape of trainX:', trainX.shape)
print('shape of trainY:', trainY.shape)
print('shape of testX:', testX.shape)

#### 划分训练集和验证集

In [26]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
# Despite their names, X_test / y_test are this hold-out split, not the
# competition test set (that one is `testX`).
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.2, random_state=42)
X_train.shape

(4800, 25)

In [27]:
# Fit a CatBoost multiclass model on the 80% split; progress is logged every
# 100 iterations.
model = CatBoostClassifier(iterations=2000, depth=8, learning_rate=0.01, verbose=100, loss_function='MultiClass', random_state=42)
model.fit(X_train, y_train)

# Predict the validation hold-out (X_test comes from train_test_split).
predY = model.predict(X_test)

0:	learn: 1.3811603	total: 189ms	remaining: 6m 17s
100:	learn: 1.1526625	total: 6.75s	remaining: 2m 6s
200:	learn: 1.0923219	total: 13.2s	remaining: 1m 57s
300:	learn: 1.0676606	total: 19.3s	remaining: 1m 48s
400:	learn: 1.0530586	total: 25.4s	remaining: 1m 41s
500:	learn: 1.0424796	total: 31.6s	remaining: 1m 34s
600:	learn: 1.0336861	total: 37.6s	remaining: 1m 27s
700:	learn: 1.0264888	total: 43.7s	remaining: 1m 20s
800:	learn: 1.0201364	total: 49.9s	remaining: 1m 14s
900:	learn: 1.0147765	total: 55.8s	remaining: 1m 8s
1000:	learn: 1.0098751	total: 1m 1s	remaining: 1m 1s
1100:	learn: 1.0029885	total: 1m 7s	remaining: 55s
1200:	learn: 0.9972456	total: 1m 13s	remaining: 49s
1300:	learn: 0.9917597	total: 1m 20s	remaining: 43.1s
1400:	learn: 0.9868271	total: 1m 26s	remaining: 37.2s
1500:	learn: 0.9819779	total: 1m 33s	remaining: 31.1s
1600:	learn: 0.9771667	total: 1m 40s	remaining: 25s
1700:	learn: 0.9728254	total: 1m 46s	remaining: 18.8s
1800:	learn: 0.9692523	total: 1m 53s	remaining: 12

In [36]:
# NOTE: CatBoost accepts only one of iterations / n_estimators /
# num_boost_round / num_trees. An earlier draft of this cell passed several of
# them together, which matches the CatBoostError recorded in the saved output.
# Only `iterations` is set here.
model = CatBoostClassifier(iterations=2000, depth=8, learning_rate=0.01, verbose=100, loss_function='MultiClass', random_state=42)

# Grid over tree depth and L2 leaf regularisation (5 x 6 = 30 candidates),
# scored by accuracy with 5-fold cross-validation on all labelled rows.
# NOTE(review): the saved output mentions 90 candidates, so it presumably
# predates this grid - re-run to refresh it.
param = {'depth':[6, 8, 10, 12, 14], 'l2_leaf_reg':[0.01, 0.1, 1, 5, 10, 20]}
cv = GridSearchCV(model, param_grid=param, verbose=1, cv=5, scoring='accuracy')
cv.fit(trainX, trainY)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CatBoostError: only one of the parameters iterations, n_estimators, num_boost_round, num_trees should be initialized.

In [33]:
GridSearchCV?

In [None]:
# Baseline LightGBM classifier with default settings on the same hold-out split.
# NOTE: this rebinds `model` and `predY` from the CatBoost cell.
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
predY = model.predict(X_test)

In [29]:
# Hold-out accuracy. Both sides are flattened first so the comparison stays
# element-wise whether predY is 1-D or a column vector of shape (n, 1);
# comparing an (n, 1) array with an (n,) array would broadcast to n-by-n and
# produce meaningless numbers.
np.mean(np.asarray(y_test).ravel() == np.asarray(predY).ravel())

array([0.53416667])

### 模型构建

#### LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on all labelled rows.
lr = LogisticRegression()
lr.fit(trainX, trainY)
# Predicted labels and per-class probabilities for the competition test rows.
predY = lr.predict(testX)
proba = lr.predict_proba(testX)

#### AdaBoostClassifier

In [None]:
# AdaBoost with default settings, fitted on all labelled rows.
# NOTE: rebinds `predY` with this model's test predictions.
ada = AdaBoostClassifier()
ada.fit(trainX, trainY)
predY = ada.predict(testX)

#### xgboost

In [None]:
# XGBoost with default settings, fitted on all labelled rows.
# NOTE(review): recent xgboost releases expect class labels starting at 0,
# while `label` runs 1..4 - confirm against the installed version.
xgbc = xgb.XGBClassifier()
xgbc.fit(trainX, trainY)
predY = xgbc.predict(testX)

#### lightgbm

In [None]:
# LightGBM with default settings, fitted on all labelled rows.
# NOTE: rebinds `predY` with this model's test predictions.
lgbc = lgb.LGBMClassifier()
lgbc.fit(trainX, trainY)
predY = lgbc.predict(testX)

#### catboost

In [None]:
# Final CatBoost model: same hyper-parameters as the hold-out run, but fitted
# on all labelled rows and applied to the competition test rows.
model = CatBoostClassifier(iterations=2000, depth=8, learning_rate=0.01, verbose=100, loss_function='MultiClass', random_state=42)
model.fit(trainX, trainY)
predY = model.predict(testX)

_______

In [None]:
# Class probabilities for the test rows from whichever estimator is currently
# bound to `model`.
proba = model.predict_proba(testX)
proba

In [None]:
# Attach the predicted label and the four class probabilities to `test`.
# Assumes predict_proba returns its columns in ascending label order
# (1=Excellent, 2=Good, 3=Pass, 4=Fail) - TODO confirm via model.classes_.
prob_columns = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
test['pred'] = predY
for column in prob_columns:
    test[column] = 0.0
test.loc[:, prob_columns] = model.predict_proba(testX)
test.head(10)

In [None]:
# Average the per-row class probabilities within each Group to obtain the
# submitted ratios. The columns are selected with a list: selecting several
# GroupBy columns with a bare tuple of names is deprecated since pandas 1.0
# and is an error in pandas 2.x.
prob_columns = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
result = test.groupby(['Group'], as_index=False)[prob_columns].mean()
# Rename to the header used by the submission example.
result.columns = ['Group','Excellent ratio','Good ratio','Pass ratio','Fail ratio']
result.to_csv('submission14.csv', index=False)

In [None]:
# Show the first and last five submission rows. pd.concat is used because
# DataFrame.append is deprecated and no longer exists in pandas 2.x.
pd.concat([result.head(), result.tail()])

_________