In [1]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import pandas_profiling as ppf
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import os
import gc

pd.set_option('display.max_columns', 50)
warnings.filterwarnings('ignore')

In [2]:
# Build the file paths with os.path.join so the notebook also runs outside
# Windows (the original hard-coded '.\\data\\...' backslash paths).
DATA_DIR = 'data'
train = pd.read_csv(os.path.join(DATA_DIR, 'first_round_training_data.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'first_round_testing_data.csv'))
submit_example = pd.read_csv(os.path.join(DATA_DIR, 'submit_example.csv'))

print('shape of train:', train.shape)
print('shape of test:', test.shape)
print('shape of submit:', submit_example.shape)

shape of train: (6000, 21)
shape of test: (6000, 11)
shape of submit: (120, 5)


In [3]:
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail


In [None]:
# Show the first and last five rows of the test set together.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same frame.
pd.concat([test.head(), test.tail()])

In [None]:
submit_example.head()

In [4]:
train.Quality_label.unique()

array(['Pass', 'Fail', 'Good', 'Excellent'], dtype=object)

In [6]:
# The ten parameter columns in natural order: Parameter1 .. Parameter10.
features = ['Parameter' + str(idx) for idx in range(1, 11)]
# The same ten columns in lexicographic order ('Parameter10' sorts right after 'Parameter1').
test_features = sorted(features)

### 特征类别观察

In [7]:
# Print, for every column listed in `features`, how many distinct values it takes in train.
for col_name in features:
    distinct_values = train[col_name].value_counts()
    print(col_name, distinct_values.shape[0])

Parameter1 6000
Parameter2 6000
Parameter3 6000
Parameter4 6000
Parameter5 132
Parameter6 80
Parameter7 14
Parameter8 23
Parameter9 16
Parameter10 41


In [8]:
def label_map(x):
    """Translate a quality label string into its integer code.

    'Excellent' -> 1, 'Good' -> 2, 'Pass' -> 3, 'Fail' -> 4.
    Any other value raises KeyError.
    """
    names_in_code_order = ('Excellent', 'Good', 'Pass', 'Fail')
    code_by_name = {name: code for code, name in enumerate(names_in_code_order, start=1)}
    return code_by_name[x]

train['label'] = train.Quality_label.apply(label_map)
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label,label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass,3
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail,4
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail,4
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail,4
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail,4


### 异常点观察

In [None]:
plt.scatter(train.Quality_label, train.Parameter1)
# Drop the rows with train.Parameter1 > 12
# NOTE(review): the filter cell that follows keeps Parameter1 <= 200000, not <= 12 — confirm the intended cutoff

In [None]:
train = train[train.Parameter1<=200000]

In [None]:
plt.scatter(train.Quality_label, train.Parameter2)
# Drop the rows with train.Parameter2 > 14
# NOTE(review): the filter cell that follows keeps Parameter2 <= 1000000, not <= 14 — confirm the intended cutoff

In [None]:
train = train[train.Parameter2<=1000000]

In [None]:
plt.scatter(train.Quality_label, train.Parameter3)
# Drop the rows with train.Parameter3 > 12
# NOTE(review): the filter cell that follows keeps Parameter3 <= 200000, not <= 12 — confirm the intended cutoff

In [None]:
train = train[train.Parameter3<=200000]

In [None]:
plt.scatter(train.Quality_label, train.Parameter4)

In [None]:
train = train[train.Parameter4<=80000]

In [None]:
plt.scatter(train.Quality_label, train.Parameter5)
# Drop the rows with train.Parameter5 > 4
# NOTE(review): the filter cell that follows keeps Parameter5 <= 60, not <= 4 — confirm the intended cutoff

In [None]:
train = train[train.Parameter5<=60]

In [None]:
plt.scatter(train.Quality_label, train.Parameter6)

In [None]:
plt.scatter(train.Quality_label, train.Parameter7)

In [None]:
train = train[train.Parameter7<=30000]

In [None]:
plt.scatter(train.Quality_label, train.Parameter8)
# Drop the rows with train.Parameter8 > 8
# NOTE(review): the filter cell that follows keeps Parameter8 <= 5000, not <= 8 — confirm the intended cutoff

In [None]:
train = train[train.Parameter8<=5000]

In [None]:
plt.scatter(train.Quality_label, train.Parameter9)
# Drop the rows with train.Parameter9 > 12
# NOTE(review): the filter cell that follows keeps Parameter9 <= 1e8, not <= 12 — confirm the intended cutoff

In [None]:
train = train[train.Parameter9<=1e8]

In [None]:
plt.scatter(train.Quality_label, train.Parameter10)

In [None]:
train.shape

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(train[test_features].corr(), vmax=0.8, vmin=-0.8, linewidths=0.2, annot=True)

### 构建数据组合

In [34]:
selected_features = ['Parameter1', 'Parameter2', 'Parameter3',
                     'Parameter4', 'Parameter6', 'Parameter7', 
                     'Parameter8', 'Parameter9', 'Parameter10', 'Parameter5']
features = ['Parameter5', 'Parameter6', 'Parameter7', 'Parameter8', 
            'Parameter9', 'Parameter10']
full = pd.concat([train[selected_features], test[selected_features]], ignore_index=True)

In [12]:
print('shape of full:', full.shape)
print('shape of train:', train.shape)
print('shape of test:', test.shape)

shape of full: (12000, 10)
shape of train: (6000, 22)
shape of test: (6000, 11)


#### Label Encoding

In [None]:
for feature in features:
    le_feature = feature + '_le'
    full[le_feature] = LabelEncoder().fit_transform(full[feature])

In [None]:
full.head()

In [None]:
# Split the combined frame back into its train / test parts by position.
# .copy() makes both parts independent frames, so the column assignments below
# and in later cells do not write into a slice of `full` (SettingWithCopyWarning).
n_train = train.shape[0]
trainX = full.iloc[:n_train].copy()
# `full` was built with ignore_index=True, so trainX is indexed 0..n_train-1,
# while `train` keeps its own index (with gaps once the outlier filters have
# dropped rows). Assign the labels by position; index alignment would misplace
# them or introduce NaN.
trainX['label'] = train.label.values
testX = full.iloc[n_train:].copy()

print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

#### Target Encoding

In [None]:
def parameter_ctr(train, test, col, label):
    """Add a per-category label-rate ("ctr") feature to train and test.

    For every distinct value of the label-encoded column ``col + '_le'`` the
    share of train rows whose ``'label'`` equals ``label`` is computed and
    written to a new column named ``col + '_ctr_' + str(label)``.

    Parameters
    ----------
    train : DataFrame with the columns ``col + '_le'`` and ``'label'``.
    test : DataFrame with the column ``col + '_le'``.
    col : name of the categorical feature (without the ``'_le'`` suffix).
    label : label value whose rate is computed.

    Returns
    -------
    (train, test) : the same two frames, each with the new column added in place.

    Notes
    -----
    Categories that occur only in ``test`` receive the mean of the rates of
    all categories seen in ``train``. The original computed this fallback
    inside a bare ``except`` from whichever categories had been visited so
    far, which made it depend on set iteration order and could divide by zero.
    """
    le_col = col + '_le'
    new_col = col + '_ctr_' + str(label)

    # Share of rows carrying `label` inside each encoded category of train,
    # computed in one grouped pass instead of one frame scan per category.
    rate_by_value = (train['label'] == label).groupby(train[le_col]).mean()

    # Deterministic fallback for categories unseen in train (0.0 if train is empty).
    fallback_rate = rate_by_value.mean() if len(rate_by_value) else 0.0

    train[new_col] = train[le_col].map(rate_by_value)
    test[new_col] = test[le_col].map(rate_by_value).fillna(fallback_rate)
    return train, test

features = ['Parameter6', 'Parameter7', 'Parameter8', 
            'Parameter9', 'Parameter10']
labels = [1, 2, 3, 4]
for feature in features:
    for label in labels:
        print(feature, label)
        trainX, testX = parameter_ctr(trainX, testX, feature, label)

In [None]:
print('shape of trainX:', trainX.shape)
print('shape of testX:', testX.shape)

In [None]:
trainX.head()

In [None]:
drop_cols = ['Parameter1', 'Parameter2', 'Parameter3', 'Parameter4',
             'Parameter6_le', 'Parameter7_le', 'Parameter8_le', 
             'Parameter9_le', 'Parameter10_le']
trainX.drop(drop_cols, axis=1, inplace=True)
testX.drop(drop_cols, axis=1, inplace=True)

#### Frequency Encoding

In [None]:
# Frequency encoding of Parameter6: each row gets the share of rows in `full`
# that carry the same Parameter6 value.
full['Parameter6_fre'] = LabelEncoder().fit_transform(full.Parameter6)

# Build the code -> relative-frequency table once; the original rebuilt it
# with value_counts() for every single row inside .apply().
parameter6_freq = full.Parameter6_fre.value_counts() / full.shape[0]

def parameter6_fre(x):
    """Return the relative frequency of the label-encoded Parameter6 value `x`."""
    return parameter6_freq[x]

full['Parameter6_fre'] = full.Parameter6_fre.map(parameter6_freq)

In [None]:
# Frequency encoding of Parameter7: each row gets the share of rows in `full`
# that carry the same Parameter7 value.
full['Parameter7_fre'] = LabelEncoder().fit_transform(full.Parameter7)

# Build the code -> relative-frequency table once; the original rebuilt it
# with value_counts() for every single row inside .apply().
parameter7_freq = full.Parameter7_fre.value_counts() / full.shape[0]

def parameter7_fre(x):
    """Return the relative frequency of the label-encoded Parameter7 value `x`."""
    return parameter7_freq[x]

full['Parameter7_fre'] = full.Parameter7_fre.map(parameter7_freq)

In [None]:
# Frequency encoding of Parameter8: each row gets the share of rows in `full`
# that carry the same Parameter8 value.
full['Parameter8_fre'] = LabelEncoder().fit_transform(full.Parameter8)

# Build the code -> relative-frequency table once; the original rebuilt it
# with value_counts() for every single row inside .apply().
parameter8_freq = full.Parameter8_fre.value_counts() / full.shape[0]

def parameter8_fre(x):
    """Return the relative frequency of the label-encoded Parameter8 value `x`."""
    return parameter8_freq[x]

full['Parameter8_fre'] = full.Parameter8_fre.map(parameter8_freq)

In [None]:
# Frequency encoding of Parameter9: each row gets the share of rows in `full`
# that carry the same Parameter9 value.
full['Parameter9_fre'] = LabelEncoder().fit_transform(full.Parameter9)

# Build the code -> relative-frequency table once; the original rebuilt it
# with value_counts() for every single row inside .apply().
parameter9_freq = full.Parameter9_fre.value_counts() / full.shape[0]

def parameter9_fre(x):
    """Return the relative frequency of the label-encoded Parameter9 value `x`."""
    return parameter9_freq[x]

full['Parameter9_fre'] = full.Parameter9_fre.map(parameter9_freq)

In [None]:
# Frequency encoding of Parameter10: each row gets the share of rows in `full`
# that carry the same Parameter10 value.
full['Parameter10_fre'] = LabelEncoder().fit_transform(full.Parameter10)

# Build the code -> relative-frequency table once; the original rebuilt it
# with value_counts() for every single row inside .apply().
parameter10_freq = full.Parameter10_fre.value_counts() / full.shape[0]

def parameter10_fre(x):
    """Return the relative frequency of the label-encoded Parameter10 value `x`."""
    return parameter10_freq[x]

full['Parameter10_fre'] = full.Parameter10_fre.map(parameter10_freq)

In [None]:
full2 = full.copy()

In [None]:
full2.head()

#### WOE

In [16]:
from keras.models import Sequential
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding
import pickle
import csv

In [15]:
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label,label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass,3
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail,4
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail,4
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail,4
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail,4


In [None]:
def woe_preprocess(train, test, col, label):
    """Unfinished helper: counts the values of train.label and returns None.

    NOTE(review): `test`, `col` and `label` are not used and the count is
    discarded — presumably a weight-of-evidence encoding was planned here
    (the section heading is "WOE"); complete or remove.
    """
    cnt = train.label.value_counts()

#### cv

In [17]:
from sklearn.model_selection import KFold

In [20]:
kfold.split?

In [54]:
# Out-of-fold class-probability columns on `train`, filled fold by fold below.
prob_cols = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
for prob_col in prob_cols:
    train[prob_col] = 0.0
prob_col_positions = train.columns.get_indexer(prob_cols)

kfold = KFold(n_splits=5, shuffle=False)
split = kfold.split(train, train.label)
result_stage1 = []  # one array of test-set class probabilities per fold
for train_index, val_index in split:
    X_train = train.iloc[train_index]
    X_val = train.iloc[val_index]
    model = CatBoostClassifier(iterations=5000, depth=10, learning_rate=0.02,
                               verbose=500, loss_function='MultiClass',
                               random_state=924, l2_leaf_reg=0.01)
    model.fit(X_train[features], X_train.label)
    # Store the held-out predictions in `train` by position. The original left
    # the prob_* columns at 0.0: its (commented-out) assignment targeted X_val,
    # which is a copy of the rows, and X_val was otherwise unused.
    # NOTE(review): assumes every fold's training part contains all four labels,
    # so predict_proba returns four columns in label order 1..4 — confirm.
    train.iloc[val_index, prob_col_positions] = model.predict_proba(X_val[features])
    pred = model.predict_proba(test[features])
    result_stage1.append(pred)

0:	learn: 1.3754394	total: 256ms	remaining: 21m 19s
500:	learn: 0.9872785	total: 1m 15s	remaining: 11m 18s
1000:	learn: 0.9255206	total: 2m 48s	remaining: 11m 14s
1500:	learn: 0.8930087	total: 4m 29s	remaining: 10m 27s
2000:	learn: 0.8697867	total: 6m 13s	remaining: 9m 19s
2500:	learn: 0.8525119	total: 7m 59s	remaining: 7m 58s
3000:	learn: 0.8347634	total: 9m 44s	remaining: 6m 29s
3500:	learn: 0.8208507	total: 11m 30s	remaining: 4m 55s
4000:	learn: 0.8070770	total: 13m 15s	remaining: 3m 18s
4500:	learn: 0.7971018	total: 15m	remaining: 1m 39s
4999:	learn: 0.7873111	total: 16m 45s	remaining: 0us
0:	learn: 1.3767111	total: 272ms	remaining: 22m 40s
500:	learn: 1.0182878	total: 1m 15s	remaining: 11m 17s
1000:	learn: 0.9557123	total: 2m 52s	remaining: 11m 27s
1500:	learn: 0.9211547	total: 4m 43s	remaining: 11m 1s
2000:	learn: 0.8977087	total: 6m 38s	remaining: 9m 57s
2500:	learn: 0.8775090	total: 8m 23s	remaining: 8m 23s
3000:	learn: 0.8631270	total: 10m 8s	remaining: 6m 45s
3500:	learn: 0.8

In [63]:
# Average the per-fold test probabilities element-wise.
# np.mean over the fold axis replaces the hand-written triple loop and no
# longer hard-codes the number of test rows (6000), classes (4) or folds (5).
result_submission = np.mean(result_stage1, axis=0)

In [76]:
result_df = pd.DataFrame(result_submission, columns=['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail'])
result_df['Group'] = test.Group

In [79]:
# Mean class probability per Group, renamed to the submission column names.
# The columns are selected with a list: tuple-style selection
# (groupby(...)['a', 'b']) was deprecated and raises in pandas >= 2.0.
prob_cols = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
result_df = result_df.groupby(['Group'], as_index=False)[prob_cols].mean()
result_df.columns = ['Group','Excellent ratio','Good ratio','Pass ratio','Fail ratio']
result_df.to_csv('submission17.csv', index=False)

#### 取log

In [None]:
features = ['Parameter6', 'Parameter7', 'Parameter8', 
            'Parameter9', 'Parameter10']
for feature in features:
    trainX[feature] = np.log(trainX[feature])
    testX[feature] = np.log(testX[feature])

In [None]:
full2.head()

#### 数据分割

In [None]:
trainY = trainX.label
trainX.drop(['label'], axis=1, inplace=True)

print('shape of trainX:', trainX.shape)
print('shape of trainY:', trainY.shape)
print('shape of testX:', testX.shape)

In [None]:
trainX.drop(['Parameter6', 'Parameter7', 'Parameter8', 'Parameter9', 'Parameter10'], axis=1, inplace=True)
trainX.head()

In [None]:
testX.drop(['Parameter6', 'Parameter7', 'Parameter8', 'Parameter9', 'Parameter10'], axis=1, inplace=True)
testX.head()

In [None]:
trainX = full2[:train.shape[0]]
trainY = train.label
testX = full2[train.shape[0]:]

print('shape of trainX:', trainX.shape)
print('shape of trainY:', trainY.shape)
print('shape of testX:', testX.shape)

#### 划分训练集和验证集

In [None]:
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.2, random_state=42)
X_train.shape

In [None]:
model = CatBoostClassifier(iterations=2000, depth=8, learning_rate=0.01, verbose=100, loss_function='MultiClass', random_state=42)
model.fit(X_train, y_train)

predY = model.predict(X_test)

In [None]:
model = CatBoostClassifier(iterations=2000, depth=8, learning_rate=0.01, verbose=1000, loss_function='MultiClass', random_state=42)

param = {'depth':[6, 8, 10], 'l2_leaf_reg':[0.01, 0.1]}
cv = GridSearchCV(model, param_grid=param, verbose=1, cv=5, scoring='accuracy')
cv.fit(trainX, trainY)

In [None]:
cv.best_params_

In [None]:
cv.best_score_

In [None]:
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
predY = model.predict(X_test)

In [None]:
# Validation accuracy: share of rows where the prediction equals the true label.
# Both sides are flattened to 1-D first. The original compared an (n, 1) column
# with predY; with the 1-D predictions of the LightGBM cell just above, that
# broadcasts to an (n, n) matrix and the result is not an accuracy.
np.mean(np.asarray(y_test) == np.asarray(predY).ravel())

### 模型构建

#### LogisticRegression

In [None]:
lr = LogisticRegression()
lr.fit(trainX, trainY)
predY = lr.predict(testX)
proba = lr.predict_proba(testX)

#### AdaBoostClassifier

In [None]:
ada = AdaBoostClassifier()
ada.fit(trainX, trainY)
predY = ada.predict(testX)

#### xgboost

In [None]:
xgbc = xgb.XGBClassifier()
xgbc.fit(trainX, trainY)
predY = xgbc.predict(testX)

#### lightgbm

In [None]:
lgbc = lgb.LGBMClassifier()
lgbc.fit(trainX, trainY)
predY = lgbc.predict(testX)

#### catboost

In [None]:
model = CatBoostClassifier(iterations=4000, depth=10, learning_rate=0.01, 
                           verbose=500, loss_function='MultiClass', 
                           random_state=42, l2_leaf_reg=0.01)
model.fit(trainX, trainY)
predY = model.predict(testX)

_______

In [None]:
proba = model.predict_proba(testX)
proba

In [None]:
test['pred'] = predY
test['prob_Excellent'] = 0.0
test['prob_Good'] = 0.0
test['prob_Pass'] = 0.0
test['prob_Fail'] = 0.0
test.loc[:, ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']] = model.predict_proba(testX)
test.head(10)

In [None]:
# Mean class probability per Group, renamed to the submission column names.
# The columns are selected with a list: tuple-style selection
# (groupby(...)['a', 'b']) was deprecated and raises in pandas >= 2.0.
prob_cols = ['prob_Excellent', 'prob_Good', 'prob_Pass', 'prob_Fail']
result = test.groupby(['Group'], as_index=False)[prob_cols].mean()
result.columns = ['Group','Excellent ratio','Good ratio','Pass ratio','Fail ratio']
result.to_csv('submission17.csv', index=False)

_________

In [None]:
20000/14584 * 11000