In [9]:
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pandas as pd
import numpy as np
import math
import gc

# Let wide frames render up to 100 columns before pandas truncates the display.
pd.options.display.max_columns = 100

In [17]:
# Load the competition files. Forward slashes are used because the previous
# '.\\data\\...' literals only resolve on Windows, whereas '/' is accepted as
# a path separator on Windows, Linux and macOS alike.
DATA_DIR = './data'
train = pd.read_csv(DATA_DIR + '/first_round_training_data.csv')
test = pd.read_csv(DATA_DIR + '/first_round_testing_data.csv')
submit_example = pd.read_csv(DATA_DIR + '/submit_example.csv')

# Report the dimensions of each loaded frame.
print('shape of train:', train.shape)
print('shape of test:', test.shape)
print('shape of submit:', submit_example.shape)

shape of train: (6000, 21)
shape of test: (6000, 11)
shape of submit: (120, 5)


In [18]:
def label_map(x):
    """Translate a quality grade name into its integer code.

    'Excellent' -> 1, 'Good' -> 2, 'Pass' -> 3, 'Fail' -> 4.

    Raises:
        KeyError: if ``x`` is not one of the four grade names.
    """
    grade_names = ('Excellent', 'Good', 'Pass', 'Fail')
    # Codes are 1-based and follow the order of the tuple above.
    code_by_grade = {name: code for code, name in enumerate(grade_names, start=1)}
    return code_by_grade[x]

# Derive the integer grade code from the textual Quality_label column.
train['label'] = train['Quality_label'].map(label_map)
# Preview the first rows, including the new `label` column.
train.head()

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label,label
0,0.00166,0.591013,147.608373,38.186345,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,6.856075,0.168761,1.098755,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass,3
1,1.601749,0.015052,0.035864,51.130326,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.000362,11.649033,0.066671,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail,4
2,0.098039,69.233685,0.08092,0.112265,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.022201,0.078213,110.079689,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail,4
3,18.18186,0.047325,0.018061,1.098102,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.459004,0.380281,0.011491,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail,4
4,0.012085,0.008749,0.005509,524.327396,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,11.576647,1.555672,38.613386,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail,4


In [59]:
# Raw columns that are label-encoded and WOE-transformed: Parameter5 .. Parameter10.
features = ['Parameter{}'.format(i) for i in range(5, 11)]
# Stack the train rows on top of the test rows and renumber the index from 0.
full = pd.concat([frame[features] for frame in (train, test)], ignore_index=True)

In [63]:
# Dimensions of the stacked train+test frame.
# NOTE(review): the recorded output (12000, 12) already counts the six `_le`
# columns created by the encoding cell, so this cell was last executed after
# that cell rather than directly after `full` was built.
full.shape

(12000, 12)

In [61]:
# Encode every raw feature as integer codes. Each encoder is fitted on the
# stacked train+test values, so both parts share a single code space.
for feature in features:
    lr = LabelEncoder()
    full[feature+'_le'] = lr.fit_transform(full[feature])
    # `full` is train stacked on top of test with a fresh 0..n-1 index, so its
    # first len(train) rows line up with `train` positionally. The codes are
    # copied back because woe_preprocess reads `train[feature+'_le']`; without
    # this the WOE cell raises KeyError on a freshly restarted kernel.
    train[feature+'_le'] = full[feature+'_le'].to_numpy()[:len(train)]

In [62]:
# Preview the first rows: raw feature values next to their integer `_le` codes.
full.head()

Unnamed: 0,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter5_le,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le
0,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,25,24,12,1,5,24
1,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24
2,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,33,30,12,1,5,24
3,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24
4,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24


In [68]:
def woe_preprocess(full, train, col, label):
    """Add a per-category log-odds ("WOE") column for one class to ``full``.

    For every distinct value ``cat`` of ``train[col]`` the score is

        log((n_other + 0.5) / (n_label + 0.5))

    where ``n_label`` counts the train rows of that category whose ``label``
    column equals ``label`` and ``n_other`` counts the remaining train rows of
    that category. The 0.5 terms keep the ratio finite when a count is zero.
    The counts are not normalised by the overall class totals.

    Args:
        full: DataFrame containing ``col``; a new column is added in place.
        train: DataFrame containing ``col`` and a ``label`` column.
        col: name of the categorical column, e.g. ``'Parameter5_le'``.
        label: class value treated as the target (1/2/3/4 in this notebook).

    Returns:
        ``full`` itself, extended with ``col + '_woe_' + str(label)``. Values
        of ``full[col]`` that never occur in ``train[col]`` receive NaN.
    """
    categories = train[col]
    # Two value_counts passes replace a pair of full-frame boolean scans per
    # category; the resulting counts are the same.
    rows_per_category = categories.value_counts()
    label_rows_per_category = categories[train['label'] == label].value_counts().to_dict()

    woe_map = {}
    for cat, n_total in rows_per_category.items():
        n_label = label_rows_per_category.get(cat, 0)
        n_other = n_total - n_label
        woe_map[cat] = math.log((n_other + 0.5) / (n_label + 0.5))

    # Categories absent from woe_map are mapped to NaN.
    full[col + '_woe_' + str(label)] = full[col].map(woe_map)
    return full

def woe_preprocess2(train, test, col, label):
    """Label-encode ``col`` and attach its per-category log-odds ("WOE") score.

    A LabelEncoder is fitted on ``train[col]`` and used to write the integer
    codes to ``col + '_le'`` on both ``train`` and ``test`` (in place). For
    every distinct value ``cat`` of ``train[col]`` the score is

        log((n_other + 0.5) / (n_label + 0.5))

    where ``n_label`` counts the train rows of that category whose ``label``
    column equals ``label`` and ``n_other`` counts the remaining train rows of
    that category. The score is stored on ``train`` as
    ``col + '_le_woe_' + str(label)`` and joined onto ``test`` through the
    encoded column.

    Args:
        train: DataFrame containing ``col`` and a ``label`` column; modified in place.
        test: DataFrame containing ``col``; gains the ``_le`` column in place.
        col: name of the raw feature column.
        label: class value treated as the target.

    Returns:
        ``(train, test)`` where ``test`` is a new frame produced by the merge,
        holding exactly one row per input test row.
    """
    le_col = col + '_le'
    woe_col = le_col + '_woe_' + str(label)

    le = LabelEncoder()
    le.fit(train[col])
    train[le_col] = le.transform(train[col])
    test[le_col] = le.transform(test[col])

    categories = train[col]
    rows_per_category = categories.value_counts()
    label_rows_per_category = categories[train['label'] == label].value_counts().to_dict()

    woe_map = {}
    for cat, n_total in rows_per_category.items():
        n_label = label_rows_per_category.get(cat, 0)
        n_other = n_total - n_label
        woe_map[cat] = math.log((n_other + 0.5) / (n_label + 0.5))
    train[woe_col] = train[col].map(woe_map)

    # Merge against one row per code. Joining on the full train frame would
    # repeat every test row once for each train row sharing its code.
    lookup = train[[le_col, woe_col]].drop_duplicates(subset=le_col)
    test = pd.merge(test, lookup, on=[le_col], how='left')
    return train, test

In [69]:
# Add one WOE column to `full` for every (encoded feature, quality class) pair.
for feature in tqdm(features):
    encoded_col = feature + '_le'
    for label in (1, 2, 3, 4):
        full = woe_preprocess(full, train, encoded_col, label)

100%|█████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.28it/s]


In [71]:
# Show the first five and last five rows of `full` in one table.
# DataFrame.append was removed in pandas 2.0, so the two slices are joined
# with pd.concat, which keeps the original row labels just as append did.
pd.concat([full.head(5), full.tail(5)])

Unnamed: 0,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Parameter5_le,Parameter6_le,Parameter7_le,Parameter8_le,Parameter9_le,Parameter10_le,Parameter5_le_woe_1,Parameter5_le_woe_2,Parameter5_le_woe_3,Parameter5_le_woe_4,Parameter6_le_woe_1,Parameter6_le_woe_2,Parameter6_le_woe_3,Parameter6_le_woe_4,Parameter7_le_woe_1,Parameter7_le_woe_2,Parameter7_le_woe_3,Parameter7_le_woe_4,Parameter8_le_woe_1,Parameter8_le_woe_2,Parameter8_le_woe_3,Parameter8_le_woe_4,Parameter9_le_woe_1,Parameter9_le_woe_2,Parameter9_le_woe_3,Parameter9_le_woe_4,Parameter10_le_woe_1,Parameter10_le_woe_2,Parameter10_le_woe_3,Parameter10_le_woe_4
0,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,25,24,12,1,5,24,1.650681,0.705886,0.705886,1.418383,0.796331,1.062894,1.209838,1.209838,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
1,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24,0.998529,0.635989,2.036882,0.635989,0.619039,1.098612,0.200671,2.944439,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
2,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,33,30,12,1,5,24,0.998529,0.635989,2.036882,0.635989,1.512588,0.887303,1.17412,0.756326,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
3,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24,0.998529,0.635989,2.036882,0.635989,0.619039,1.098612,0.200671,2.944439,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
4,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,33,31,12,1,5,24,0.998529,0.635989,2.036882,0.635989,0.619039,1.098612,0.200671,2.944439,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.877155,1.877155,-0.707069,2.571616
11995,0.000218,0.000414,2286.523413,0.035407,0.593081,51.944717,19,22,12,1,5,37,2.564949,1.299283,1.299283,-0.587787,2.944439,1.098612,-0.200671,1.098612,1.94591,1.94591,-0.510826,0.510826,2.03899,1.248534,-0.04312,1.722908,0.720581,0.648387,1.317337,2.002249,1.665008,0.52702,0.592504,1.911719
11996,3.095123,1.817391,0.600827,17.850021,6.783967,0.19568,110,67,5,15,7,19,1.340385,0.637797,0.591098,2.18546,1.396406,0.965359,0.493658,1.734601,1.557634,1.731954,-0.513229,2.918661,1.867745,2.693125,-1.275543,3.310543,1.405343,0.292987,1.405343,1.483838,1.118613,1.286665,0.341592,1.928961
11997,0.74163,1.495371,0.600827,17.850021,0.05185,0.073078,96,66,5,15,3,15,0.865418,0.888259,0.865418,1.984376,1.590569,1.380371,-0.145827,2.236515,1.557634,1.731954,-0.513229,2.918661,1.867745,2.693125,-1.275543,3.310543,1.836732,1.228989,-0.016232,1.879048,0.783249,1.046714,0.605674,2.482967
11998,3.454681,3.262468,0.600827,17.850021,6.783967,0.027291,111,70,5,15,7,12,1.44977,1.163635,0.252252,1.828841,1.810109,1.481057,-0.315081,2.192274,1.557634,1.731954,-0.513229,2.918661,1.867745,2.693125,-1.275543,3.310543,1.405343,0.292987,1.405343,1.483838,0.97538,0.369217,1.371479,2.014154
11999,1.031282,0.833011,0.038483,2.931083,2.005852,0.073078,99,63,3,10,6,15,1.39083,1.30236,0.13062,1.929449,0.87068,0.579609,1.078662,2.244042,1.932864,1.267411,-0.155926,2.03613,1.098612,1.098612,-0.336472,2.397895,2.586689,1.098612,1.734601,-0.200671,0.783249,1.046714,0.605674,2.482967


In [78]:
# Missing WOE values in the test part (codes that never occur in train map to
# NaN) are replaced by the mean of the corresponding column, computed over
# all rows of `full`.
for col in full.columns:
    mean = full[col].mean()
    # Assign the filled column back. Calling fillna(..., inplace=True) on the
    # `full[col]` selection is a chained in-place update that leaves `full`
    # unchanged when pandas Copy-on-Write is active.
    full[col] = full[col].fillna(mean)