In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install category_encoders

In [2]:
# !cp -r /content/drive/MyDrive/Current\ Workspace/Intro\ ML/dataset .

In [3]:
import joblib
import numpy as np
import pandas as pd
import os

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
from category_encoders import WOEEncoder
from sklearn.impute import KNNImputer
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.base import clone

from sklearn.linear_model import LogisticRegression

In [4]:
# Locations of the competition files, all kept under the local "dataset" folder.
PATH_TRAIN, PATH_TEST, PATH_SAMPLE = (
    os.path.join("dataset", file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# The sample submission is read as-is; train and test are indexed by their 'id' column.
submission = pd.read_csv(PATH_SAMPLE)
train = pd.read_csv(PATH_TRAIN, index_col='id')
test = pd.read_csv(PATH_TEST, index_col='id')

In [5]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [6]:
# Value frequencies of every column, computed over train and test stacked together.
df_total = pd.concat([train, test])
for col, column_values in df_total.items():
    print(column_values.value_counts())
    print()

C    5765
F    5422
E    5343
B    5250
I    5228
D    5112
G    5107
A    5100
H    5018
Name: product_code, dtype: int64

101.18    16
97.28     15
96.06     14
108.72    14
102.84    13
          ..
63.89      1
196.57     1
213.79     1
54.17      1
67.73      1
Name: loading, Length: 14652, dtype: int64

material_7    31566
material_5    15779
Name: attribute_0, dtype: int64

material_6    15872
material_5    15590
material_8    10865
material_7     5018
Name: attribute_1, dtype: int64

6    15877
9    15435
5     5765
8     5250
7     5018
Name: attribute_2, dtype: int64

8    11015
9    10361
5    10328
4     5422
6     5112
7     5107
Name: attribute_3, dtype: int64

6     4788
7     4719
8     4378
5     4334
9     3795
4     3735
10    3242
3     3000
11    2586
2     2135
12    2008
1     1632
13    1544
0     1459
14    1193
15     850
16     606
17     430
18     339
19     192
20     140
21      92
22      57
24      34
23      31
25      13
26       7
29       3
27      

In [7]:
# Number of missing values per column in the training frame (before imputation).
train.isnull().sum()

product_code         0
loading            250
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      381
measurement_4      538
measurement_5      676
measurement_6      796
measurement_7      937
measurement_8     1048
measurement_9     1227
measurement_10    1300
measurement_11    1468
measurement_12    1601
measurement_13    1774
measurement_14    1874
measurement_15    2009
measurement_16    2110
measurement_17    2284
failure              0
dtype: int64

# Data Preprocessing

In [8]:
def data_preprocessing(df_train, df_test, n_top=10, huber_epsilon=1.9,
                       huber_max_iter=100, knn_neighbors=3):
    """Engineer features and impute missing values on train and test jointly.

    The two frames are stacked, processed together and split back by row count.

    Steps:
      1. Add ``m3_missing`` / ``m5_missing`` indicator columns and
         ``area = attribute_2 * attribute_3``.
      2. Rank ``measurement_3`` .. ``measurement_16`` by the sum of their three
         largest absolute correlations with the other candidate columns and
         keep the ``n_top`` best as imputation targets (``measurement_17`` is
         always a target and uses a hard-coded predictor table).
      3. For every product code and every target, pick the four most
         correlated columns within that product as predictors.
      4. Per product code: fill each target with a ``HuberRegressor`` on rows
         whose predictors are all present (phase 1), then fill everything that
         is still missing with a ``KNNImputer`` (phase 2).
      5. Add ``measurement_avg``, the row mean of ``measurement_3`` ..
         ``measurement_16``.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Frames containing ``product_code``, ``loading``, ``attribute_0..3`` and
        ``measurement_0..17``. A ``failure`` column, if present, is carried
        through untouched and is never used as an imputation predictor.
    n_top : int, default 10
        Number of top-ranked measurement columns imputed by regression.
    huber_epsilon : float, default 1.9
        ``epsilon`` passed to ``HuberRegressor``.
    huber_max_iter : int, default 100
        ``max_iter`` passed to ``HuberRegressor``. The recorded run of this
        notebook shows iteration-limit warnings (presumably from these fits);
        raise this value to let the fits converge.
    knn_neighbors : int, default 3
        ``n_neighbors`` passed to ``KNNImputer``.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Independent copies of the processed train and test rows, in the
        original row order, with the engineered columns appended.

    Side effects
    ------------
    Prints the correlation ranking and the selected predictor table.
    """
    data = pd.concat([df_train, df_test])

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # key: imputation target column
    # value: for each product code, the list of predictor columns used in Phase 1
    fill_dict = dict()
    fill_dict['measurement_17'] = {
        'A': ['measurement_5', 'measurement_6', 'measurement_8'],
        'B': ['measurement_4', 'measurement_5', 'measurement_7'],
        'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
        'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
        'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'],
        'F': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7'],
        'G': ['measurement_4', 'measurement_6', 'measurement_8', 'measurement_9'],
        'H': ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
        'I': ['measurement_3', 'measurement_7', 'measurement_8']
    }

    # 1. rank the measurement columns by correlation (measurement_17 is handled above)
    # Columns that must not take part in the correlation analysis.
    exclude_column = ['product_code', 'loading', 'attribute_0', 'attribute_1',
                      'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1',
                      'm3_missing', 'm5_missing']
    # The target must never become an imputation predictor: it would leak the
    # label into the features and is entirely missing for the test rows.
    if 'failure' in data.columns:
        exclude_column.append('failure')

    col_a = [f'measurement_{i}' for i in range(3, 17)]
    col_b = []
    corr_df = data.drop(exclude_column, axis=1).corr()

    # correlation score = sum of the 3 largest absolute correlations, skipping
    # position 0, which is the column's correlation with itself (1.0)
    for name in col_a:
        corr = sorted(np.absolute(corr_df[name]), reverse=True)
        col_b.append(np.sum(corr[1:4]))

    corr_rank = pd.DataFrame()
    corr_rank['columnName'] = col_a
    corr_rank['correlation score'] = col_b
    corr_rank = corr_rank.sort_values(
        by='correlation score', ascending=False).reset_index(drop=True)
    print("corr_rank:")
    print(corr_rank)

    # 2. select predictors: for each product code and each top-ranked target,
    #    the 4 columns with the highest absolute correlation (self excluded)
    product_codes = data["product_code"].unique()
    # The per-product correlation matrix does not depend on the target column,
    # so compute it once per product instead of once per (target, product).
    abs_corr_by_code = {
        code: data[data["product_code"] == code].drop(
            exclude_column, axis=1).corr().abs()
        for code in product_codes
    }
    for target_column in corr_rank['columnName'].iloc[:n_top]:
        tmp_fill_dict = {}
        for code in product_codes:
            corr = abs_corr_by_code[code][target_column].sort_values(
                ascending=False)
            tmp_fill_dict[code] = corr[1:5].index.tolist()

        fill_dict[target_column] = tmp_fill_dict

    print("fill_dict:")
    for x, y in fill_dict.items():
        print(x)
        print("     ", y)
        print()

    feature_miss = [f'measurement_{i}' for i in range(0, 18)] + ['loading']

    # 3. fill missing values (group by product_code)
    for code in product_codes:
        code_mask = data["product_code"] == code

        # Phase1: HuberRegressor
        # fill the missing values of each target column from its predictors
        for target_column, features_by_code in fill_dict.items():
            # A product code without a predictor list (e.g. one missing from the
            # hard-coded measurement_17 table) is left entirely to Phase 2.
            feat_column = features_by_code.get(code)
            if not feat_column:
                continue

            # Re-slice on every pass: earlier targets may already have been filled.
            tmp = data.loc[code_mask]
            tmp_train = tmp[feat_column + [target_column]].dropna(how='any')

            # rows of this product whose target is missing but whose predictors
            # are all present
            fill_mask = (code_mask
                         & data[feat_column].notnull().all(axis=1)
                         & data[target_column].isnull())

            # fit()/predict() raise on zero samples, so skip when there is
            # nothing to learn from or nothing to fill.
            if tmp_train.empty or not fill_mask.any():
                continue

            model_HR = HuberRegressor(epsilon=huber_epsilon,
                                      max_iter=huber_max_iter)
            model_HR.fit(tmp_train[feat_column], tmp_train[target_column])

            data.loc[fill_mask, target_column] = model_HR.predict(
                data.loc[fill_mask, feat_column])

        # Phase2: KNNImputer
        # fill the remaining missing cells
        model_KNN = KNNImputer(n_neighbors=knn_neighbors)
        data.loc[code_mask, feature_miss] = model_KNN.fit_transform(
            data.loc[code_mask, feature_miss])

    columns = [f'measurement_{i}' for i in range(3, 17)]
    data['measurement_avg'] = data[columns].mean(axis=1)

    # Copies, so later column assignments on the results neither touch the
    # shared working frame nor trigger SettingWithCopy warnings.
    n_train = df_train.shape[0]
    df_train_new = data.iloc[:n_train, :].copy()
    df_test_new = data.iloc[n_train:, :].copy()

    return df_train_new, df_test_new

In [9]:
# Impute and feature-engineer both frames in one pass.
# NOTE: this rebinds `train` and `test`, so re-running the cell feeds the
# already-processed frames back in; re-run the data-loading cell first.
train, test = data_preprocessing(train, test)

corr_rank:
        columnName  correlation score
0    measurement_8           0.454339
1   measurement_11           0.395141
2    measurement_5           0.386080
3    measurement_6           0.364810
4    measurement_7           0.335832
5    measurement_4           0.330860
6   measurement_15           0.300999
7   measurement_10           0.300148
8   measurement_16           0.251591
9   measurement_14           0.224950
10   measurement_9           0.200675
11  measurement_13           0.166342
12  measurement_12           0.142478
13   measurement_3           0.091591
fill_dict:
measurement_17
      {'A': ['measurement_5', 'measurement_6', 'measurement_8'], 'B': ['measurement_4', 'measurement_5', 'measurement_7'], 'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'], 'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'], 'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'], 'F': ['measurement_4', 'measurement_5', 'meas

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [10]:
# Sanity check after preprocessing: every column should now report 0 missing values.
train.isnull().sum()

product_code       0
loading            0
attribute_0        0
attribute_1        0
attribute_2        0
attribute_3        0
measurement_0      0
measurement_1      0
measurement_2      0
measurement_3      0
measurement_4      0
measurement_5      0
measurement_6      0
measurement_7      0
measurement_8      0
measurement_9      0
measurement_10     0
measurement_11     0
measurement_12     0
measurement_13     0
measurement_14     0
measurement_15     0
measurement_16     0
measurement_17     0
failure            0
m3_missing         0
m5_missing         0
area               0
measurement_avg    0
dtype: int64

In [11]:
# Columns available after preprocessing, including the engineered ones
# (m3_missing, m5_missing, area, measurement_avg).
print(train.columns)
# train['m5_missing'].value_counts()

Index(['product_code', 'loading', 'attribute_0', 'attribute_1', 'attribute_2',
       'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2',
       'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
       'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10',
       'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14',
       'measurement_15', 'measurement_16', 'measurement_17', 'failure',
       'm3_missing', 'm5_missing', 'area', 'measurement_avg'],
      dtype='object')


In [12]:
class ProductSplitter3v2():
    """Leave-two-products-out splitter.

    Every unordered pair of product codes is held out once as the validation
    set while all remaining product codes form the training set. With five
    product codes this gives 10 folds of 3 training / 2 validation products.
    """

    def split(self, X: pd.DataFrame, _y=None, _groups=None):
        """Yield ``[train_positions, validation_positions]`` for each product pair.

        ``X`` must contain a ``product_code`` column; ``_y`` and ``_groups`` are
        accepted but not used. Positions are integer row positions (suitable
        for ``.iloc``), grouped in the key order produced by ``groupby``.
        """
        rows_by_product = [
            rows for rows in X.groupby("product_code").indices.values()]
        print(rows_by_product)

        n_products = len(rows_by_product)
        for first in range(n_products):
            for second in range(first + 1, n_products):
                held_out = (first, second)
                train_rows, val_rows = [], []
                # Route each product's rows to the validation side when it is
                # one of the two held-out products, otherwise to the train side.
                for position, rows in enumerate(rows_by_product):
                    if position in held_out:
                        val_rows.append(rows)
                    else:
                        train_rows.append(rows)
                yield [np.concatenate(train_rows), np.concatenate(val_rows)]


# Materialize the folds once so they can be iterated repeatedly; each entry
# is [train_positions, validation_positions] for one held-out product pair.
cv = ProductSplitter3v2()
train_data_split = list(cv.split(train))

[array([   0,    1,    2, ..., 5097, 5098, 5099]), array([ 5100,  5101,  5102, ..., 10347, 10348, 10349]), array([10350, 10351, 10352, ..., 16112, 16113, 16114]), array([16115, 16116, 16117, ..., 21224, 21225, 21226]), array([21227, 21228, 21229, ..., 26567, 26568, 26569])]


In [13]:
# Inspect the generated folds.
train_data_split

[[array([10350, 10351, 10352, ..., 26567, 26568, 26569]),
  array([    0,     1,     2, ..., 10347, 10348, 10349])],
 [array([ 5100,  5101,  5102, ..., 26567, 26568, 26569]),
  array([    0,     1,     2, ..., 16112, 16113, 16114])],
 [array([ 5100,  5101,  5102, ..., 26567, 26568, 26569]),
  array([    0,     1,     2, ..., 21224, 21225, 21226])],
 [array([ 5100,  5101,  5102, ..., 21224, 21225, 21226]),
  array([    0,     1,     2, ..., 26567, 26568, 26569])],
 [array([    0,     1,     2, ..., 26567, 26568, 26569]),
  array([ 5100,  5101,  5102, ..., 16112, 16113, 16114])],
 [array([    0,     1,     2, ..., 26567, 26568, 26569]),
  array([ 5100,  5101,  5102, ..., 21224, 21225, 21226])],
 [array([    0,     1,     2, ..., 21224, 21225, 21226]),
  array([ 5100,  5101,  5102, ..., 26567, 26568, 26569])],
 [array([    0,     1,     2, ..., 26567, 26568, 26569]),
  array([10350, 10351, 10352, ..., 21224, 21225, 21226])],
 [array([    0,     1,     2, ..., 21224, 21225, 21226]),
  arra

# Training

In [14]:
# Columns fed to the classifier: raw inputs first, engineered features last.
features_list = [
    'loading',
    'attribute_0',
    'measurement_17',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    # engineered in data_preprocessing
    'area',
    'm3_missing',
    'm5_missing',
    'measurement_avg',
]

In [15]:
def train_model(raw_model, train, test, features_list, splits=None):
    """Cross-validate ``raw_model`` and accumulate averaged test predictions.

    For every fold a fresh clone of ``raw_model`` is wrapped in a pipeline
    (WOE-encode ``attribute_0``, ``log1p`` on ``loading``, pass the rest
    through, then ``RobustScaler``), fitted on the fold's training rows and
    scored with ROC AUC on its validation rows. Each fold adds
    ``1 / n_folds`` of its test-set probabilities to a running average.

    Parameters
    ----------
    raw_model : estimator
        Unfitted classifier exposing ``predict_proba``; cloned per fold.
    train : pd.DataFrame
        Training frame containing ``features_list`` and a ``failure`` column.
    test : pd.DataFrame
        Test frame containing ``features_list``.
    features_list : list of str
        Feature columns; must include ``attribute_0`` and ``loading``.
    splits : iterable of (train_positions, validation_positions), optional
        Positional folds. Defaults to the notebook-level ``train_data_split``.

    Side effects
    ------------
    Prints the running test prediction after each fold, the mean validation
    score and the per-fold scores, and dumps the fitted pipelines to
    ``models.sav`` in the working directory. Returns ``None``.
    """
    if splits is None:
        splits = train_data_split  # folds prepared earlier in the notebook
    splits = list(splits)
    n_folds = len(splits)

    models = list()
    test_pred = np.zeros(test.shape[0])
    # One entry per fold actually run. A list pre-sized to a fixed 10 skewed
    # the mean with fewer folds and raised IndexError with more.
    scores = []

    # Fold-invariant selections, taken once instead of once per fold.
    X_all = train[features_list]
    y_all = train['failure']
    X_test = test[features_list]

    # k-fold cross validation
    for train_idx, val_idx in splits:

        # only train on target features
        X_train = X_all.iloc[train_idx]
        X_val = X_all.iloc[val_idx]
        y_train = y_all.iloc[train_idx]
        y_val = y_all.iloc[val_idx]

        # model
        model = make_pipeline(
            make_column_transformer(
                (WOEEncoder(), ['attribute_0']),
                (FunctionTransformer(np.log1p), ['loading']),
                remainder='passthrough'
            ),
            RobustScaler(),
            clone(raw_model)
        )
        model.fit(X_train, y_train)

        models.append(model)    # save models

        # predict
        val_pred = model.predict_proba(X_val)[:, 1]
        test_pred += model.predict_proba(X_test)[:, 1] / n_folds
        print(test_pred)  # running average of the test predictions so far
        scores.append(roc_auc_score(y_val, val_pred))

    print(f'val score:{np.mean(scores)}')
    print(scores)

    joblib.dump(models, 'models.sav', compress=1)   # save models

    # submission['failure'] = test_pred
    # submission.to_csv('submission_test.csv', index=False)

# Model

In [16]:
# Hyper-parameters of the L2-regularised logistic regression.
params = {
    'penalty': 'l2',
    'C': 0.00959152,
    'solver': 'liblinear',
    'max_iter': 200,
    'random_state': 100,
}

# Build the base estimator and run the cross-validated training loop.
model = LogisticRegression(**params)
train_model(model, train, test, features_list)

[0.02150519 0.01890775 0.02069056 ... 0.0139574  0.02380552 0.01633407]
[0.03846458 0.03213876 0.03596896 ... 0.02687707 0.04808423 0.03470692]
[0.05519484 0.04644268 0.05136284 ... 0.04047652 0.0713558  0.05175   ]
[0.0740684  0.06227275 0.06884745 ... 0.05420517 0.09539902 0.06863177]
[0.09585711 0.08116444 0.08929533 ... 0.06774899 0.11881448 0.08575125]
[0.11769712 0.10133193 0.11002138 ... 0.0816926  0.14124213 0.10188741]
[0.13725111 0.11882971 0.1287775  ... 0.09606495 0.16538905 0.11827717]
[0.1561371  0.13481876 0.14578738 ... 0.10928234 0.1885312  0.13616298]
[0.17351452 0.14939264 0.16169572 ... 0.12300877 0.2125481  0.15386802]
[0.19163952 0.16517639 0.17832202 ... 0.1367218  0.23582301 0.17094061]
val score:0.5895491568757363
[0.5934175474091082, 0.5856071622850465, 0.5954750166227579, 0.5874186761769179, 0.5892700578974246, 0.5950661853867248, 0.5859412846314811, 0.5886073827692562, 0.5841300592766452, 0.5905581963020015]


In [None]:
# !cp models.sav /content/drive/MyDrive/Current\ Workspace/Intro\ ML/