In [13]:
# from google.colab import drive
# drive.mount('/content/drive')

In [14]:
import joblib
import numpy as np
import pandas as pd
import os

from sklearn.impute import KNNImputer
from sklearn.linear_model import HuberRegressor

In [15]:
# !pip install category_encoders

In [16]:
# copy dataset
# !cp -r /content/drive/MyDrive/Current\ Workspace/Intro\ ML/dataset .
# copy model
# !cp /content/drive/MyDrive/Current\ Workspace/Intro\ ML/models.sav .

In [17]:
# Locations of the competition CSV files, relative to the working directory.
PATH_TRAIN, PATH_TEST, PATH_SAMPLE = (
    os.path.join("dataset", file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# The sample submission keeps its default index; train/test are indexed by `id`.
submission = pd.read_csv(PATH_SAMPLE)
train, test = (
    pd.read_csv(csv_path, index_col='id') for csv_path in (PATH_TRAIN, PATH_TEST)
)

In [18]:
# Preview the raw training frame (pandas truncates the display of large frames).
train
# test

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.10,material_7,material_8,9,5,7,8,4,18.040,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.100,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,12.471,16.346,18.377,10.020,15.250,15.562,16.154,17.172,826.282,0
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,10.337,17.082,19.932,12.428,16.182,12.760,13.153,16.412,579.885,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,E,158.95,material_7,material_6,6,9,6,16,4,16.301,...,,12.177,17.942,10.112,15.795,18.572,16.144,,729.131,0
26566,E,146.02,material_7,material_6,6,9,10,12,8,17.543,...,11.242,14.179,20.564,10.234,14.450,14.322,13.146,16.471,853.924,0
26567,E,115.62,material_7,material_6,6,9,1,10,1,15.670,...,11.407,16.437,17.476,8.668,15.069,16.599,15.590,14.065,750.364,0
26568,E,106.38,material_7,material_6,6,9,2,9,4,18.059,...,11.392,17.064,17.814,14.928,16.273,15.485,13.624,12.865,730.156,0


# Load model

In [19]:
# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load model files that come from a trusted source.
PATH_MODEL = 'models.sav'
# Later cells index this object by position and call predict_proba on each entry,
# so it is expected to be an indexable collection of fitted classifiers.
models = joblib.load(PATH_MODEL)

In [20]:
# Sanity check: how many models were loaded from PATH_MODEL.
print(len(models))

10


# Preprocessing

In [21]:
def data_preprocessing(df_train, df_test, verbose=True):
    """Engineer features and impute missing measurements on train and test jointly.

    The two frames are concatenated (train rows first) so that feature
    engineering and imputation see every row, then split back by position.

    Steps:
      * add ``m3_missing`` / ``m5_missing`` indicator columns and
        ``area = attribute_2 * attribute_3``;
      * rank ``measurement_3`` .. ``measurement_16`` by the sum of their three
        largest absolute correlations (self excluded) and keep the top 10;
      * for each of those columns (plus ``measurement_17``, whose predictors
        are hard-coded) and each product code, pick predictor columns and fit
        a ``HuberRegressor`` to fill missing target cells on rows where every
        predictor is present;
      * fill whatever is still missing in the measurement columns and
        ``loading`` with a per-product-code ``KNNImputer``;
      * add ``measurement_avg``, the row mean of ``measurement_3`` ..
        ``measurement_16``.

    Args:
        df_train: training frame. Assumed to contain ``product_code``,
            ``loading``, ``attribute_0`` .. ``attribute_3`` and
            ``measurement_0`` .. ``measurement_17`` — TODO confirm against
            the CSV schema.
        df_test: test frame with the same feature columns.
        verbose: when True (default, the original behaviour) print the
            correlation ranking and the selected predictor columns.

    Returns:
        ``(df_train_new, df_test_new)``: independent copies of the processed
        rows, split at ``df_train.shape[0]``.
    """
    data = pd.concat([df_train, df_test])

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # key: target measurement column to impute in Phase 1
    # value: for each product code, the predictor columns used to fill it
    fill_dict = dict()
    fill_dict['measurement_17'] = {
        'A': ['measurement_5', 'measurement_6', 'measurement_8'],
        'B': ['measurement_4', 'measurement_5', 'measurement_7'],
        'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
        'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
        'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'],
        'F': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7'],
        'G': ['measurement_4', 'measurement_6', 'measurement_8', 'measurement_9'],
        'H': ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
        'I': ['measurement_3', 'measurement_7', 'measurement_8']
    }

    # 1. Rank measurement_3..16 by correlation score.
    # Columns left out of every correlation matrix. Anything not listed here
    # (e.g. 'area', and any extra column the inputs carry) remains a candidate.
    exclude_column = ['product_code', 'loading', 'attribute_0', 'attribute_1',
                      'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1',
                      'm3_missing', 'm5_missing']
    col_a = [f'measurement_{i}' for i in range(3, 17)]
    col_b = []
    corr_df = data.drop(exclude_column, axis=1).corr()

    # Correlation score = sum of the three largest absolute correlations;
    # position 0 of the sorted list is the column's correlation with itself.
    for column in col_a:
        corr = sorted(np.absolute(corr_df[column]), reverse=True)
        col_b.append(np.sum(corr[1:4]))

    corr_rank = pd.DataFrame()
    corr_rank['columnName'] = col_a
    corr_rank['correlation score'] = col_b
    corr_rank = corr_rank.sort_values(
        by='correlation score', ascending=False).reset_index(drop=True)
    if verbose:
        print("corr_rank:")
        print(corr_rank)

    # 2. Select predictor columns: for each of the top-10 targets and each
    # product code, the four columns most correlated with the target
    # (self excluded).
    product_codes = data["product_code"].unique()
    # The per-code correlation matrix does not depend on the target column,
    # so compute it once per code instead of once per (target, code) pair.
    corr_by_code = {
        code: data[data["product_code"] == code].drop(exclude_column, axis=1).corr()
        for code in product_codes
    }
    for i in range(10):
        target_column = corr_rank['columnName'][i]
        tmp_fill_dict = {}
        for code in product_codes:
            corr = np.absolute(corr_by_code[code][target_column]).sort_values(
                ascending=False)
            tmp_fill_dict[code] = corr.iloc[1:5].index.tolist()

        fill_dict[target_column] = tmp_fill_dict

    if verbose:
        print("fill_dict:")
        for x, y in fill_dict.items():
            print(x)
            print("     ", y)
            print()

    feature_miss = [f'measurement_{i}' for i in range(0, 18)] + ['loading']

    # 3. Fill missing values, one product code at a time.
    for code in product_codes:
        code_mask = data["product_code"] == code

        # Phase 1: HuberRegressor.
        # Targets are processed in fill_dict order, so values filled for an
        # earlier target can serve as predictors for a later one.
        for target_column in list(fill_dict.keys()):
            feat_column = fill_dict[target_column][code]

            # Rows of this product code where target and all predictors exist.
            tmp_train = data.loc[code_mask, feat_column + [target_column]].dropna(how='any')

            # Rows to fill: target missing, every predictor present.
            fill_mask = (code_mask
                         & data[feat_column].notnull().all(axis=1)
                         & data[target_column].isnull())

            # Nothing to fill, or nothing to learn from: skip the regressor
            # (fit/predict reject 0-sample input); Phase 2 handles leftovers.
            if not fill_mask.any() or tmp_train.empty:
                continue

            # max_iter is deliberately left at its default: the saved notebook
            # output shows lbfgs convergence warnings from these fits, but
            # raising the limit would change the imputed values.
            model_HR = HuberRegressor(epsilon=1.9)
            model_HR.fit(tmp_train[feat_column], tmp_train[target_column])

            data.loc[fill_mask, target_column] = model_HR.predict(
                data.loc[fill_mask, feat_column])

        # Phase 2: KNNImputer fills the remaining missing cells.
        model_KNN = KNNImputer(n_neighbors=3)
        data.loc[code_mask, feature_miss] = model_KNN.fit_transform(
            data.loc[code_mask, feature_miss])

    columns = [f'measurement_{i}' for i in range(3, 17)]
    data['measurement_avg'] = data[columns].mean(axis=1)

    # Split back by position; copy so callers get independent frames rather
    # than slices of the shared intermediate frame.
    n_train = df_train.shape[0]
    df_train_new = data.iloc[:n_train, :].copy()
    df_test_new = data.iloc[n_train:, :].copy()

    return df_train_new, df_test_new

In [22]:
# NOTE: rebinding `train`/`test` makes this cell non-idempotent — re-running it
# without first re-running the CSV loading cell feeds already-processed frames
# back into data_preprocessing.
train, test = data_preprocessing(train, test)

corr_rank:
        columnName  correlation score
0    measurement_8           0.454339
1   measurement_11           0.395141
2    measurement_5           0.386080
3    measurement_6           0.364810
4    measurement_7           0.335832
5    measurement_4           0.330860
6   measurement_15           0.300999
7   measurement_10           0.300148
8   measurement_16           0.251591
9   measurement_14           0.224950
10   measurement_9           0.200675
11  measurement_13           0.166342
12  measurement_12           0.142478
13   measurement_3           0.091591
fill_dict:
measurement_17
      {'A': ['measurement_5', 'measurement_6', 'measurement_8'], 'B': ['measurement_4', 'measurement_5', 'measurement_7'], 'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'], 'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'], 'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'], 'F': ['measurement_4', 'measurement_5', 'meas

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

# Predict

In [23]:
# Columns handed to each model's predict_proba, in this order.
features_list = [
    'loading',
    'attribute_0',
    'measurement_17',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    'area',
    'm3_missing',
    'm5_missing',
    'measurement_avg',
]

In [24]:
def predict(models, test, features_list):
    """Average the positive-class probability over every model in `models`.

    Args:
        models: collection indexable by 0 .. len(models) - 1 whose entries
            expose ``predict_proba`` returning a 2-D array with the positive
            class in column 1.
        test: DataFrame holding at least the columns in `features_list`.
        features_list: column names passed to each model.

    Returns:
        1-D numpy array with one averaged probability per row of `test`.

    Raises:
        ValueError: if `models` is empty.
    """
    n_models = len(models)
    if n_models == 0:
        raise ValueError("predict() needs at least one model")

    preds = np.zeros(test.shape[0])

    # Use the real model count rather than a hard-coded 10, so the average is
    # correct for any ensemble size.
    for i in range(n_models):
        preds += models[i].predict_proba(test[features_list])[:, 1] / n_models

    return preds

In [25]:
# The averaged probabilities are assigned positionally, so the rows of `test`
# are assumed to follow the sample submission's row order — TODO confirm.
failure_proba = predict(models, test, features_list)
submission['failure'] = failure_proba
submission.to_csv('submission_0811521.csv', index=False)

In [26]:
# !cp submission_0811521.csv /content/drive/MyDrive/Current\ Workspace/Intro\ ML/dataset