In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install feature_engine
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_addons
Successfully installed tensorflow_addons-0.19.0


In [3]:
import numpy as np
import pandas as pd
import os
import sys

import joblib # saving sklearn modules
from sklearn.impute import KNNImputer # completing missing values using k-Nearest Neighbors
from feature_engine.encoding import WoEEncoder # encode only categorical variables (type ‘object’)
from sklearn.linear_model import LogisticRegression, HuberRegressor

from tensorflow.keras import Sequential  
from tensorflow.keras.layers import BatchNormalization, Dropout, Dense
import tensorflow as tf
from tensorflow_addons.optimizers import AdamW
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [4]:
Train = pd.read_csv('/content/drive/MyDrive/大三上作業/ML_final/train.csv')
Test = pd.read_csv('/content/drive/MyDrive/大三上作業/ML_final/test.csv')

In [17]:
data = pd.concat([Train, Test])
# create lists that will be used
codes = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
drops = ['id', 'product_code', 'loading', 'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3', 'loading', 'missing3', 'missing5']
featureM = [f for f in data.columns if f.startswith('measurement') or f=='loading']
features = ['loading', 'attribute_0', 'measurement_17', 'measurement_0','measurement_1', 'measurement_2', 'attr23', 'missing3', 'missing5']

# create a dict for measurements 3-17 because they contain value lost (*reference)
measurements = {}
# set up measurement 17 manually
measurements['measurement_17'] = {
        'A': ['measurement_5','measurement_6','measurement_8','measurement_7'],
        'B': ['measurement_4','measurement_5','measurement_7','measurement_9'],
        'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
        'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
        'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
        'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
        'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
        'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
        'I': ['measurement_3','measurement_7','measurement_8','measurement_9']
    }

# when measurement_3 is missing, the failure rate is 0.160 (much lower than average)    
data['missing3'] = data['measurement_3'].isna().astype(np.int8)
# when measurement_5 is missing, the failure rate is 0.254 (much higher than average)
data['missing5'] = data['measurement_5'].isna().astype(np.int8)
# attribute_2 has two values (5 and 8) which occur only in the training data, and another value (7) occurs only in the test data, attribute_3 is similar.
data['attr23'] = data['attribute_2'] * data['attribute_3']

# get the correlation values of measurements 3-16
colName = []
value = []
for i in range(3,17):
    colName.append(f'measurement_{i}')
    corr = np.absolute(data.drop(drops, axis=1).corr()[f'measurement_{i}']).sort_values(ascending=False)
    value.append(np.sum(corr[1:4]))

# take the measurements in order of sorted correlation
measurementCorr = pd.DataFrame({'column name': colName, 'values': value})
sortedMeasurement = measurementCorr.sort_values(by='values', ascending=False).reset_index(drop=True)
for i in range(10): # only take the top 10
    whichMeasurement = sortedMeasurement.iloc[i, 0]
    best4 = {} # pick 4 measurements with best correlations for each product_code
    for c in codes: 
        corr = np.absolute(data[data.product_code == c].drop(drops, axis=1).corr()[whichMeasurement]).sort_values(ascending=False)
        best4[c] = corr[1:5].index.tolist()
    measurements[whichMeasurement] = best4 # measurement_i | product_code : measurement_i1, measurement_i2, measurement_i3, measurement_i4

# for each product_code
for code in codes:
    # fill the measurement columns with linear model
    for measure in list(measurements.keys()):
        dataM = data[data.product_code == code]
        best4 = measurements[measure][code] # product_code : measurement_i1, measurement_i2, measurement_i3, measurement_i4
        trainM = dataM[best4+[measure]].dropna(how='any') # 4 related + itself as target (all should not be null)
        testM = dataM[(dataM[best4].isnull().sum(axis=1)==0) & (dataM[measure].isnull())] # 4 related (no null) + itself as target (null)
        modelLinear = HuberRegressor(epsilon=1.9)
        modelLinear.fit(trainM[best4], trainM[measure])
        data.loc[(data.product_code==code)&(data[best4].isnull().sum(axis=1)==0)&(data[measure].isnull()), measure] = modelLinear.predict(testM[best4])
    # fill the others N/A columns with KNN (k=3)
    modelKnn = KNNImputer(n_neighbors=3)
    data.loc[data.product_code==code, featureM] = modelKnn.fit_transform(data.loc[data.product_code==code, featureM])

# encode attribute_0
train = data.iloc[:len(Train.index)]
encode = WoEEncoder(variables=['attribute_0'])
encode.fit(train, train['failure'])
train = encode.transform(train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [18]:
# first 3 codes for training, latter 2 for validation (*reference)
folds = {'fold_1': [['C', 'D', 'E'], ['A', 'B']], 
         'fold_2': [['B', 'D', 'E'], ['A', 'C']],
         'fold_3': [['B', 'C', 'E'], ['A', 'D']],
         'fold_4': [['B', 'C', 'D'], ['A', 'E']],
         'fold_5': [['A', 'D', 'E'], ['B', 'C']],
         'fold_6': [['A', 'C', 'E'], ['B', 'D']],
         'fold_7': [['A', 'C', 'D'], ['B', 'E']],
         'fold_8': [['A', 'B', 'E'], ['C', 'D']],
         'fold_9': [['A', 'B', 'D'], ['C', 'E']],
         'fold_10': [['A', 'B', 'C'], ['D', 'E']]}

tf.random.set_seed(85)
# set up the callbacks
bestModel = ModelCheckpoint('best.h5', verbose=1, save_best_only=True, monitor="auc", mode="max")
earlyStop = EarlyStopping(patience=10, restore_best_weights=True)
reduceLr = ReduceLROnPlateau(monitor="auc", factor=0.9, patience=5, mode="max", min_delta=0.0001)

# run through all the folds
for fold in folds.keys():
    print(f'\n{fold}\n')
    
    x_train, y_train = train[train['product_code'].isin(folds[fold][0])][features].values, train[train['product_code'].isin(folds[fold][0])]['failure'].values
    x_valid, y_valid = train[train['product_code'].isin(folds[fold][1])][features].values, train[train['product_code'].isin(folds[fold][1])]['failure'].values

    # create a new model for each fold
    model = Sequential()
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(len(x_train), activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(128, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))
    model.add(Dense(64, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(32, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer=AdamW(learning_rate=1e-3, weight_decay=1e-3), loss=BinaryCrossentropy(), metrics=["AUC"])
    model.fit(x_train, y_train, batch_size = 64, epochs = 100, callbacks=[earlyStop, reduceLr, bestModel], validation_data=(x_valid, y_valid))



fold_1

Epoch 1/100
Epoch 1: auc improved from -inf to 0.52493, saving model to best.h5
Epoch 2/100
Epoch 2: auc improved from 0.52493 to 0.54590, saving model to best.h5
Epoch 3/100
Epoch 3: auc improved from 0.54590 to 0.54758, saving model to best.h5
Epoch 4/100
Epoch 4: auc improved from 0.54758 to 0.56441, saving model to best.h5
Epoch 5/100
Epoch 5: auc improved from 0.56441 to 0.56763, saving model to best.h5
Epoch 6/100
Epoch 6: auc improved from 0.56763 to 0.56829, saving model to best.h5
Epoch 7/100
Epoch 7: auc improved from 0.56829 to 0.57788, saving model to best.h5
Epoch 8/100
Epoch 8: auc did not improve from 0.57788
Epoch 9/100
Epoch 9: auc did not improve from 0.57788
Epoch 10/100
Epoch 10: auc did not improve from 0.57788
Epoch 11/100
Epoch 11: auc did not improve from 0.57788
Epoch 12/100
Epoch 12: auc did not improve from 0.57788
Epoch 13/100
Epoch 13: auc did not improve from 0.57788
Epoch 14/100
Epoch 14: auc did not improve from 0.57788
Epoch 15/100
Epoch 15: au

In [19]:
from google.colab import files
files.download('best.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>