In [74]:
import pandas as pd
import numpy as np
import csv
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold, GroupKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn import linear_model
from sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsClassifier
from feature_engine.encoding import WoEEncoder
import pickle
import pickle

In [75]:
# load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Prepocessing

In [76]:
# set datasets: target(y_train) / train(X_train) / data(X_train + X_test) 
target = train['failure']
train = train.drop('failure', axis=1)
data = pd.concat([train, test])

In [77]:
# adding missing values as extra columns => observe failure rate when feature is missing 
data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)

In [78]:
cols = [col for col in test.columns if col.startswith('measurement')]

In [79]:
# float type columns => find 4 most correlated columns for measurement_3 ~ measurement_17
col_corrsum = list()
col_name = list()
for x in range(3, 18):   
    corr = np.absolute(data[cols].corr()[f'measurement_{x}']).sort_values(ascending=False)
    col_corrsum.append(np.round(np.sum(corr[1:5]),5))
    col_name.append(f'measurement_{x}')

# 計算哪些欄位 和其他欄位有較大關聯性
# create df for it
c = pd.DataFrame()
c['Selected columns'] = col_name
c['correlation total'] = col_corrsum
c = c.sort_values(by = 'correlation total', ascending=False).reset_index(drop = True)

# 和其他欄位有較大關聯性的這些欄位 在各product code中關聯性表現如何 
# create dictionary for it => convenient for looking up
dict = {}
for i in range(8):
    selected_col = c.loc[i, 'Selected columns']
    selected_col_dic = {}
    for pc in data.product_code.unique(): 
        corr = np.absolute(data.loc[data.product_code == pc, cols].corr()[selected_col]).sort_values(ascending=False)
        # print(f'product code: {x}, selected columns: {selected_col}, corr: ')
        # print(corr)
        selected_col_dic[pc] = list(corr[1:5].index)
    dict[selected_col] = selected_col_dic

In [80]:
features = [col for col in data.columns if col.startswith('measurement') or col=='loading']
null_cols = [col for col in train.columns if train[col].isnull().sum()!=0]

In [81]:
# fill NaN/null value 
for pc in data.product_code.unique():
    
    num_NaN_filled_byModel = 0
    num_NaN_filled_byKNN = 0

    # print(f'Product code {pc}: filled by linear model (HuberRegressor):')
    for selected_col in dict.keys():
        # try to fill NaN (if meeting requirements about features needed) => use those to train a model 
        data_with_specific_pc = data[data.product_code == pc]
        features_train = [selected_col] + dict[selected_col][pc]   # dict[selected_col][pc]=look up dictionary(for selected_col performing in the specific product code)
        L_train = data_with_specific_pc[features_train].dropna(axis=0, how='any')  # use features_train to train linear model(also drop NaN row)
        LX_train = L_train[dict[selected_col][pc]]
        Ly_train = L_train[selected_col]
        subdata_pc = data[data.product_code == pc][features_train]
        condition1 = (subdata_pc[dict[selected_col][pc]].isnull().sum(axis=1) == 0)  # axis=1 : cross columns
        condition2 = (subdata_pc[selected_col].isnull())
        LX_test = subdata_pc[condition1 & condition2]
        # print(len(LX_train), len(LX_test))

        if(len(LX_train)!=0 and len(LX_test)!=0):
            model = HuberRegressor(epsilon=1.35, max_iter=400)
            model.fit(LX_train, Ly_train)
            prediction = model.predict(data[data.product_code == pc].loc[LX_test.index.tolist(), dict[selected_col][pc]])
            data[data.product_code == pc].loc[LX_test.index.tolist(), selected_col]= prediction
            # print(f"fill '{selected_col}': #row = {len(LX_test)}")
            num_NaN_filled_byModel += len(LX_test)
        
    # columns still NaN/null (result from not meeting requirements about features needed) => use 'features' to impute
    num_NaN_filled_byKNN = data.loc[data["product_code"] == pc, null_cols].isnull().sum().sum()
    knn = KNNImputer(n_neighbors=3)
    data.loc[data.product_code==pc, features] = knn.fit_transform(data.loc[data.product_code==pc, features])

print(f'#filled (by linear model) = {num_NaN_filled_byModel}') 
print(f'#filled (by KNN) = {num_NaN_filled_byKNN}')

#filled (by linear model) = 1499
#filled (by KNN) = 3829


In [82]:
# reload train/test because preprocessing
train = data.iloc[:len(train),:]
test = data.iloc[len(train):,:]

In [83]:
X = train
y = target
# labeling for 'attribute_0' => use WoEEncoder
woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(X, y)
X = woe_encoder.transform(X)
test = woe_encoder.transform(test)

In [84]:
# pick based on correlation analysis and important_list 
features_for_training = ['loading',
                        'attribute_0',
                        'measurement_17',
                        'measurement_0',
                        'measurement_1',
                        'measurement_2',
                        'm3_missing',
                        'm5_missing',
                        # 'attribute_3',
                        # 'measurement_4',
                        ]

In [85]:
# scaling for the certain features columns 
def scaling_specific_features(df_list, features):
    
    df_new_list = []
    scaler = StandardScaler()

    for i, df in enumerate(df_list):
        # fit and transform
        if i==0:
            df_scaled = scaler.fit_transform(df[features])
        else:
            df_scaled = scaler.transform(df[features])
        # put it back
        df_new = df.copy()
        df_new[features] = df_scaled
        df_new_list.append(df_new)
    
    return df_new_list

Load model to predict

In [86]:
# load model
model = pickle.load(open('model_best_publicScore.pkl', 'rb'))

In [87]:
x_train, x_test = scaling_specific_features([X, test], features_for_training)
prediction_full = model.predict_proba(x_test[features_for_training])[:, 1]

prediction_full

array([0.20192883, 0.19422158, 0.19818543, ..., 0.18903897, 0.21466629,
       0.19475708])

Output CSV file

In [88]:
# output csv file
with open('submission_best_publicScore_pickle.csv', 'w', newline='') as csvfile:
  csv_writer = csv.writer(csvfile)
  csv_writer.writerow(["id", "failure"])
  for id, value in zip(list(test.id), prediction_full):
    csv_writer.writerow([id, value])