In [None]:
%pip install feature-engine

In [41]:
import pandas as pd
import numpy as np
import csv
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold, GroupKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn import linear_model
from sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsClassifier
from feature_engine.encoding import WoEEncoder
import pickle

In [42]:
# load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [43]:
# set datasets: target(y_train) / train(X_train) / data(X_train + X_test) 
target = train['failure']
train = train.drop('failure', axis=1)
data = pd.concat([train, test])

In [44]:
# adding missing values as extra columns => observe failure rate when feature is missing 
data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)

In [45]:
cols = [col for col in test.columns if col.startswith('measurement')]

In [46]:
# float type columns => find 4 most correlated columns for measurement_3 ~ measurement_17
col_corrsum = list()
col_name = list()
for x in range(3, 18):   
    corr = np.absolute(data[cols].corr()[f'measurement_{x}']).sort_values(ascending=False)
    col_corrsum.append(np.round(np.sum(corr[1:5]),5))
    col_name.append(f'measurement_{x}')

# 計算哪些欄位 和其他欄位有較大關聯性
# create df for it
c = pd.DataFrame()
c['Selected columns'] = col_name
c['correlation total'] = col_corrsum
c = c.sort_values(by = 'correlation total', ascending=False).reset_index(drop = True)

# 和其他欄位有較大關聯性的這些欄位 在各product code中關聯性表現如何 
# create dictionary for it => convenient for looking up
dict = {}
for i in range(8):
    selected_col = c.loc[i, 'Selected columns']
    selected_col_dic = {}
    for pc in data.product_code.unique(): 
        corr = np.absolute(data.loc[data.product_code == pc, cols].corr()[selected_col]).sort_values(ascending=False)
        # print(f'product code: {x}, selected columns: {selected_col}, corr: ')
        # print(corr)
        selected_col_dic[pc] = list(corr[1:5].index)
    dict[selected_col] = selected_col_dic

In [47]:
features = [col for col in data.columns if col.startswith('measurement') or col=='loading']
null_cols = [col for col in train.columns if train[col].isnull().sum()!=0]

In [48]:
# fill NaN/null value 
for pc in data.product_code.unique():
    
    num_NaN_filled_byModel = 0
    num_NaN_filled_byKNN = 0

    # print(f'Product code {pc}: filled by linear model (HuberRegressor):')
    for selected_col in dict.keys():
        # try to fill NaN (if meeting requirements about features needed) => use those to train a model 
        data_with_specific_pc = data[data.product_code == pc]
        features_train = [selected_col] + dict[selected_col][pc]   # dict[selected_col][pc]=look up dictionary(for selected_col performing in the specific product code)
        L_train = data_with_specific_pc[features_train].dropna(axis=0, how='any')  # use features_train to train linear model(also drop NaN row)
        LX_train = L_train[dict[selected_col][pc]]
        Ly_train = L_train[selected_col]
        subdata_pc = data[data.product_code == pc][features_train]
        condition1 = (subdata_pc[dict[selected_col][pc]].isnull().sum(axis=1) == 0)  # axis=1 : cross columns
        condition2 = (subdata_pc[selected_col].isnull())
        LX_test = subdata_pc[condition1 & condition2]
        # print(len(LX_train), len(LX_test))

        if(len(LX_train)!=0 and len(LX_test)!=0):
            model = HuberRegressor(epsilon=1.35, max_iter=400)
            model.fit(LX_train, Ly_train)
            prediction = model.predict(data[data.product_code == pc].loc[LX_test.index.tolist(), dict[selected_col][pc]])
            data[data.product_code == pc].loc[LX_test.index.tolist(), selected_col]= prediction
            # print(f"fill '{selected_col}': #row = {len(LX_test)}")
            num_NaN_filled_byModel += len(LX_test)
        
    # columns still NaN/null (result from not meeting requirements about features needed) => use 'features' to impute
    num_NaN_filled_byKNN = data.loc[data["product_code"] == pc, null_cols].isnull().sum().sum()
    knn = KNNImputer(n_neighbors=3)
    data.loc[data.product_code==pc, features] = knn.fit_transform(data.loc[data.product_code==pc, features])

print(f'#filled (by linear model) = {num_NaN_filled_byModel}') 
print(f'#filled (by KNN) = {num_NaN_filled_byKNN}')

#filled (by linear model) = 1499
#filled (by KNN) = 3829


In [49]:
# scaling for the certain features columns 
def scaling_specific_features(df_list, features):
    
    df_new_list = []
    scaler = StandardScaler()

    for i, df in enumerate(df_list):
        # fit and transform
        if i==0:
            df_scaled = scaler.fit_transform(df[features])
        else:
            df_scaled = scaler.transform(df[features])
        # put it back
        df_new = df.copy()
        df_new[features] = df_scaled
        df_new_list.append(df_new)
    
    return df_new_list

In [50]:
# # labeling for 'attribute_1' => use LabelEncoder
# le = LabelEncoder()
# data['attribute_1'] = le.fit_transform(data['attribute_1'])

In [51]:
# # clip for 'measurement_2' => y correlated when this values > 10
# data['measurement_2'] = data['measurement_2'].clip(lower=11, upper=None)

In [52]:
# reload train/test because preprocessing
train = data.iloc[:len(train),:]
test = data.iloc[len(train):,:]

In [53]:
X = train
y = target
# labeling for 'attribute_0' => use WoEEncoder
woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(X, y)
X = woe_encoder.transform(X)
test = woe_encoder.transform(test)

In [54]:
# pick based on correlation analysis and important_list 
features_for_training = ['loading',
                        'attribute_0',
                        'measurement_17',
                        'measurement_0',
                        'measurement_1',
                        'measurement_2',
                        'm3_missing',
                        'm5_missing',
                        # 'attribute_3',
                        # 'measurement_4',
                        ]

In [55]:
# try different strategies of cross validation
for kf in [StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=0), StratifiedKFold(n_splits=5, shuffle=True, random_state=0), GroupKFold(n_splits=5)]:
    print(f'Strategy : {kf}')
    prediction = np.zeros(len(test))
    roc_auc = 0
    importance_list = []

    # fold loop
    for num_fold, (train_index, val_index) in enumerate(kf.split(X, y, X.product_code)):
        # set dataset 
        x_train = X.iloc[train_index]
        x_valid = X.iloc[val_index]
        y_train = y.iloc[train_index]
        y_valid = y.iloc[val_index]
        
        # scaling
        df_list = [x_train, x_valid, test]
        x_train, x_valid, x_test = scaling_specific_features(df_list, features_for_training)

        # training
        model = linear_model.LogisticRegression(max_iter=100, C=0.0001, penalty='l2', solver='liblinear')
        model.fit(x_train[features_for_training], y_train)
        importance_list.append(model.coef_.ravel())

        # validation
        y_pred_valid = model.predict_proba(x_valid[features_for_training])[:, 1]
        print(f"Fold: {num_fold+1} => ROC-AUC: {round(roc_auc_score(y_valid, y_pred_valid), 5)}")
        roc_auc += roc_auc_score(y_valid, y_pred_valid) / 5
        prediction += model.predict_proba(x_test[features_for_training])[:, 1] / 5

    print(f"Average ROC-AUC = {roc_auc}")

Strategy : StratifiedGroupKFold(n_splits=5, random_state=0, shuffle=True)
Fold: 1 => ROC-AUC: 0.58145
Fold: 2 => ROC-AUC: 0.59798
Fold: 3 => ROC-AUC: 0.59146
Fold: 4 => ROC-AUC: 0.59397
Fold: 5 => ROC-AUC: 0.58682
Average ROC-AUC = 0.5903374932212958
Strategy : StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
Fold: 1 => ROC-AUC: 0.60195
Fold: 2 => ROC-AUC: 0.59369
Fold: 3 => ROC-AUC: 0.58225
Fold: 4 => ROC-AUC: 0.59426
Fold: 5 => ROC-AUC: 0.58649
Average ROC-AUC = 0.591726973363407
Strategy : GroupKFold(n_splits=5)
Fold: 1 => ROC-AUC: 0.58682
Fold: 2 => ROC-AUC: 0.58145
Fold: 3 => ROC-AUC: 0.59146
Fold: 4 => ROC-AUC: 0.59798
Fold: 5 => ROC-AUC: 0.59397
Average ROC-AUC = 0.5903374932212958


In [56]:
# y_pred
prediction 

array([0.39069472, 0.38167169, 0.38640681, ..., 0.37460454, 0.40397887,
       0.38163977])

In [57]:
# output csv file
with open('submission_best_privateScore.csv', 'w', newline='') as csvfile:
  csv_writer = csv.writer(csvfile)
  csv_writer.writerow(["id", "failure"])
  for id, value in zip(list(test.id), prediction):
    csv_writer.writerow([id, value])

Use full train dataset to train

In [58]:
# training
model = linear_model.LogisticRegression(max_iter=100, C=0.0001, penalty='l2', solver='newton-cg')
x_train, x_test = scaling_specific_features([X, test], features_for_training)
model.fit(x_train[features_for_training], y)
prediction_full = model.predict_proba(x_test[features_for_training])[:, 1]

# y_pred
prediction_full

array([0.20192883, 0.19422158, 0.19818543, ..., 0.18903897, 0.21466629,
       0.19475708])

In [59]:
# save trained model 
pickle.dump(model, open('model_best_publicScore.pkl', 'wb'))

In [60]:
# output csv file
with open('submission_best_publicScore.csv', 'w', newline='') as csvfile:
  csv_writer = csv.writer(csvfile)
  csv_writer.writerow(["id", "failure"])
  for id, value in zip(list(test.id), prediction_full):
    csv_writer.writerow([id, value])

Find better parameter

In [None]:
# X = train_df__X
# y = train_df__y
# test = test_df
# for kf in [StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=0), StratifiedKFold(n_splits=5, shuffle=True, random_state=0), GroupKFold(n_splits=5)]:
#     print(f'Strategy : {kf}')
#     prediction = np.zeros(len(test))
#     best_roc_auc = 0
#     best_accuracy = 0
#     best_roc_auc_model = None
#     best_accuracy_model = None
#     importance_list = []

#     penalties = ['none', 'l2', 'l1', 'elasticnet']
#     Cs = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
#     solvers = ['lbfgs', 'newton-cg', 'newton-cholesky', 'liblinear', 'sag', 'saga']
#     l1_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

#     for solver in solvers[:3]:
#         for penalty in penalties[:2]:
#             for C in Cs:
#                 roc_auc = 0
#                 accuracy = 0
#                 for num_fold, (train_index, val_index) in enumerate(kf.split(X, y, X.product_code)):
#                     x_train = X.iloc[train_index]
#                     x_valid = X.iloc[val_index]
#                     y_train = y.iloc[train_index]
#                     y_valid = y.iloc[val_index]
#                     x_train, x_valid, x_test = scaling_specific_features(x_train, x_valid, test, features_for_training)

#                     model = linear_model.LogisticRegression(max_iter=100, C=C, penalty=penalty, solver=solver)
#                     # model = linear_model.LogisticRegression(max_iter=100, C=C, penalty=penalty, solver=solver, random_state=0)
#                     # model = linear_model.LogisticRegression(max_iter=100, C=C, penalty=penalty, solver=solver, random_state=0, l1_ratio=l1_ratio)
#                     model.fit(x_train[features_for_training], y_train)
#                     importance_list.append(model.coef_.ravel())

#                     y_pred_valid = model.predict_proba(x_valid[features_for_training])[:, 1]
#                     y_pred_valid2 = model.predict(x_valid[features_for_training])

#                     # print(f"Fold: {num_fold+1} => ROC-AUC: {round(roc_auc_score(y_valid, y_pred_valid), 5)}  accuracy: {round(accuracy_score(y_valid, y_pred_valid2),5)}")
#                     roc_auc += roc_auc_score(y_valid, y_pred_valid) / 5
#                     accuracy += accuracy_score(y_valid, y_pred_valid2) / 5
#                     prediction += model.predict_proba(x_test[features_for_training])[:, 1] / 5

#                 print(f"kf={kf} C={C} penalty={penalty} solver={solver} l1_ratio={l1_ratio}")
#                 print(f"Average ROC-AUC = {roc_auc}  Average accuracy = {accuracy}")
#                 if roc_auc>best_roc_auc:
#                     best_roc_auc = roc_auc
#                     best_roc_auc_model = f"kf={kf} C={C} penalty={penalty} solver={solver} l1_ratio={l1_ratio}"
#                 if accuracy>best_accuracy:
#                     best_accuracy = accuracy
#                     best_accuracy_model = f"kf={kf} C={C} penalty={penalty} solver={solver} l1_ratio={l1_ratio}"

In [601]:
# print(best_roc_auc, best_roc_auc_model)
# print(best_accuracy, best_accuracy_model)

Use Ensemble (w/Soft voting) 

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split, GroupKFold
# from sklearn.svm import SVC, SVR
# from sklearn.metrics import accuracy_score, roc_auc_score
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.pipeline import make_pipeline
# from sklearn.impute import KNNImputer
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation
# import warnings
# from sklearn.neural_network import MLPClassifier

# train_df = pd.read_csv('train.csv', index_col='id')
# test_df = pd.read_csv('test.csv', index_col='id')

In [None]:
# y = train_df.failure

# def get_models():
#     models = []
#     models.append(('lr_1', LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=1)))
#     models.append(('lr_2', LogisticRegression(max_iter=500, C=0.0001, penalty='l2', solver='newton-cg', random_state=1)))
#     models.append(('bayes', GaussianNB(var_smoothing=0.5, priors=[len(y[y == 0]) / len(y), len(y[y == 1])/len(y)])))
#     models.append(('adaboost', AdaBoostClassifier(n_estimators=100, random_state=12)))
#     models.append(('mlp', MLPClassifier(alpha=0.1, max_iter=1000)))
#     return models

In [None]:
# accuracys = []
# y_pred = []
# importance_features = []
# # GroupKFold：保證同一個group的數據不會同時出現在訓練集和測試集上
# gkf = GroupKFold(n_splits=5) # 5 product codes => n_splits=5
# all_features = test_df.columns

# for num_fold, (train_index, valid_index) in enumerate(gkf.split(train_df, train_df.failure, groups=train_df.product_code)):
#     X_train = train_df.iloc[train_index][all_features]
#     y_train = train_df.iloc[train_index].failure
#     X_valid = train_df.iloc[valid_index][all_features]
#     y_valid = train_df.iloc[valid_index].failure
#     X_test = test_df.copy()

#     # one-hot encoding for attribute_0, attribute_1
#     # drop first：ensure there are no "reference" columns (= the remaining columns become linearly independent.)
#     encoding_attributes = ['attribute_0', 'attribute_1']
#     encoding_output = ['encoding_7', 'encoding_6', 'encoding_8']
#     encoder = OneHotEncoder(categories=[['material_5', 'material_7'],['material_5', 'material_6', 'material_8']],
#                         drop='first', sparse_output=False, handle_unknown='ignore')
#     encoder.fit(X_train[encoding_attributes])

#     for df in [X_train, X_valid, X_test]:
#         with warnings.catch_warnings():  # ignore "Found unknown categories"
#             warnings.filterwarnings('ignore', category=UserWarning)
#             df[encoding_output] = encoder.transform(df[encoding_attributes])  # add new features after encoding
#         df.drop(columns=encoding_attributes, inplace=True)  # delete old features before encoding

#     # add new features for missing values
#     for df in [X_train, X_valid, X_test]:
#         df['m_3_missing'] = df.measurement_3.isna()
#         df['m_5_missing'] = df.measurement_5.isna()
    
#     # Impute the missing values based on product code 
#     features_may_have_nan = [c for c in X_train.columns if c == 'loading' or c.startswith('measurement')]
#     imputer = KNNImputer(n_neighbors=3)
#     for df in [X_train, X_valid, X_test]:
#         for pc in list((df.groupby('product_code')['product_code']).count().index):
#             df.loc[df['product_code']==pc, features_may_have_nan] = imputer.fit_transform(df.loc[df['product_code']==pc, features_may_have_nan])

#     # Clip measurement_2 => y correlated when this values > 10 
#     for df in [X_train, X_valid, X_test]:
#         df['measurement_2'] = df['measurement_2'].clip(lower=11, upper=None)
    
#     # Standardize data (StandardScaler) + train model
#     features_for_training = ['loading', 'attribute_3', 'measurement_2', 'measurement_4', 'measurement_17', 'm_3_missing', 'm_5_missing']
#     # features_for_training = [f for f in X_train.columns if f != 'product_code']  # remove product_code
#     scores = []
#     for (name, md) in get_models():
#         model = make_pipeline(StandardScaler(), 
#                             md)
                          
#         model.fit(X_train[features_for_training], y_train)
#         # importance_features.append(model.named_steps['logisticregression'].coef_.ravel())

#         # validation
#         y_pred_valid = model.predict(X_valid[features_for_training])
#         score = accuracy_score(y_valid, y_pred_valid)
#         print(f"Model {name}  Fold {num_fold}: accuracy = {score:.5f}")
#         scores.append(score)
#     accuracys.append(scores)
#     y_pred.append(model.predict(X_test[features_for_training]))

# # print overall score
# # print(f"Average accuracy = {sum(accuracys) / len(accuracys):.5f}")
# print('============================================')
# model_scores = []
# for i in range(len(accuracys[0])):
#     sum = 0
#     for j in range(len(accuracys)):
#         sum += accuracys[j][i]
#     print(f"Average accuracy = {sum / len(accuracys):.5f}")
#     model_scores.append(sum / len(accuracys))
# print('============================================')
# print(y_pred)
    