In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from ltcn.LTCN import LTCN
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import pickle
import shap

In [2]:
# Data load 
data = pd.read_csv(r"C:\Users\elinavaldmane\Desktop\Thesis_4.0\Data\ProcessedData.csv")

Y = (data["ConnectionFailure"]).values
X = data.drop(columns=["ConnectionFailure"]).values

Y_df = (data["ConnectionFailure"])
X_df = data.drop(columns=["ConnectionFailure"])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

# Normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Undersampling - Random
ru = RandomUnderSampler(random_state=42)
X_train_u, y_train_u = ru.fit_resample(X_train_scaled, y_train)

print(X_train_u.shape)
print(y_train_u.shape)
print(X_test_scaled.shape)
print(y_test.shape)


(3742, 225)
(3742,)
(51302, 225)
(51302,)


In [3]:
def evaluate_model(y_true, y_pred):
    f1 = f1_score(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    print("Cappa Score:", kappa)
    print("F1 Score:", f1)
    print("Confusion matrix: ", cm)

### LR

In [4]:
model_lr = LogisticRegression()
model_lr.fit(X_train_u, y_train_u)

Y_pred = model_lr.predict(X_test_scaled)
print(evaluate_model(y_test, Y_pred))

Cappa Score: 0.0898593178805226
F1 Score: 0.10431458219552157
Confusion matrix:  [[44360  6509]
 [   51   382]]
None


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
model_lr_hp = LogisticRegression()

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], 
    'penalty': ['l1', 'l2'],  
    'solver': ['liblinear'], 
    'max_iter': [100, 200, 300], 
    'fit_intercept': [True, False],
}

grid_search = GridSearchCV(model_lr_hp, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

grid_search.fit(X_train_u, y_train_u)

best_model_lr = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Y_pred_best = best_model_lr.predict(X_test_scaled)

evaluate_model(y_test, Y_pred_best)

Best Hyperparameters: {'C': 1, 'fit_intercept': True, 'max_iter': 300, 'penalty': 'l1', 'solver': 'liblinear'}
Cappa Score: 0.09071279586304937
F1 Score: 0.10514726121662539
Confusion matrix:  [[44418  6451]
 [   51   382]]
Evaluation Result: None


### LGBM

In [6]:
model_lgbm = LGBMClassifier()
model_lgbm.fit(X_train_u, y_train_u)

Y_pred = model_lgbm.predict(X_test_scaled)
evaluate_model(y_test, Y_pred)

[LightGBM] [Info] Number of positive: 1871, number of negative: 1871
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007959 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21620
[LightGBM] [Info] Number of data points in the train set: 3742, number of used features: 183
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Cappa Score: 0.203099309238577
F1 Score: 0.2150313152400835
Confusion matrix:  [[47882  2987]
 [   21   412]]
None


In [7]:
model_lgbm_hp = LGBMClassifier()

param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31, 63],
    'subsample': [0.8, 1.0],
}

grid_search = GridSearchCV(model_lgbm_hp, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_u, y_train_u)

best_model_lgbm_hp = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Y_pred_best = best_model_lgbm_hp.predict(X_test_scaled)

evaluate_model(y_test, Y_pred_best)


[LightGBM] [Info] Number of positive: 1871, number of negative: 1871
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21620
[LightGBM] [Info] Number of data points in the train set: 3742, number of used features: 183
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'num_leaves': 31, 'subsample': 0.8}
Cappa Score: 0.21195310471415252
F1 Score: 0.22368061252392668
Confusion matrix:  [[48054  2815]
 [   24   409]]
Evaluation Result: None


### LTCN

In [8]:
def evaluate_model_LTCN(Y_test, Y_pred):
    kappa = cohen_kappa_score(np.argmax(Y_test, axis=1), np.argmax(Y_pred, axis=1))
    f1 = f1_score(np.argmax(Y_test, axis=1), np.argmax(Y_pred, axis=1), average='macro')
    confusion = confusion_matrix(np.argmax(Y_test, axis=1), np.argmax(Y_pred, axis=1))
    
    return {
        "Cohen's Kappa": kappa,
        "F1 Score": f1,
        "Confusion Matrix": confusion
    }

In [9]:
# encoding for LTCN
X_train_np = X_train_u
X_test_np = X_test_scaled
y_train_np = pd.get_dummies(y_train_u)
y_test_np = pd.get_dummies(y_test)


In [10]:
print(X_train_np.shape)
print(y_train_np.shape)
print(X_test_np.shape)
print(y_test_np.shape)

(3742, 225)
(3742, 2)
(51302, 225)
(51302, 2)


In [11]:
model_ltcn = LTCN()
model_ltcn.fit(X_train_np, y_train_np)

Y_pred_1 = model_ltcn.predict(X_test_np)
print(evaluate_model_LTCN(y_test_np, Y_pred_1))

{"Cohen's Kappa": 0.08894208790133262, 'F1 Score': 0.5176959002188383, 'Confusion Matrix': array([[44442,  6427],
       [   59,   374]], dtype=int64)}


In [12]:
model_ltcn_hp = LTCN()

param_grid = {
    'T': [5],
    'phi': [0.3, 0.5, 0.9],
    'function': ['sigmoid'],
    'method': ['ridge', 'inverse'],
    'alpha': [1.0E-4, 1.0E-3, 1.0E-2]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(model_ltcn_hp, param_grid, scoring='f1', cv=kf)

grid_search.fit(X_train_np, y_train_np)

best_model_ltcn_hp = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Y_pred_best = best_model_ltcn_hp.predict(X_test_np)

evaluation_result = evaluate_model_LTCN(y_test_np, Y_pred_best)
print("Evaluation Result:", evaluation_result)


Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "c:\Users\elinavaldmane\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\elinavaldmane\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\elinavaldmane\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\elinavaldmane\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 86, 

Best Hyperparameters: {'T': 5, 'alpha': 0.0001, 'function': 'sigmoid', 'method': 'ridge', 'phi': 0.3}
Evaluation Result: {"Cohen's Kappa": 0.08835916782006559, 'F1 Score': 0.5171875326727086, 'Confusion Matrix': array([[44402,  6467],
       [   59,   374]], dtype=int64)}


### Feature importance rankings

In [13]:
# LTCN feature relevance scores
# Function to compute scores
def compute_scores(ltcn):
    if ltcn.method == 'ridge':
        W = ltcn.model.coef_.T
    else:
        W = ltcn.W2[:-1, :]

    n_features = int(W.shape[0] / (ltcn.T + 1))
    scores = np.zeros((n_features, W.shape[1]))

    for i in range(0, W.shape[0], n_features):
        scores += np.absolute(W[i:(i + n_features), :])

    mean_scores = np.mean(scores, axis=1)
    return mean_scores / np.sum(mean_scores)

# Feature names and scores
feature_names = X_df.columns
feature_dict = dict(zip(feature_names, compute_scores(best_model_ltcn_hp)))
sorted_feature_dict = dict(sorted(feature_dict.items(), key=lambda x: x[1], reverse=True))
print(sorted_feature_dict)

{'BondPosZ_2': 0.061939626204117464, 'BondPosZA_2': 0.06108268668916612, 'DeformationT20T_2': 0.04276667890931522, 'VCPullState_2': 0.04266507641329306, 'DeformationT30T_2': 0.03941903249251119, 'VCPullState_1': 0.032164892749076184, 'DeformationT20T_1': 0.02566219904110359, 'DeformationT90V_2': 0.02279423666955549, 'MaxDef_2': 0.022449195204812315, 'DeformationT20V_2': 0.022073035601039175, 'DeformationT70V_2': 0.022058561547696982, 'DeformationT50V_2': 0.021678070868958444, 'DeformationT10V_2': 0.02166626139003548, 'DeformationT30V_2': 0.021611313629542314, 'DeformationT100V_2': 0.021053785653956856, 'BondPosZA_1': 0.019643073881624486, 'BondPosZ_1': 0.01877598036566776, 'PowerSum_1': 0.018136203256759617, 'MachineID_WB-2 Roger 61-X-600271': 0.016944846427485935, 'DeformData_2_Scale': 0.014351472049734334, 'DeformData_1_Scale': 0.014341874433402691, 'PullMaxValue_2': 0.014170963628470779, 'BondPosX_2': 0.013338925266496948, 'PullMaxValue_1': 0.013233336723137031, 'DeformationReserve_

In [14]:
file_path = 'LTCN_W_feature_importance_rank_ru.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(sorted_feature_dict, file)

### SHAP

In [15]:
K = 1
back_data_summary = shap.sample(X_train_np, K)
explainer = shap.KernelExplainer(best_model_ltcn_hp.predict, back_data_summary)
shap_values = explainer.shap_values(X_train_np)

  0%|          | 0/3742 [00:00<?, ?it/s]

In [16]:
with open('shap_explainer.pkl', 'wb') as f:
    pickle.dump(explainer, f)

with open('shap_values.pkl', 'wb') as f:
    pickle.dump(shap_values, f)

In [17]:
# SHAP feature relevance scores
feature_importance_scores = np.abs(shap_values).mean(axis=0)
feature_importance_scores_flat = feature_importance_scores.flatten()
feature_importance = pd.DataFrame(list(zip(X_df.columns, feature_importance_scores_flat)), columns=['col_name', 'feature_importance_vals'])

# Assuming feature importance
feature_importance['abs_mean_shap'] = feature_importance['feature_importance_vals'].apply(lambda x: np.max(np.abs(x)))
feature_importance.sort_values(by=['abs_mean_shap'], ascending=False, inplace=True) # sort
feature_importance.drop(columns=['abs_mean_shap'], inplace=True) # and drop

# Create dictionary
feature_importance_dict = feature_importance.set_index('col_name')['feature_importance_vals'].to_dict()
print(feature_importance_dict)


{'PowerSum_1': 39.13145674162179, 'BondPosZ_2': 26.95862637295981, 'MachineID_WB-2 Roger 61-X-600271': 26.55882388068948, 'BondPosZA_2': 26.51564210043672, 'DeformationReserve_1': 18.719046266140356, 'MaxDef_1': 18.40000176754687, 'DeformData_2_Scale': 18.16571375734638, 'DeformData_1_Scale': 18.154451943886976, 'MachineID_WB-4 Pierce 62-X-600511': 11.870615915671808, 'PgmIniPower_1': 10.822774847252447, 'PgmMaxPower_1': 10.790685042723016, 'PgmMinPower_1': 10.754555570771315, 'ChipNr_2': 8.957323884185096, 'BondPosX_2': 8.31344641028278, 'BondPosX_1': 7.847664350000122, 'ChipName_1_TPCB_A': 6.863606953850645, 'BondPosZA_1': 6.332213591270509, 'BondPosZ_1': 6.209324194260949, 'ChipName_2_MMU': 4.718193552167496, 'UsgData2first_2': 3.9790714687622692, 'TD_Steps_1': 3.418692250159599, 'ChipName_1_CELL_POS': 3.329270863596931, 'DeformationT90V_2': 2.629903184718507, 'ChipName_2_PCB': 2.5982376663788145, 'ChipNr_1': 2.551019269228688, 'DeformationT100V_2': 2.505094322176464, 'UsgData2_1_Of

In [18]:
file_path = 'LTCN_SHAP_feature_importance_rank_ru.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(feature_importance_dict, file)