In [1]:
import joblib

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, fbeta_score, make_scorer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE


## Evaluation Methods

To find the optimal model, the F-beta score is used in cross-validation. Setting beta to 2 puts a higher weight on recall when selecting models; in other words, type II errors (false negatives) are weighted more heavily.

When evaluating the test data, the main measure for deciding the optimal model will be the challenge metric (Total cost = Cost1 · `#type I failures` + Cost2 · `#type II failures`). Besides, F_beta score and classification reports of precision/recall will be applied when running the script as a reference for the model performance. Together with challenge metrics, F_beta scores are recorded as F2 scores.

`#type I failures` is the count of type I failures (false positives), i.e. non-faulty systems that were falsely reported as positive, each leading to an unnecessary mechanical check. `#type II failures` is the count of type II failures (false negatives), i.e. faulty systems that were reported as having no failure. The costs are 10 and 500 respectively.

## Load data


In [2]:
# Load the training set; the literal string 'na' in the CSV is parsed as NaN
data = pd.read_csv('../data/ida_2016_training_set_update.csv', na_values=['na'])

In [None]:
# list(data.columns)

In [3]:
# Separate the target label column ('class') from the feature columns
y = data['class']
X = data.drop(columns=['class'])

In this section, missing values will be imputed by the median of each feature. After that, all the features are scaled before fitting the algorithm.

### Data imputation

In [4]:
# Function to return a table of missing values count&percent
def missing_values_table(df):
    """Summarise the missing values of each column of ``df``.

    Prints the total number of columns and the number of columns that
    contain nulls, then returns a DataFrame indexed by column name with:

    * 'Missing Values' - number of nulls in the column
    * 'Percent of Total Values' - nulls as a percentage of the row count

    Only columns with a non-zero percentage are kept; rows are sorted by
    percentage in descending order and values are rounded to 2 decimals.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Side-by-side table, labelled directly through `keys`
    summary = pd.concat(
        [null_counts, null_percent],
        axis=1,
        keys=['Missing Values', 'Percent of Total Values'],
    )

    # Keep only columns that actually have nulls, highest share first
    has_nulls = summary['Percent of Total Values'] != 0
    summary = (
        summary[has_nulls]
        .sort_values('Percent of Total Values', ascending=False)
        .round(2)
    )

    print("Input dataframe has " + str(df.shape[1]) + " columns in total.")
    print("There are " + str(summary.shape[0]) + " columns that have null values.")

    return summary

In [5]:
# Show the per-feature missing-value summary for the raw features
missing_values_table(X)

Input dataframe has 170 columns in total.
There are 169 columns that have null values.


Unnamed: 0,Missing Values,Percent of Total Values
br_000,49264,82.11
bq_000,48722,81.20
bp_000,47740,79.57
bo_000,46333,77.22
ab_000,46329,77.22
...,...,...
by_000,473,0.79
ci_000,338,0.56
cj_000,338,0.56
ck_000,338,0.56


In [6]:
# Impute missing values with the per-feature median
# NOTE(review): the imputer is fit on all of X, i.e. before the train/test
# split performed further down, so the held-out rows contribute to the
# medians — consider fitting on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

In [7]:
# Persist the fitted imputer to ../models/imputer.pkl
with open('../models/imputer.pkl', 'wb') as pickled_file: 
    joblib.dump(imputer, pickled_file)

In [8]:
# Re-run the missing-value summary on the imputed features
missing_values_table(X_imputed)

Input dataframe has 170 columns in total.
There are 0 columns that have null values.


Unnamed: 0,Missing Values,Percent of Total Values


In [9]:
# Preview the first rows of the imputed features
X_imputed.head()

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,76698.0,0.0,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,33058.0,0.0,0.0,126.0,0.0,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,41040.0,0.0,228.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,12.0,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,318.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,60874.0,0.0,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


### Scaling data

In [10]:
# Min-max scaler configured for the [0, 1] feature range
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Start from a copy of the imputed features
X_imputed_scaled = X_imputed.copy()

In [11]:
# Fit the scaler on the imputed features and scale them.
# Reading from X_imputed (rather than overwriting X_imputed_scaled with a
# transform of itself) keeps this cell idempotent: re-running it can no longer
# refit the scaler on already-scaled values before the scaler is pickled.
# NOTE(review): the scaler is fit before the train/test split performed
# further down, so the held-out rows influence the min/max statistics.
X_imputed_scaled = scaler.fit_transform(X_imputed)

In [12]:
# Persist the fitted scaler to ../models/scaler.pkl
with open('../models/scaler.pkl', 'wb') as pickled_file: 
    joblib.dump(scaler, pickled_file)

In [13]:
# Hold out 20% of the rows as a test set, stratified on the class label
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=3,
)

## Logistic regression

Similar to the previous XGBoost models, three approaches will be tried:
1. One with default parameters, 
2. Another one with the parameter `class_weight` set to `'balanced'` (to adjust for the class proportions),
3. And a set of grid search CV models to find the optimal regularization parameter C from the provided range.

In [14]:
# Function to get the total cost (competition metric)
def get_total_cost(type1_error, type2_error, cost1=10, cost2=500):
    """Return the challenge metric for the given error counts.

    Total cost = cost1 * (#type I errors) + cost2 * (#type II errors)

    Parameters
    ----------
    type1_error : number of type I errors (false positives)
    type2_error : number of type II errors (false negatives)
    cost1 : cost of a single type I error (default 10)
    cost2 : cost of a single type II error (default 500)
    """
    false_positive_cost = type1_error * cost1
    false_negative_cost = type2_error * cost2
    return false_positive_cost + false_negative_cost

In [15]:
# Function to get prettified confusion matrix
def get_confusion_matrix(y_pred, y_true=None):
    """Return the confusion matrix as a labelled DataFrame.

    Rows are the true classes and columns the predicted classes, both in
    the order ('pos', 'neg').

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels.
    y_true : array-like, optional
        True class labels. When omitted, the notebook-level ``y_test`` is
        looked up at call time.
    """
    # Resolve the default lazily: a `y_true=y_test` default is frozen when the
    # defining cell runs and silently goes stale if the split cell is re-run.
    if y_true is None:
        y_true = y_test

    return pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=['pos', 'neg']),
        index=['True:pos', 'True:neg'],
        columns=['Pred:pos', 'Pred:neg'],
    )

### Baseline model

In [16]:
# Baseline: logistic regression with only the random seed set
lr_model = LogisticRegression(random_state=3)

In [17]:
# Fit the baseline model on the training split
lr_model.fit(X_train, y_train)

LogisticRegression(random_state=3)

In [18]:
# Predict class labels for the test split
y_pred = lr_model.predict(X_test)

In [19]:
# Evaluation on test data: confusion matrix of the baseline predictions
conf_mat = get_confusion_matrix(y_pred)
print(f'Confusion matrix:\n {conf_mat}')

Confusion matrix:
           Pred:pos  Pred:neg
True:pos        89       111
True:neg        24     11776


In [20]:
# Challenge metric for the baseline model.
# The error counts are read from the confusion matrix instead of being
# hardcoded from a previous run's printed output, so the cost stays correct
# when the data, split or model changes.
total_cost = get_total_cost(
    type1_error=conf_mat.loc['True:neg', 'Pred:pos'],  # false positives
    type2_error=conf_mat.loc['True:pos', 'Pred:neg'],  # false negatives
)
print(f'Total cost: {total_cost}')

Total cost: 55740


In [21]:
# Per-class precision / recall / F1 of the baseline model on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.99      1.00      0.99     11800
         pos       0.79      0.45      0.57       200

    accuracy                           0.99     12000
   macro avg       0.89      0.72      0.78     12000
weighted avg       0.99      0.99      0.99     12000



In [22]:
# F2 score (beta=2) on the test split, with 'pos' as the positive class
ftwo_score = fbeta_score(y_test, y_pred, beta=2, pos_label='pos')

print(f'F2 score: {ftwo_score}')

F2 score: 0.48740416210295734


### Add class weights during training

In [23]:
# Logistic regression with class_weight='balanced' and max_iter raised to 3000
lr_model2 = LogisticRegression(random_state=3, class_weight='balanced', max_iter=3000)

In [24]:
# Fit the class-weighted model on the training split
lr_model2.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=3000, random_state=3)

In [25]:
# Predict class labels for the test split with the class-weighted model
y_pred2 = lr_model2.predict(X_test)

In [26]:
# Evaluation on test data: confusion matrix of the class-weighted predictions
conf_mat2 = get_confusion_matrix(y_pred2)
print(f'Confusion matrix:\n {conf_mat2}')

Confusion matrix:
           Pred:pos  Pred:neg
True:pos       188        12
True:neg       328     11472


In [27]:
# Challenge metric for the class-weighted model.
# The error counts are read from the confusion matrix instead of being
# hardcoded from a previous run's printed output.
total_cost2 = get_total_cost(
    type1_error=conf_mat2.loc['True:neg', 'Pred:pos'],  # false positives
    type2_error=conf_mat2.loc['True:pos', 'Pred:neg'],  # false negatives
)
print(f'Total cost: {total_cost2}')

Total cost: 9280


In [28]:
# Per-class precision / recall / F1 of the class-weighted model on the test split
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

         neg       1.00      0.97      0.99     11800
         pos       0.36      0.94      0.53       200

    accuracy                           0.97     12000
   macro avg       0.68      0.96      0.76     12000
weighted avg       0.99      0.97      0.98     12000



In [29]:
# F2 score (beta=2) on the test split, with 'pos' as the positive class
ftwo_score2 = fbeta_score(y_test, y_pred2, beta=2, pos_label='pos')

print(f'F2 score: {ftwo_score2}')

F2 score: 0.7142857142857143


### Grid search to find best parameter C

In [30]:
# Stratified 5-fold splitter for cross-validation (shuffled, fixed random_state)
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

In [31]:
# Grid for the logistic regression parameter 'C': powers of ten from 1e-5 to 1e4
params_grid = {'C': np.power(10.0, np.arange(-5, 5))}

# Class-weighted logistic regression; verbose=1 makes the solver log each fit
lr_model3 = LogisticRegression(random_state=3, class_weight='balanced', max_iter=3000, verbose=1)

# F2 scorer (beta=2, positive class 'pos') used to rank the candidates
ftwo_scorer = make_scorer(fbeta_score, beta=2, pos_label='pos')
grid_search_cv_models = GridSearchCV(lr_model3, params_grid, cv=fold, scoring=ftwo_scorer)

In [32]:
# Run the cross-validated grid search for logistic regression on the training split
grid_search_cv_models.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   36.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   36.3s finished
[Parallel(n_jobs=1)]: Using b

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=3, shuffle=True),
             estimator=LogisticRegression(class_weight='balanced',
                                          max_iter=3000, random_state=3,
                                          verbose=1),
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04])},
             scoring=make_scorer(fbeta_score, beta=2, pos_label=pos))

In [33]:
# Report the best cross-validation score and the parameters that achieved it
print(f'The best score during training: {grid_search_cv_models.best_score_}, with params: {grid_search_cv_models.best_params_}')

The best score during training: 0.7185352327461569, with params: {'C': 100.0}


In [34]:
# Predict class labels for the test split using the fitted grid search
y_pred3 = grid_search_cv_models.predict(X_test)

In [35]:
# Evaluation on test data: confusion matrix of the grid-search predictions
conf_mat3 = get_confusion_matrix(y_pred3)
print(f'Confusion matrix:\n {conf_mat3}')

Confusion matrix:
           Pred:pos  Pred:neg
True:pos       189        11
True:neg       284     11516


In [36]:
# Challenge metric for the grid-search model.
# The error counts are read from the confusion matrix instead of being
# hardcoded from a previous run's printed output.
total_cost3 = get_total_cost(
    type1_error=conf_mat3.loc['True:neg', 'Pred:pos'],  # false positives
    type2_error=conf_mat3.loc['True:pos', 'Pred:neg'],  # false negatives
)
print(f'Total cost: {total_cost3}')

Total cost: 8340


In [37]:
# Per-class precision / recall / F1 of the grid-search model on the test split
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

         neg       1.00      0.98      0.99     11800
         pos       0.40      0.94      0.56       200

    accuracy                           0.98     12000
   macro avg       0.70      0.96      0.77     12000
weighted avg       0.99      0.98      0.98     12000



In [38]:
# F2 score (beta=2) on the test split, with 'pos' as the positive class
ftwo_score3 = fbeta_score(y_test, y_pred3, beta=2, pos_label='pos')

print(f'F2 score: {ftwo_score3}')

F2 score: 0.7423409269442263


## SMOTE method

Additionally, SMOTE is applied here in combination with Logistic Regression to see if oversampling techniques can help the imbalanced data. SMOTE stands for Synthetic Minority Oversampling Technique.

In [39]:
# SMOTE resampler with sampling_strategy='minority' and a fixed random seed
smote = SMOTE(sampling_strategy='minority', random_state=3)

In [40]:
# Class distribution of the training labels before oversampling
y_train.value_counts()

neg    47200
pos      800
Name: class, dtype: int64

In [41]:
# Oversample the training split with SMOTE.
# `fit_resample` is the supported method name; the older `fit_sample` alias
# was deprecated and is no longer available in newer imbalanced-learn releases.
X_smoted, y_smoted = smote.fit_resample(X_train, y_train)

In [42]:
# Class distribution of the training labels after oversampling
y_smoted.value_counts()

pos    47200
neg    47200
Name: class, dtype: int64

In [43]:
# Logistic regression without class_weight for the SMOTE-resampled data;
# verbose=1 makes the solver log each fit
lr_model4 = LogisticRegression(random_state=3, max_iter=3000, verbose=1)

# Grid search reusing the C grid, CV folds and F2 scorer defined for the previous search
grid_search_cv_models_smoted = GridSearchCV(lr_model4, params_grid, cv=fold, scoring=ftwo_scorer)

In [44]:
# Fit the CV model for logistic regression on the SMOTE-resampled training data
# NOTE(review): SMOTE was applied before cross-validation, so the validation
# folds contain synthetic samples; the CV score is therefore likely optimistic
# relative to the untouched test split — confirm, and consider resampling
# inside each fold instead.
grid_search_cv_models_smoted.fit(X_smoted, y_smoted)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Using b

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=3, shuffle=True),
             estimator=LogisticRegression(max_iter=3000, random_state=3,
                                          verbose=1),
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04])},
             scoring=make_scorer(fbeta_score, beta=2, pos_label=pos))

In [46]:
# Report the best cross-validation score and the parameters that achieved it
print(f'The best score during training: {grid_search_cv_models_smoted.best_score_}, with params: {grid_search_cv_models_smoted.best_params_}')

The best score during training: 0.9640531438044162, with params: {'C': 10000.0}


In [47]:
# Predict class labels for the (non-resampled) test split
y_pred4 = grid_search_cv_models_smoted.predict(X_test)

In [48]:
# Evaluation on test data: confusion matrix of the SMOTE-model predictions
conf_mat4 = get_confusion_matrix(y_pred4)
print(f'Confusion matrix:\n {conf_mat4}')

Confusion matrix:
           Pred:pos  Pred:neg
True:pos       184        16
True:neg       267     11533


In [52]:
# Challenge metric for the SMOTE model.
# The error counts are read from the confusion matrix instead of being
# hardcoded from a previous run's printed output.
total_cost4 = get_total_cost(
    type1_error=conf_mat4.loc['True:neg', 'Pred:pos'],  # false positives
    type2_error=conf_mat4.loc['True:pos', 'Pred:neg'],  # false negatives
)
print(f'Total cost: {total_cost4}')

Total cost: 10670


In [50]:
# Per-class precision / recall / F1 of the SMOTE model on the test split
print(classification_report(y_test, y_pred4))

              precision    recall  f1-score   support

         neg       1.00      0.98      0.99     11800
         pos       0.41      0.92      0.57       200

    accuracy                           0.98     12000
   macro avg       0.70      0.95      0.78     12000
weighted avg       0.99      0.98      0.98     12000



In [51]:
# F2 score (beta=2) on the test split, with 'pos' as the positive class
ftwo_score4 = fbeta_score(y_test, y_pred4, beta=2, pos_label='pos')

print(f'F2 score: {ftwo_score4}')

F2 score: 0.7354116706634694
