In [5]:
import xgboost

In [6]:
xgboost.__version__

'1.4.2'

In [350]:
import os
import pickle
from copy import deepcopy
from datetime import datetime
from collections import defaultdict
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import sklearn.metrics as metrics
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from statsmodels.tools import add_constant as add_constant
#from imblearn.over_sampling import SMOTE

In [260]:
## Reading Data
print("Reading input data")
trainingDataset=pd.read_excel('Encoded_data_binary.xlsx')
Error_data = pd.read_excel("data/Error_Encoding.xlsx")

Reading input data


In [261]:
Error_data['Error_split'] = Error_data['Error'].apply(lambda x:x.split(".")[-1])

In [738]:
def classification_stats(target_actual_res , positive_class_prob_score,target_model_res = None):
    
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(target_actual_res, positive_class_prob_score)
    auc_val = metrics.auc(false_positive_rate, true_positive_rate)
    if target_model_res is None:
        confusion_mat = "Not Available"
    else:
        confusion_mat = pd.crosstab(np.asarray(target_actual_res), np.asarray(target_model_res), rownames=['Actual'], colnames=['Predicted'])
        print(confusion_mat)
        print("\nPrecision is: ", metrics.precision_score(target_actual_res, target_model_res))
        print("Recall is: ", metrics.recall_score(target_actual_res, target_model_res))
        print("Overall_Accuracy is: ", metrics.accuracy_score(target_actual_res,target_model_res))

In [262]:
Error_data  = Error_data.groupby('Encoded value').agg({'Error_split':'first'}).drop_duplicates().reset_index()
Error_data.columns = ['Encoded value','Error']

In [263]:
SEED = 42

In [264]:
trainingDataset = trainingDataset.merge(Error_data, left_on='PrimaryCause', right_on='Encoded value', how='left').rename(columns={'Error':'PrimaryCause','PrimaryCause':'PrimaryCause_y'})

trainingDataset = trainingDataset.merge(Error_data, left_on='Cause1', right_on='Encoded value', how='left').rename(columns={'Error':'Cause1','Cause1':'Cause1_y'})
trainingDataset = trainingDataset.merge(Error_data, left_on='Cause2', right_on='Encoded value', how='left').rename(columns={'Error':'Cause2','Cause2':'Cause2_y'})
trainingDataset = trainingDataset.merge(Error_data, left_on='Cause3', right_on='Encoded value', how='left').rename(columns={'Error':'Cause3','Cause3':'Cause3_y'})
trainingDataset = trainingDataset.merge(Error_data, left_on='Cause4', right_on='Encoded value', how='left').rename(columns={'Error':'Cause4','Cause4':'Cause4_y'})
#trainingDataset = trainingDataset.merge(Error_data, left_on='Cause5', right_on='Encoded value', how='left').rename(columns={'Error':'Cause5','Cause5':'Cause5_y'})

In [277]:
trainingDataset['Cause1'] = trainingDataset['Cause1'].fillna(value=trainingDataset['PrimaryCause'])
trainingDataset['Cause2'] = trainingDataset['Cause2'].fillna(value=trainingDataset['PrimaryCause'])
trainingDataset['Cause3'] = trainingDataset['Cause3'].fillna(value=trainingDataset['PrimaryCause'])
trainingDataset['Cause4'] = trainingDataset['Cause4'].fillna(value=trainingDataset['PrimaryCause'])

In [279]:
tokens_summary = trainingDataset['Summary'].tolist()
tokens_primary = trainingDataset['PrimaryCause'].tolist()
tokens_cause1 = trainingDataset['Cause1'].tolist()
tokens_cause2 = trainingDataset['Cause2'].tolist()
tokens_cause3 = trainingDataset['Cause3'].tolist()
tokens_cause4 = trainingDataset['Cause4'].tolist()

In [280]:
model_summary = Word2Vec(sentences=tokens_summary, vector_size=10, workers=4, seed=SEED)
model_primary = Word2Vec(sentences=tokens_primary, vector_size=5, workers=4, seed=SEED)
model_cause1 = Word2Vec(sentences=tokens_cause1, vector_size=5, workers=4, seed=SEED)
model_cause2 = Word2Vec(sentences=tokens_cause2, vector_size=5, workers=4, seed=SEED)
model_cause3 = Word2Vec(sentences=tokens_cause3, vector_size=5, workers=4, seed=SEED)
model_cause4 = Word2Vec(sentences=tokens_cause4, vector_size=5, workers=4, seed=SEED)

In [281]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

vectorized_summary = vectorize(tokens_summary, model=model_summary)
vectorized_primary = vectorize(tokens_primary, model=model_primary)
vectorized_cause1 = vectorize(tokens_cause1, model=model_cause1)
vectorized_cause2 = vectorize(tokens_cause2, model=model_cause2)
vectorized_cause3 = vectorize(tokens_cause3, model=model_cause3)
vectorized_cause4 = vectorize(tokens_cause4, model=model_cause4)

In [282]:
vectorized_summary = pd.DataFrame(vectorized_summary, columns=['summary.1','summary.2','summary.3','summary.4','summary.5','summary.6','summary.7','summary.8','summary.9','summary.10'])
vectorized_primary = pd.DataFrame(vectorized_primary, columns=['primary.1','primary.2','primary.3','primary.4','primary.5'])
vectorized_cause1 = pd.DataFrame(vectorized_cause1, columns=['cause1.1','cause1.2','cause1.3','cause1.4','cause1.5'])
vectorized_cause2 = pd.DataFrame(vectorized_cause2, columns=['cause2.1','cause2.2','cause2.3','cause2.4','cause2.5'])
vectorized_cause3 = pd.DataFrame(vectorized_cause3, columns=['cause3.1','cause3.2','cause3.3','cause3.4','cause3.5'])
vectorized_cause4 = pd.DataFrame(vectorized_cause4, columns=['cause4.1','cause4.2','cause4.3','cause4.4','cause4.5'])

In [284]:
trainingDataset = pd.concat([trainingDataset,vectorized_summary,vectorized_primary,vectorized_cause1,vectorized_cause2,vectorized_cause3,vectorized_cause4], axis=1)


In [286]:
balanced_data = trainingDataset.groupby('BinaryLabel').apply(lambda x:x.sample(n=131)).reset_index(drop=True)

In [712]:
## Over Sampling balancing
data = trainingDataset.copy()
## Under Sampling balancing
b_data = balanced_data.copy()

In [713]:
cols_consider = data.columns[27:].tolist()

In [714]:
X = data.loc[:, cols_consider]
#sca = StandardScaler()
#X = sca.fit_transform(X)
y = data.loc[:, ['BinaryLabel']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_b = b_data.loc[:,cols_consider]
y_b = b_data.loc[:, ['BinaryLabel']]
Xb_train, Xb_test, yb_train, yb_test = train_test_split(X_b, y_b, test_size=0.2, random_state=SEED)

In [177]:
#features = ['Component/s', 'Environment','Project key', 'PrimaryCause', 'Cause1', 'Cause2','summary.1','summary.2','summary.3','summary.4','summary.5','summary.6','summary.7','summary.8','summary.9','summary.10','BinaryLabel']

## Logistic Regression

In [715]:
## Logisitc Regression
#smote = SMOTE()
X_train_constant = add_constant(X_train)
model_logit = sm.Logit(y_train, X_train_constant)
model_logit_result = model_logit.fit()
print(model_logit_result.summary())

Optimization terminated successfully.
         Current function value: 0.532697
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:            BinaryLabel   No. Observations:                  401
Model:                          Logit   Df Residuals:                      365
Method:                           MLE   Df Model:                           35
Date:                Fri, 22 Oct 2021   Pseudo R-squ.:                 0.08564
Time:                        16:36:58   Log-Likelihood:                -213.61
converged:                       True   LL-Null:                       -233.62
Covariance Type:            nonrobust   LLR p-value:                    0.2573
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3753     46.269      0.008      0.994     -90.311      91.061
summary.1    -20.3114     21.

In [716]:
def back_feature_elem (data_frame,dep_var,col_list):
    removed_cols = []
    while len(col_list)>0 :
        model=sm.Logit(dep_var,data_frame[col_list])
        result=model.fit(disp=0)
        largest_pvalue=round(result.pvalues,3).nlargest(1)
        if largest_pvalue[0]<(0.1):
            return result, removed_cols
            break
        else:
            removed_cols.append(largest_pvalue.index[0])
            #print("removing :",largest_pvalue.index)
            col_list.remove(largest_pvalue.index)

result, new_features = back_feature_elem(X_train_constant,y_train,cols_consider[:])

In [717]:
result.summary()

0,1,2,3
Dep. Variable:,BinaryLabel,No. Observations:,401.0
Model:,Logit,Df Residuals:,394.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 22 Oct 2021",Pseudo R-squ.:,0.04718
Time:,16:37:08,Log-Likelihood:,-222.6
converged:,True,LL-Null:,-233.62
Covariance Type:,nonrobust,LLR p-value:,0.00119

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
summary.6,17.2413,4.975,3.465,0.001,7.490,26.993
summary.8,-43.1855,13.772,-3.136,0.002,-70.178,-16.193
cause1.2,15.2999,7.368,2.076,0.038,0.858,29.742
cause1.3,-14.0253,7.324,-1.915,0.056,-28.381,0.330
cause1.5,-6.6689,3.677,-1.814,0.070,-13.875,0.537
cause3.2,-10.7927,4.207,-2.565,0.010,-19.039,-2.547
cause3.5,-10.0461,4.608,-2.180,0.029,-19.078,-1.014


In [790]:
logit_sk = LogisticRegression(penalty='l2',max_iter=100,random_state=SEED, class_weight='balanced').fit(X_train, y_train.to_numpy().ravel())
logit_sk_b = LogisticRegression(penalty='l2',max_iter=100,random_state=SEED, class_weight='balanced').fit(X_train[new_features], y_train.to_numpy().ravel())

In [793]:
results_train = logit_sk.predict_proba(X_train)
results_train_v2 = logit_sk.predict(X_train)
results_test = logit_sk.predict_proba(X_test)
results_test_v2 = logit_sk.predict(X_test)
## New Feature set
results_train_b = logit_sk_b.predict_proba(X_train[new_features])
results_train_v2_b = logit_sk_b.predict(X_train[new_features])
results_test_b = logit_sk_b.predict_proba(X_test[new_features])
results_test_v2_b = logit_sk_b.predict(X_test[new_features])

In [788]:
print("Model Statistics for training data")
classification_stats(y_train.values.ravel(), results_train[:,1], results_train_v2)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(y_test.values.ravel(), results_test[:,1], results_test_v2)

Model Statistics for training data
Predicted    0    1
Actual             
0           54   54
1          104  189

Precision is:  0.7777777777777778
Recall is:  0.6450511945392492
Overall_Accuracy is:  0.6059850374064838
**************************************************
Model Statistics for test data
Predicted   0   1
Actual           
0          11  12
1          34  44

Precision is:  0.7857142857142857
Recall is:  0.5641025641025641
Overall_Accuracy is:  0.5445544554455446


In [795]:
print("Model Statistics for training data")
classification_stats(y_train.values.ravel(), results_train_b[:,1], results_train_v2_b)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(y_test.values.ravel(), results_test_b[:,1], results_test_v2_b)

Model Statistics for training data
Predicted    0    1
Actual             
0           53   55
1          101  192

Precision is:  0.7773279352226721
Recall is:  0.6552901023890785
Overall_Accuracy is:  0.6109725685785536
**************************************************
Model Statistics for test data
Predicted   0   1
Actual           
0          11  12
1          31  47

Precision is:  0.7966101694915254
Recall is:  0.6025641025641025
Overall_Accuracy is:  0.5742574257425742


## XGBOOST

In [721]:
## XGBOOST
hyper = OrderedDict({
    'kfold':[5],
    'n_estimators': [1,10,1],
    'max_depth': range(1,10,1),
    'learning_rate': [0.005,0.01,0.03,0.05,0.1],
    'subsample': np.arange(0.10,1.0, 0.10),
    'min_child_weight': range(1, 10,1)
})

In [766]:
## Defining the model
xgb = xgboost.sklearn.XGBClassifier(learning_rate =0.005,n_estimators=10,max_depth=4,min_child_weight=5,subsample=1,\
                         objective= 'binary:logistic',seed=SEED, scale_pos_weight=0.4)
xgb.fit(X_train, y_train)
xgb_b = xgboost.sklearn.XGBClassifier(learning_rate =0.005,n_estimators=10,max_depth=4,min_child_weight=5,subsample=1,\
                         objective= 'binary:logistic',seed=SEED)
xgb_b.fit(Xb_train, yb_train)



  return f(**kwargs)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.005, max_delta_step=0, max_depth=4,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=12, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [767]:
results_train = xgb.predict_proba(X_train)
results_train_v2 = xgb.predict(X_train)
results_test = xgb.predict_proba(X_test)
results_test_v2 = xgb.predict(X_test)
results_train_b = xgb_b.predict_proba(Xb_train)
results_train_v2_b = xgb_b.predict(Xb_train)
results_test_b = xgb_b.predict_proba(Xb_test)
results_test_v2_b = xgb_b.predict(Xb_test)

In [770]:
#print("Model Statistics for training data")
#classification_stats(y_train.values.ravel(), results_train[:,1], results_train_v2)
#print("*****"*10)
#print("Model Statistics for test data")
#classification_stats(y_test.values.ravel(), results_test[:,1], results_test_v2)

In [769]:
print("Model Statistics for training data")
classification_stats(yb_train.values.ravel(), results_train_b[:,1], results_train_v2_b)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(yb_test.values.ravel(), results_test_b[:,1], results_test_v2_b)

Model Statistics for training data
Predicted   0   1
Actual           
0          85  20
1          45  59

Precision is:  0.7468354430379747
Recall is:  0.5673076923076923
Overall_Accuracy is:  0.6889952153110048
**************************************************
Model Statistics for test data
Predicted   0   1
Actual           
0          22   4
1          12  15

Precision is:  0.7894736842105263
Recall is:  0.5555555555555556
Overall_Accuracy is:  0.6981132075471698


## Random Forest

In [774]:
## Random Forest
model_rf = RandomForestClassifier(n_estimators=10, class_weight='balanced')
model_rf_b  = RandomForestClassifier(n_estimators=10)
hyper_rf = OrderedDict({
    'n_estimators': [10,1000,100],
    'max_depth': range(1,10,1),
    'min_samples_split':[10,100,10],
    'min_samples_leaf':[1,10,1],
    'max_features':['auto','sqrt'],
    'bootstrap':[True,False]
})
model_rf_cv = RandomizedSearchCV(estimator=model_rf, param_distributions=hyper_rf, cv=3, verbose=2, n_jobs=-1)
model_rf_cv_b = RandomizedSearchCV(estimator=model_rf_b, param_distributions=hyper_rf, cv=3, verbose=2, n_jobs=-1)

model_rf_cv = model_rf_cv.fit(X_train, y_train)
model_rf_cv_b = model_rf_cv_b.fit(Xb_train, yb_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  1.2min remaining:   21.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.2min finished
  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    7.5s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    8.8s finished
  self.best_estimator_.fit(X, y, **fit_params)


In [775]:
results_train = model_rf_cv.predict_proba(X_train)
results_train_v2 = model_rf_cv.predict(X_train)
results_test = model_rf_cv.predict_proba(X_test)
results_test_v2 = model_rf_cv.predict(X_test)
results_train_b = model_rf_cv_b.predict_proba(Xb_train)
results_train_v2_b = model_rf_cv_b.predict(Xb_train)
results_test_b = model_rf_cv_b.predict_proba(Xb_test)
results_test_v2_b = model_rf_cv_b.predict(Xb_test)

In [776]:
print("Model Statistics for training data")
classification_stats(y_train.values.ravel(), results_train[:,1], results_train_v2)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(y_test.values.ravel(), results_test[:,1], results_test_v2)

Model Statistics for training data
Predicted    0    1
Actual             
0          100    8
1           13  280

Precision is:  0.9722222222222222
Recall is:  0.9556313993174061
Overall_Accuracy is:  0.9476309226932669
**************************************************
Model Statistics for test data
Predicted   0   1
Actual           
0          12  11
1          23  55

Precision is:  0.8333333333333334
Recall is:  0.7051282051282052
Overall_Accuracy is:  0.6633663366336634


In [777]:
print("Model Statistics for training data")
classification_stats(yb_train.values.ravel(), results_train_b[:,1], results_train_v2_b)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(yb_test.values.ravel(), results_test_b[:,1], results_test_v2_b)

Model Statistics for training data
Predicted   0   1
Actual           
0          63  42
1          21  83

Precision is:  0.664
Recall is:  0.7980769230769231
Overall_Accuracy is:  0.6985645933014354
**************************************************
Model Statistics for test data
Predicted  0   1
Actual          
0          8  18
1          9  18

Precision is:  0.5
Recall is:  0.6666666666666666
Overall_Accuracy is:  0.49056603773584906
