In [1]:
import os
import pickle
from copy import deepcopy
from datetime import datetime
from collections import defaultdict
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import sklearn.metrics as metrics
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from statsmodels.tools import add_constant as add_constant
from sklearn.feature_selection import VarianceThreshold

In [34]:
## Reading Data
print("Reading input data")
trainingDataset=pd.read_csv("Final_data_defect_classification.csv")

Reading input data


In [35]:
SEED=42

In [36]:
def classification_stats(target_actual_res , positive_class_prob_score,target_model_res = None):
    
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(target_actual_res, positive_class_prob_score)
    auc_val = metrics.auc(false_positive_rate, true_positive_rate)
    if target_model_res is None:
        confusion_mat = "Not Available"
    else:
        confusion_mat = pd.crosstab(np.asarray(target_actual_res), np.asarray(target_model_res), rownames=['Actual'], colnames=['Predicted'])
        print(confusion_mat)
        print("\nPrecision is: ", metrics.precision_score(target_actual_res, target_model_res))
        print("Recall is: ", metrics.recall_score(target_actual_res, target_model_res))
        print("Overall_Accuracy is: ", metrics.accuracy_score(target_actual_res,target_model_res))

In [37]:
trainingDataset = trainingDataset.rename(columns={'Final_extract':'BinaryLabel'})

In [38]:
trainingDataset = pd.get_dummies(trainingDataset, columns=['Project key','Issue Type'], drop_first=False, prefix=['key','type'])

In [7]:
cols_consider = [col for col in trainingDataset.columns.tolist() if col.startswith("summary_") or col.startswith("description_") or col.startswith("key_") or col.startswith("type_")]

In [8]:
def remove_features(df, key_sum=False,key_threshold = 30, variance=False, var_threshold=0.1, add_cols_remove=['Issue key','BinaryLabel']):
    cols_remove = []
    cols_selected = []
    if key_sum:
        key_cols = df.select_dtypes(include=np.number).columns.tolist()
        for col in key_cols:
            if df[col].sum()<key_threshold:
                cols_remove.append(col)
        cols_remove.extend(add_cols_remove)
    if variance:
        selector = VarianceThreshold(var_threshold)
        selector.fit(df[cols_consider])
        cols_selected = df.columns[selector.get_support(indices=True)]
    if len(cols_selected)==0:
        return [col for col in df.columns if col not in cols_remove]
    else:
        return [col for col in cols_selected if col not in cols_remove]

In [9]:
dataset = trainingDataset.copy()
cols_selected = list(remove_features(trainingDataset, key_sum=True, key_threshold=45))+['BinaryLabel']
cols_selected = [col for col in cols_selected if not col.startswith("wcode_")]
dataset = dataset[cols_selected]

In [10]:
dataset[cols_selected].sum().min()

46.0

In [11]:
corr_matrix = dataset.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

In [12]:
if "BinaryLabel" in to_drop:
    to_drop.remove("BinaryLabel")

In [13]:
dataset.drop(to_drop, axis=1, inplace=True)

In [14]:
balanced_data = dataset.groupby('BinaryLabel').apply(lambda x:x.sample(n=5857)).reset_index(drop=True)

In [15]:
## Over Sampling balancing
data = dataset.copy()
## Under Sampling balancing
b_data = balanced_data.copy()

In [16]:
cols_selected = list(set(cols_selected)-set(to_drop))
cols_selected.remove("BinaryLabel")

In [17]:
X = data.loc[:, cols_selected]
#sca = StandardScaler()
#X = sca.fit_transform(X)
y = data.loc[:, ['BinaryLabel']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=SEED)
X_b = b_data.loc[:,cols_selected]
y_b = b_data.loc[:, ['BinaryLabel']]
Xb_train, Xb_test, yb_train, yb_test = train_test_split(X_b, y_b, test_size=0.35, random_state=SEED)

## Logistic Regression

In [18]:
## Logisitc Regression
#smote = SMOTE()
X_train_constant = add_constant(X_train)
model_logit = sm.Logit(y_train, X_train_constant)
model_logit_result = model_logit.fit()
print(model_logit_result.summary())

         Current function value: 0.490445
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            BinaryLabel   No. Observations:                13516
Model:                          Logit   Df Residuals:                    13410
Method:                           MLE   Df Model:                          105
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1795
Time:                        17:02:52   Log-Likelihood:                -6628.9
converged:                      False   LL-Null:                       -8079.1
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       0.3710      0.196      1.891      0.059      -0.013       0.755
key_IOTDB                 



In [19]:
## Logisitc Regression
#smote = SMOTE()
Xb_train_constant = add_constant(Xb_train)
model_logitb = sm.Logit(yb_train, Xb_train_constant)
model_logit_resultb = model_logitb.fit()
print(model_logit_resultb.summary())

         Current function value: 0.565451
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            BinaryLabel   No. Observations:                 7614
Model:                          Logit   Df Residuals:                     7508
Method:                           MLE   Df Model:                          105
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1842
Time:                        17:02:53   Log-Likelihood:                -4305.3
converged:                      False   LL-Null:                       -5277.2
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                      -0.5788      0.263     -2.200      0.028      -1.094      -0.063
key_IOTDB                 



In [21]:
def back_feature_elem (data_frame,dep_var,col_list):
    removed_cols = []
    while len(col_list)>0 :
        model=sm.Logit(dep_var,data_frame[col_list])
        result=model.fit(disp=0)
        largest_pvalue=round(result.pvalues,3).nlargest(1)
        if largest_pvalue[0]<(0.05):
            return result, removed_cols
            break
        else:
            removed_cols.append(largest_pvalue.index[0])
            #print("removing :",largest_pvalue.index)
            col_list.remove(largest_pvalue.index)

#result, new_features = back_feature_elem(X_train_constant,y_train,cols_selected[:])
#resultb, new_featuresb = back_feature_elem(Xb_train_constant,yb_train,cols_selected[:])

In [None]:
result.summary()

In [None]:
resultb.summary()

In [22]:
logit_sk = LogisticRegression(penalty='l2',max_iter=100,random_state=SEED, class_weight='balanced').fit(X_train, y_train.to_numpy().ravel())
logit_sk_b = LogisticRegression(penalty='l2',max_iter=100,random_state=SEED).fit(Xb_train, yb_train.to_numpy().ravel())
#logit_skn = LogisticRegression(penalty='l2',max_iter=100,random_state=SEED, class_weight='balanced').fit(X_train[new_features], y_train.to_numpy().ravel())
#logit_sk_bn = LogisticRegression(penalty='l2',max_iter=100,random_state=SEED).fit(Xb_train[new_features], yb_train.to_numpy().ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
## Full Data Full Features
print("training ")
logistc_prediction = logit_sk.predict(X_train)
print(metrics.accuracy_score(y_train, logistc_prediction))
print(metrics.precision_score(y_train, logistc_prediction))
print(metrics.recall_score(y_train, logistc_prediction))
print("Test ")
logistc_prediction = logit_sk.predict(X_test)
print(metrics.accuracy_score(y_test, logistc_prediction))
print(metrics.precision_score(y_test, logistc_prediction))
print(metrics.recall_score(y_test, logistc_prediction))

training 
0.7306895531222255
0.84262630860264
0.7664044711239909
Test 
0.7361912613355317
0.8520461699895068
0.7696682464454976


In [24]:
## Balanced data, Full Features
print("training ")
logistc_prediction = logit_sk_b.predict(Xb_train)
print(metrics.accuracy_score(yb_train, logistc_prediction))
print(metrics.precision_score(yb_train, logistc_prediction))
print(metrics.recall_score(yb_train, logistc_prediction))
print("Test ")
logistc_prediction = logit_sk.predict(Xb_test)
print(metrics.accuracy_score(yb_test, logistc_prediction))
print(metrics.precision_score(yb_test, logistc_prediction))
print(metrics.recall_score(yb_test, logistc_prediction))

training 
0.7071184659837142
0.6863060989643268
0.774746687451286
Test 
0.6997560975609756
0.6700218818380744
0.7624501992031872


In [25]:
## Full data, Significant features
print("training ")
logistc_prediction = logit_skn.predict(X_train[new_features])
print(metrics.accuracy_score(y_train, logistc_prediction))
print(metrics.precision_score(y_train, logistc_prediction))
print(metrics.recall_score(y_train, logistc_prediction))
print("Test ")
logistc_prediction = logit_skn.predict(X_test[new_features])
print(metrics.accuracy_score(y_test, logistc_prediction))
print(metrics.precision_score(y_test, logistc_prediction))
print(metrics.recall_score(y_test, logistc_prediction))

training 


NameError: name 'logit_skn' is not defined

In [None]:
## Balanced data, Significant features
print("training ")
logistc_prediction = logit_sk_bn.predict(Xb_train[new_features])
print(metrics.accuracy_score(yb_train, logistc_prediction))
print(metrics.precision_score(yb_train, logistc_prediction))
print(metrics.recall_score(yb_train, logistc_prediction))
print("Test ")
logistc_prediction = logit_skn.predict(Xb_test[new_features])
print(metrics.accuracy_score(yb_test, logistc_prediction))
print(metrics.precision_score(yb_test, logistc_prediction))
print(metrics.recall_score(yb_test, logistc_prediction))

## XGBOOST

In [26]:
## XGBOOST
hyper = OrderedDict({
    'kfold':[5],
    'n_estimators': [1,10,1],
    'max_depth': range(1,10,1),
    'learning_rate': [0.005,0.01,0.03,0.05,0.1],
    'subsample': np.arange(0.10,1.0, 0.10),
    'min_child_weight': range(1, 10,1),
    'scale_pos_weight':[0.3,1,0.1]
})

In [27]:
## Defining the model
xgb = xgboost.sklearn.XGBClassifier(learning_rate =0.005,n_estimators=3,max_depth=30,min_child_weight=3,subsample=1,\
                         objective= 'binary:logistic',seed=SEED, scale_pos_wegiht=1,gamma=1,max_delta_step=2, reg_alpha=9)
xgb.fit(X_train, y_train)
#xgb_b = xgboost.sklearn.XGBClassifier(learning_rate =0.005,n_estimators=10,max_depth=4,min_child_weight=5,subsample=1,\
#                         objective= 'binary:logistic',seed=SEED)
#xgb_b.fit(Xb_train, yb_train)
print("")

  return f(**kwargs)


Parameters: { "scale_pos_wegiht" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.





In [28]:
results_train = xgb.predict_proba(X_train)
results_train_v2 = xgb.predict(X_train)
results_test = xgb.predict_proba(X_test)
results_test_v2 = xgb.predict(X_test)
#results_train_b = xgb_b.predict_proba(Xb_train)
#results_train_v2_b = xgb_b.predict(Xb_train)
#results_test_b = xgb_b.predict_proba(Xb_test)
#results_test_v2_b = xgb_b.predict(Xb_test)

In [32]:
print("Model Statistics for training data")
classification_stats(y_train.values.ravel(), results_train[:,1], results_train_v2)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(y_test.values.ravel(), results_test[:,1], results_test_v2)

Model Statistics for training data
Predicted     0     1
Actual               
0          2316  1538
1          1532  8130

Precision is:  0.8409184940008275
Recall is:  0.8414406955081764
Overall_Accuracy is:  0.7728617934300088
**************************************************
Model Statistics for test data
Predicted     0     1
Actual               
0          1039   964
1           992  4283

Precision is:  0.8162759672193635
Recall is:  0.8119431279620853
Overall_Accuracy is:  0.731244847485573


In [None]:
xgb.feature_importances_

In [33]:
xgb.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00262302, 0.00154574, 0.00702005,
       0.        , 0.        , 0.00819221, 0.00834113, 0.00169987,
       0.0016372 , 0.        , 0.16005133, 0.        , 0.00216387,
       0.        , 0.00320267, 0.        , 0.        , 0.00433253,
       0.00772985, 0.00662609, 0.        , 0.01790803, 0.00190869,
       0.        , 0.        , 0.00227297, 0.0180864 , 0.        ,
       0.        , 0.00110679, 0.        , 0.        , 0.        ,
       0.        , 0.07308543, 0.        , 0.00122111, 0.        ,
       0.00220141, 0.        , 0.        , 0.        , 0.        ,
       0.0390757 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00152446, 0.00245623, 0.03259165,
       0.        , 0.        , 0.00602653, 0.        , 0.04621453,
       0.        , 0.        , 0.00539874, 0.        , 0.00166725,
       0.10626037, 0.        , 0.10251787, 0.        , 0.     

In [31]:
#print("Model Statistics for training data")
#classification_stats(yb_train.values.ravel(), results_train_b[:,1], results_train_v2_b)
#print("*****"*10)
#print("Model Statistics for test data")
#classification_stats(yb_test.values.ravel(), results_test_b[:,1], results_test_v2_b)

## Random Forest

In [None]:
## Random Forest
model_rf = RandomForestClassifier(n_estimators=10, class_weight={0:1.5,1:1})
model_rf_b  = RandomForestClassifier(n_estimators=10)
hyper_rf = OrderedDict({
    'n_estimators': [10,1000,100],
    'max_depth': range(1,10,1),
    'min_samples_split':[10,100,10],
    'min_samples_leaf':[1,10,1],
    'max_features':['auto','sqrt'],
    'bootstrap':[True,False]
})
#model_rf_cv = GridSearchCV(estimator=model_rf, param_grid=hyper_rf, n_jobs=-1) 
model_rf_cv = RandomizedSearchCV(estimator=model_rf, param_distributions=hyper_rf, cv=3, verbose=2, n_jobs=-1)
#model_rf_cv_b = RandomizedSearchCV(estimator=model_rf_b, param_distributions=hyper_rf, cv=3, verbose=2, n_jobs=-1)

model_rf_cv = model_rf_cv.fit(X_train, y_train)
#model_rf_cv_b = model_rf_cv_b.fit(Xb_train, yb_train)

In [None]:
results_train = model_rf_cv.predict_proba(X_train)
results_train_v2 = model_rf_cv.predict(X_train)
results_test = model_rf_cv.predict_proba(X_test)
results_test_v2 = model_rf_cv.predict(X_test)
#results_train_b = model_rf_cv_b.predict_proba(Xb_train)
#results_train_v2_b = model_rf_cv_b.predict(Xb_train)
#results_test_b = model_rf_cv_b.predict_proba(Xb_test)
#results_test_v2_b = model_rf_cv_b.predict(Xb_test)

In [None]:
print("Model Statistics for training data")
classification_stats(y_train.values.ravel(), results_train[:,1], results_train_v2)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(y_test.values.ravel(), results_test[:,1], results_test_v2)

In [None]:
print("Model Statistics for training data")
classification_stats(yb_train.values.ravel(), results_train_b[:,1], results_train_v2_b)
print("*****"*10)
print("Model Statistics for test data")
classification_stats(yb_test.values.ravel(), results_test_b[:,1], results_test_v2_b)