In [1]:
# Importing Required Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as spy
import os
import warnings
warnings.filterwarnings('ignore')
# Importing Train Data
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,True
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,False


In [2]:
# Col Names
data.columns

Index(['id', 'loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e',
       'b', 't', 'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment',
       'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount',
       'defects'],
      dtype='object')

In [3]:
# Null Values
data.isnull().sum()

id                   0
loc                  0
v(g)                 0
ev(g)                0
iv(g)                0
n                    0
v                    0
l                    0
d                    0
i                    0
e                    0
b                    0
t                    0
lOCode               0
lOComment            0
lOBlank              0
locCodeAndComment    0
uniq_Op              0
uniq_Opnd            0
total_Op             0
total_Opnd           0
branchCount          0
defects              0
dtype: int64

In [4]:
# Shape of data
data.shape

(101763, 23)

In [5]:
# Info of Data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 101763 non-null  int64  
 1   loc                101763 non-null  float64
 2   v(g)               101763 non-null  float64
 3   ev(g)              101763 non-null  float64
 4   iv(g)              101763 non-null  float64
 5   n                  101763 non-null  float64
 6   v                  101763 non-null  float64
 7   l                  101763 non-null  float64
 8   d                  101763 non-null  float64
 9   i                  101763 non-null  float64
 10  e                  101763 non-null  float64
 11  b                  101763 non-null  float64
 12  t                  101763 non-null  float64
 13  lOCode             101763 non-null  int64  
 14  lOComment          101763 non-null  int64  
 15  lOBlank            101763 non-null  int64  
 16  lo

In [6]:
# Stats of data
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,101763.0,50881.0,29376.592059,0.0,25440.5,50881.0,76321.5,101762.0
loc,101763.0,37.34716,54.600401,1.0,13.0,22.0,42.0,3442.0
v(g),101763.0,5.492684,7.900855,1.0,2.0,3.0,6.0,404.0
ev(g),101763.0,2.845022,4.631262,1.0,1.0,1.0,3.0,165.0
iv(g),101763.0,3.498826,5.534541,1.0,1.0,2.0,4.0,402.0
n,101763.0,96.655995,171.147191,0.0,25.0,51.0,111.0,8441.0
v,101763.0,538.280956,1270.791601,0.0,97.67,232.79,560.25,80843.08
l,101763.0,0.111634,0.100096,0.0,0.05,0.09,0.15,1.0
d,101763.0,13.681881,14.121306,0.0,5.6,9.82,18.0,418.2
i,101763.0,27.573007,22.856742,0.0,15.56,23.36,34.34,569.78


#### From the statistics, we can clearly see that we need to apply feature scaling to normalise the data. 
#### Because Min Value is 0 and Max Values are far away from the range
#### Also, we need to convert our target column into an integer format before applying any Machine Learning algorithm.

In [7]:
# PairPlot
plt.figure(figsize=(12,12))
sns.pairplot(data,
            corner=True)
plt.show()

<Figure size 1200x1200 with 0 Axes>


KeyboardInterrupt



In [None]:
# Histogram
data.hist(figsize=(15,12),
          grid=False)
plt.show()

In [None]:
# Corr
plt.figure(figsize=(20,20))
sns.heatmap(data.corr(),
            annot=True,
            cmap='RdYlGn')
plt.title("Correlation Matrix of Software Defects DataSet")
plt.show()

In [None]:
# We import label encoder for converting our bool col type to int 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['defects'] = le.fit_transform(data['defects'])
data.head()

In [None]:
# We drop the ID Column
data_new = data
data_new

### Here we do not use features scaling techniques because most of the tree-based, Decision tree and random forest algorithms do not work with feature scaling

In [None]:
# Define X and Y Variable
Y = data_new[['defects']]
X = data_new.drop(columns=['defects'])

# Splitting the data into X and Y Train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)
# len of our splitted data
len(X_train),len(X_test),len(Y_train),len(Y_test)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(n_jobs=-1,
                        random_state=42,)

lr_model = LR.fit(X_train,
                  Y_train)

# Prediction
Y_test["Pred_Logi"]=lr_model.predict(X_test)

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=42)
model = dtree.fit(X_train, Y_train)
model

# Prediction
Y_test['Pred_Dtree'] = model.predict(X_test)

In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state= 42,
                             n_estimators= 2000,
                             criterion='log_loss',
                             n_jobs=-1)
mdl = rfc.fit(X_train, Y_train)
mdl

# Prediction
Y_test['Pred_RFC'] = mdl.predict(X_test)

In [None]:
# SVM (Support Vector Machine)
from sklearn.svm import SVC
svm = SVC(random_state=42)
model2 = svm.fit(X_train,Y_train)
model2
Y_test["Predicted_SVM"] = model2.predict(X_test)

In [None]:
# ExtraTree Classifiers
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=1000,
                           n_jobs=-1,
                           random_state=42)

model_etc = etc.fit(X_train,
                    Y_train)
model_etc

# Prediction
Y_test['Pred_ExtraTree'] = model_etc.predict(X_test)

In [None]:
# AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=1000, 
                         random_state=42)

model_abc = abc.fit(X_train,
                    Y_train)
model_abc

# Prediction
Y_test['Pred_AdaB'] = model_abc.predict(X_test)

In [None]:
# GradiantBoost Classifier
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=1000, 
                                 random_state=42)

model_gbc = gbc.fit(X_train,
                    Y_train)
model_gbc

# Prediction
Y_test['Pred_GBC'] = model_gbc.predict(X_test)

In [None]:
# XGBoost Classifier
import xgboost as xgb

xgb_classifier = xgb.XGBClassifier(n_estimators=1000, 
                                   random_state=42)

model_xgb = xgb_classifier.fit(X_train, 
                               Y_train)

# Prediction
Y_test['Pred_xgb'] = model_xgb.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Logistic Regression 
print("Logistic Regression")
print(classification_report(Y_test['defects'], Y_test["Pred_Logi"]))
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")

# DeTree
print("Decision Tree")
print(classification_report(Y_test['defects'], Y_test['Pred_Dtree']))
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")


# for random forest
print("Random Forest")
print(classification_report(Y_test["defects"],Y_test["Pred_RFC"]))
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")


# SVm
print("Support Vector Machine")
print(classification_report(Y_test["defects"],Y_test["Predicted_SVM"]))
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")


# ExtraTree
print("ExtraTree Classifiers")
print(classification_report(Y_test["defects"],Y_test["Pred_ExtraTree"]))
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")


# AdaBoost
print('AdaBoost')
print(classification_report(Y_test['defects'],Y_test['Pred_AdaB']))
print('--------------------------------------------------------------------')
print("--------------------------------------------------------------------")


# GradiantBoost
print("GradiantBoost Classifiers")
print(classification_report(Y_test["defects"],Y_test["Pred_GBC"]))
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")


# XGBoost
print("XGBoost Classifiers")
print(classification_report(Y_test["defects"],Y_test["Pred_xgb"]))
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")

After Running Few Machine Learning Algorithms, We Conclued
Based on the F1-scores and considering accuracy, the best-performing models seem to be Random Forest, ExtraTree Classifiers, AdaBoost, and Gradient Boost Classifiers, all of which have an accuracy around 0.81. Among these, Gradient Boost Classifiers have slightly better F1-scores, which indicates a better balance between precision and recall for both classes. Therefore, Gradient Boost Classifiers could be considered the best-performing model based on the provided evaluation metrics.

In [None]:
from sklearn.metrics import roc_auc_score

# Calculate AUC for Logistic Regression
auc_logi = roc_auc_score(Y_test['defects'], Y_test['Pred_Logi'])
print(f"AUC for Logistic Regression: {auc_logi}")

# Calculate AUC for Decision Tree
auc_dtree = roc_auc_score(Y_test['defects'], Y_test['Pred_Dtree'])
print(f"AUC for Decision Tree: {auc_dtree}")

# Calculate AUC for Random Forest
auc_rfc = roc_auc_score(Y_test["defects"], Y_test["Pred_RFC"])
print(f"AUC for Random Forest: {auc_rfc}")

# Calculate AUC for SVM
auc_svm = roc_auc_score(Y_test["defects"], Y_test["Predicted_SVM"])
print(f"AUC for Support Vector Machine: {auc_svm}")

# Calculate AUC for ExtraTree Classifiers
auc_extra_tree = roc_auc_score(Y_test["defects"], Y_test["Pred_ExtraTree"])
print(f"AUC for ExtraTree Classifiers: {auc_extra_tree}")

# Calculate AUC for AdaBoost
auc_adaboost = roc_auc_score(Y_test['defects'], Y_test['Pred_AdaB'])
print(f"AUC for AdaBoost: {auc_adaboost}")

# Calculate AUC for GradientBoost
auc_gradient_boost = roc_auc_score(Y_test["defects"], Y_test["Pred_GBC"])
print(f"AUC for GradientBoost Classifiers: {auc_gradient_boost}")

# Calculate AUC for XGBoost
auc_xgb = roc_auc_score(Y_test["defects"], Y_test["Pred_xgb"])
print(f"AUC for XGBoost Classifiers: {auc_xgb}")


**** After Seeing the AUC Curve, We Can Clearly See that the AUC =< 0.65 Models performed well, but I am considering the AUC score and classification report. ****

**** After seeing both the parameters, I am considering the Gradient Boost as the best model.****