In [7]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math

from numpy import unique
from numpy import where
from matplotlib import pyplot

data = arff.loadarff('./../DefectData/inst/extdata/terapromise/mccabe/ar4.arff') 
df = pd.DataFrame(data[0])

In [8]:
df.shape

(107, 30)

In [9]:
df.head()

Unnamed: 0,total_loc,blank_loc,comment_loc,code_and_comment_loc,executable_loc,unique_operands,unique_operators,total_operands,total_operators,halstead_vocabulary,...,condition_count,multiple_condition_count,cyclomatic_complexity,cyclomatic_density,decision_density,design_complexity,design_density,normalized_cyclomatic_complexity,formal_parameters,defects
0,103.0,61.0,3.0,0.0,39.0,26.0,19.0,81.0,111.0,45.0,...,7.0,1.0,8.0,0.20513,1.1429,2.0,0.25,0.07767,0.0,b'false'
1,53.0,22.0,5.0,0.0,26.0,18.0,16.0,38.0,54.0,34.0,...,5.0,2.0,5.0,0.19231,1.2,0.0,0.0,0.09434,1.0,b'false'
2,25.0,10.0,1.0,0.0,14.0,12.0,14.0,42.0,58.0,26.0,...,1.0,0.0,2.0,0.14286,1.0,0.0,0.0,0.08,2.0,b'false'
3,73.0,22.0,8.0,1.0,43.0,25.0,12.0,67.0,97.0,37.0,...,15.0,7.0,9.0,0.2093,1.0,0.0,0.0,0.12329,0.0,b'false'
4,69.0,21.0,17.0,0.0,31.0,16.0,9.0,28.0,48.0,25.0,...,13.0,6.0,8.0,0.25806,1.0,0.0,0.0,0.11594,0.0,b'false'


In [10]:
buggy = b'true'
clean = b'false'
x = df.drop(['defects'],axis = 'columns')

for i in range(107):
    if df.iloc[i,29] == buggy:
        df.iloc[i,29] = 1
    else:
        df.iloc[i,29] = 0

y = df['defects']
y = y.astype(int)
y.head()

0    0
1    0
2    0
3    0
4    0
Name: defects, dtype: int64

In [11]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy = 'minority')
x, y = smote.fit_resample(x,y)

print(y.value_counts())

0    87
1    87
Name: defects, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.4,stratify = y)

print(f"Size of training data : {len(x_train)}")
print(f"Distribution of y_train values :\n {y_train.value_counts()}")
print(f"Distribution of y_test values :\n {y_test.value_counts()}")

Size of training data : 104
Distribution of y_train values :
 0    52
1    52
Name: defects, dtype: int64
Distribution of y_test values :
 0    35
1    35
Name: defects, dtype: int64


In [13]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# making predictions on the testing set
y_pred = gnb.predict(x_test)

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.34291382751370236
Accuracy : 0.6571428571428571
F-Score : 0.7142857142857143


In [14]:
from sklearn.svm import SVC

svc_model = SVC() 
svc_model.fit(x_train, y_train) 

y_pred = svc_model .predict(x_test) 


from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.5068420748999524
Accuracy : 0.7428571428571429
F-Score : 0.775


In [15]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)


from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : -0.09124485970087103
Accuracy : 0.45714285714285713
F-Score : 0.5365853658536586


In [16]:
#Fitting Decision Tree classifier to the training set  
from sklearn.ensemble import RandomForestClassifier  
classifier= RandomForestClassifier(n_estimators= 10, criterion="entropy")  
classifier.fit(x_train, y_train)  

y_pred= classifier.predict(x_test)  

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.07580980435789034
Accuracy : 0.5285714285714286
F-Score : 0.6451612903225806


In [17]:
#Fitting Logistic Regression to the training set  
from sklearn.linear_model import LogisticRegression  
classifier= LogisticRegression(random_state=0)  
classifier.fit(x_train, y_train) 

y_pred= classifier.predict(x_test)  

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : -0.033314830232638475
Accuracy : 0.4857142857142857
F-Score : 0.5909090909090909


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
