In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math

from numpy import unique
from numpy import where
from matplotlib import pyplot

data = arff.loadarff('./../DefectData/inst/extdata/terapromise/mccabe/ar6.arff') 
df = pd.DataFrame(data[0])

In [2]:
df.shape

(101, 30)

In [3]:
df.head()

Unnamed: 0,total_loc,blank_loc,comment_loc,code_and_comment_loc,executable_loc,unique_operands,unique_operators,total_operands,total_operators,halstead_vocabulary,...,condition_count,multiple_condition_count,cyclomatic_complexity,cyclomatic_density,decision_density,design_complexity,design_density,normalized_cyclomatic_complexity,formal_parameters,defects
0,8.0,0.0,4.0,0.0,4.0,8.0,6.0,10.0,12.0,14.0,...,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.13,0.0,b'false'
1,13.0,0.0,1.0,0.0,12.0,10.0,8.0,21.0,26.0,18.0,...,0.0,0.0,2.0,0.17,0.0,0.0,0.0,0.15,0.0,b'false'
2,20.0,0.0,12.0,0.0,8.0,14.0,12.0,21.0,34.0,26.0,...,3.0,0.0,4.0,0.5,1.0,0.0,0.0,0.2,0.0,b'false'
3,40.0,0.0,17.0,0.0,23.0,20.0,18.0,64.0,90.0,38.0,...,11.0,2.0,11.0,0.48,1.09,1.0,0.09,0.28,0.0,b'false'
4,8.0,0.0,3.0,0.0,5.0,7.0,13.0,13.0,21.0,20.0,...,1.0,0.0,3.0,0.6,2.0,0.0,0.0,0.38,1.0,b'false'


In [4]:
buggy = b'true'
clean = b'false'
x = df.drop(['defects'],axis = 'columns')

for i in range(101):
    if df.iloc[i,29] == buggy:
        df.iloc[i,29] = 1
    else:
        df.iloc[i,29] = 0

y = df['defects']
y = y.astype(int)
y.value_counts()

0    86
1    15
Name: defects, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.4,stratify = y)

print(f"Size of training data : {len(x_train)}")
print(f"Distribution of y_train values :\n {y_train.value_counts()}")
print(f"Distribution of y_test values :\n {y_test.value_counts()}")

Size of training data : 60
Distribution of y_train values :
 0    51
1     9
Name: defects, dtype: int64
Distribution of y_test values :
 0    35
1     6
Name: defects, dtype: int64


In [6]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# making predictions on the testing set
y_pred = gnb.predict(x_test)

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.6667884087975601
Accuracy : 0.9024390243902439
F-Score : 0.9411764705882354


In [7]:
from sklearn.svm import SVC

svc_model = SVC() 
svc_model.fit(x_train, y_train) 

y_pred = svc_model .predict(x_test) 


from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.9001028747788694
Accuracy : 0.975609756097561
F-Score : 0.9859154929577464


In [8]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)


from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.8409178658720822
Accuracy : 0.9512195121951219
F-Score : 0.9705882352941176


In [9]:
from sklearn.ensemble import RandomForestClassifier  
classifier= RandomForestClassifier(n_estimators= 10, criterion="entropy")  
classifier.fit(x_train, y_train)  

y_pred= classifier.predict(x_test)  

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.6095238095238096
Accuracy : 0.9024390243902439
F-Score : 0.9428571428571428


In [10]:
#Fitting Logistic Regression to the training set  
from sklearn.linear_model import LogisticRegression  
classifier= LogisticRegression(random_state=0)  
classifier.fit(x_train, y_train) 

y_pred= classifier.predict(x_test)  

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.5457097393742018
Accuracy : 0.8780487804878049
F-Score : 0.9275362318840579


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
