In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math

from numpy import unique
from numpy import where
from matplotlib import pyplot

data = arff.loadarff('./../DefectData/inst/extdata/terapromise/mccabe/ar5.arff') 
df = pd.DataFrame(data[0])

In [2]:
df.shape

(36, 30)

In [3]:
df.head()

Unnamed: 0,total_loc,blank_loc,comment_loc,code_and_comment_loc,executable_loc,unique_operands,unique_operators,total_operands,total_operators,halstead_vocabulary,...,condition_count,multiple_condition_count,cyclomatic_complexity,cyclomatic_density,decision_density,design_complexity,design_density,normalized_cyclomatic_complexity,formal_parameters,defects
0,82.0,26.0,9.0,0.0,47.0,55.0,12.0,91.0,132.0,67.0,...,4.0,1.0,10.0,0.21277,2.5,20.0,2.0,0.12195,0.0,b'false'
1,16.0,6.0,2.0,1.0,8.0,13.0,6.0,18.0,20.0,19.0,...,0.0,0.0,2.0,0.25,0.0,1.0,0.5,0.125,1.0,b'false'
2,31.0,12.0,3.0,2.0,16.0,18.0,9.0,31.0,42.0,27.0,...,5.0,0.0,6.0,0.375,1.0,1.0,0.16667,0.19355,0.0,b'false'
3,477.0,104.0,89.0,2.0,284.0,150.0,29.0,482.0,699.0,179.0,...,116.0,25.0,93.0,0.32746,1.0172,4.0,0.043011,0.19497,0.0,b'true'
4,11.0,2.0,0.0,0.0,9.0,10.0,4.0,15.0,17.0,14.0,...,0.0,0.0,1.0,0.11111,0.0,2.0,2.0,0.090909,0.0,b'false'


In [4]:
buggy = b'true'
clean = b'false'
x = df.drop(['defects'],axis = 'columns')

for i in range(36):
    if df.iloc[i,29] == buggy:
        df.iloc[i,29] = 1
    else:
        df.iloc[i,29] = 0

y = df['defects']
y = y.astype(int)
y.value_counts()

0    28
1     8
Name: defects, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.4,stratify = y)

print(f"Size of training data : {len(x_train)}")
print(f"Distribution of y_train values :\n {y_train.value_counts()}")
print(f"Distribution of y_test values :\n {y_test.value_counts()}")

Size of training data : 21
Distribution of y_train values :
 0    16
1     5
Name: defects, dtype: int64
Distribution of y_test values :
 0    12
1     3
Name: defects, dtype: int64


In [6]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# making predictions on the testing set
y_pred = gnb.predict(x_test)

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.16666666666666666
Accuracy : 0.7333333333333333
F-Score : 0.8333333333333334


In [7]:
from sklearn.svm import SVC

svc_model = SVC() 
svc_model.fit(x_train, y_train) 

y_pred = svc_model .predict(x_test) 


from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.16666666666666666
Accuracy : 0.7333333333333333
F-Score : 0.8333333333333334


In [8]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)


from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.16666666666666666
Accuracy : 0.7333333333333333
F-Score : 0.8333333333333334


In [9]:
#Fitting Decision Tree classifier to the training set  
from sklearn.ensemble import RandomForestClassifier  
classifier= RandomForestClassifier(n_estimators= 10, criterion="entropy")  
classifier.fit(x_train, y_train)  

y_pred= classifier.predict(x_test)  

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.82915619758885
Accuracy : 0.9333333333333333
F-Score : 0.9565217391304348


In [10]:
#Fitting Logistic Regression to the training set  
from sklearn.linear_model import LogisticRegression  
classifier= LogisticRegression(random_state=0)  
classifier.fit(x_train, y_train) 

y_pred= classifier.predict(x_test)  

from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test,y_pred)
TP = conf[0,0]
FP = conf[0,1]
TN = conf[1,0]
FN = conf[1,1]

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.16666666666666666
Accuracy : 0.7333333333333333
F-Score : 0.8333333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
