In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math

from numpy import unique
from numpy import where
from matplotlib import pyplot

# Location of the AR5 dataset (McCabe metrics), relative to the notebook directory.
ARFF_PATH = './../DefectData/inst/extdata/terapromise/mccabe/ar5.arff'

# loadarff returns a (records, metadata) pair; only the records go into the frame.
data = arff.loadarff(ARFF_PATH)
records = data[0]
df = pd.DataFrame(records)

In [2]:
df.shape

(36, 30)

In [3]:
df.head()

Unnamed: 0,total_loc,blank_loc,comment_loc,code_and_comment_loc,executable_loc,unique_operands,unique_operators,total_operands,total_operators,halstead_vocabulary,...,condition_count,multiple_condition_count,cyclomatic_complexity,cyclomatic_density,decision_density,design_complexity,design_density,normalized_cyclomatic_complexity,formal_parameters,defects
0,82.0,26.0,9.0,0.0,47.0,55.0,12.0,91.0,132.0,67.0,...,4.0,1.0,10.0,0.21277,2.5,20.0,2.0,0.12195,0.0,b'false'
1,16.0,6.0,2.0,1.0,8.0,13.0,6.0,18.0,20.0,19.0,...,0.0,0.0,2.0,0.25,0.0,1.0,0.5,0.125,1.0,b'false'
2,31.0,12.0,3.0,2.0,16.0,18.0,9.0,31.0,42.0,27.0,...,5.0,0.0,6.0,0.375,1.0,1.0,0.16667,0.19355,0.0,b'false'
3,477.0,104.0,89.0,2.0,284.0,150.0,29.0,482.0,699.0,179.0,...,116.0,25.0,93.0,0.32746,1.0172,4.0,0.043011,0.19497,0.0,b'true'
4,11.0,2.0,0.0,0.0,9.0,10.0,4.0,15.0,17.0,14.0,...,0.0,0.0,1.0,0.11111,0.0,2.0,2.0,0.090909,0.0,b'false'


In [4]:
# Raw class labels as they appear in the 'defects' column (byte strings).
buggy = b'true'
clean = b'false'

# Feature matrix: every column except the class label.
x = df.drop(['defects'],axis = 'columns')

# Encode the label column in place as 1 (buggy) / 0 (anything else).
# The column is addressed by name instead of hard-coded row/column positions,
# and values that are already numeric are passed through unchanged, so
# re-running this cell does not wipe the labels to all zeros.
df['defects'] = df['defects'].map(
    lambda label: int(label == buggy) if isinstance(label, bytes) else int(label)
)

y = df['defects']
y = y.astype(int)
y.value_counts()

0    28
1     8
Name: defects, dtype: int64

In [5]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class until both classes have the same number of rows.
# SMOTE generates synthetic rows at random, so the seed is fixed to make every
# downstream clustering result reproducible under Restart & Run All.
smote = SMOTE(sampling_strategy = 'minority', random_state = 42)
x, y = smote.fit_resample(x,y)

print(y.value_counts())

0    28
1    28
Name: defects, dtype: int64


In [6]:
# Integer class codes used when scoring predictions against df['defects']:
# 1 marks a defective module, 0 a clean one.
buggy, clean = 1, 0

In [7]:
def ASFM(DataFrame):
    """Return the average per-row sum of feature values for a cluster.

    Every column except the last one is summed over all rows and the total
    is divided by the number of rows. The last column is skipped because the
    callers pass frames whose final column is the cluster id.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Numeric feature columns followed by one trailing non-feature column.

    Returns
    -------
    float
        Sum of all feature values divided by the row count, or 0.0 when the
        frame has no rows (an empty cluster) instead of raising
        ZeroDivisionError.
    """
    n_rows = DataFrame.shape[0]

    # An empty cluster has no metric mass; avoid dividing by zero.
    if n_rows == 0:
        return 0.0

    # Vectorised sum over all feature cells instead of one iloc lookup per cell.
    feature_total = DataFrame.iloc[:, :-1].to_numpy(dtype=float).sum()

    return feature_total / n_rows

In [8]:
from sklearn.cluster import KMeans

# Two clusters: one is expected to gather defective modules, the other clean ones.
# The seed is fixed because K-Means initialisation is random and the 0/1
# cluster ids are arbitrary; without it the results change between runs.
kmeans_model = KMeans(n_clusters = 2, random_state = 42)

# Never fit on a 'cluster' column left in x by an earlier run of any clustering cell.
x = x.drop(['cluster'],axis = 'columns', errors = 'ignore')
kmeans_model.fit(x)

kmeans_prediction = kmeans_model.predict(x)
x['cluster'] = kmeans_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM value represents defected software and the
# other one non-defected software. The mapping is derived from the values
# instead of being hard-coded, because cluster ids carry no meaning.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

# Ground truth comes from df (the original rows), so only the first n
# predictions are scored.
# NOTE(review): this assumes the resampled x keeps the original rows first and
# appends synthetic rows after them -- confirm against the resampler docs.
n,m = df.shape

for i in range(n):

    defect = df.iloc[i,m-1]
    predicted_defective = kmeans_prediction[i] == defective_cluster

    if defect == buggy and predicted_defective:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_defective:
        TN += 1
    else:
        FP += 1


#Matthews Correlation Coefficient (defined as 0 when any marginal count is zero)

mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
# Guard the ratios so a class that is never predicted yields 0 instead of a crash.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F1-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 35244.51015010911
ASFM1 : 231015.90694986438
MCC : 0.1620509308880411
Accuracy : 0.7777777777777778
F-Score : 0.2


In [9]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch K-Means samples batches and initial centres at random; the seed is
# fixed so the partition and the arbitrary 0/1 cluster ids are reproducible.
miniBatchKmeans_model = MiniBatchKMeans(n_clusters = 2, random_state = 42)

# Remove the 'cluster' column written by a previous clustering cell, if any,
# so it is not used as a feature (and the cell also runs on its own).
x = x.drop(['cluster'],axis = 'columns', errors = 'ignore')
miniBatchKmeans_model.fit(x)
miniBatchKmeans_prediction = miniBatchKmeans_model.labels_
x['cluster'] = miniBatchKmeans_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM value represents defected software and the
# other one non-defected software. The mapping is derived from the values
# instead of being hard-coded, because cluster ids carry no meaning.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

# Ground truth comes from df (the original rows), so only the first n
# predictions are scored.
# NOTE(review): this assumes the resampled x keeps the original rows first and
# appends synthetic rows after them -- confirm against the resampler docs.
n,m = df.shape


for i in range(n):

    defect = df.iloc[i,m-1]
    predicted_defective = miniBatchKmeans_prediction[i] == defective_cluster

    if defect == buggy and predicted_defective:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_defective:
        TN += 1
    else:
        FP += 1


#Matthews Correlation Coefficient (defined as 0 when any marginal count is zero)

mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
# Guard the ratios so a class that is never predicted yields 0 instead of a crash.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 12943.89115948469
ASFM1 : 96405.20269922275
MCC : 0.46291004988627577
Accuracy : 0.8055555555555556
F-Score : 0.5882352941176471


In [11]:
from sklearn.cluster import AgglomerativeClustering

# Hierarchical (agglomerative) clustering cut at two clusters.
ag_model = AgglomerativeClustering(n_clusters = 2)

# Remove the 'cluster' column written by a previous clustering cell, if any,
# so it is not used as a feature (and the cell also runs on its own).
x = x.drop(['cluster'],axis = 'columns', errors = 'ignore')
ag_model.fit(x)
ag_prediction = ag_model.labels_
x['cluster'] = ag_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM value represents defected software and the
# other one non-defected software. The mapping is derived from the values
# instead of being hard-coded, because cluster ids carry no meaning.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

# Ground truth comes from df (the original rows), so only the first n
# predictions are scored.
# NOTE(review): this assumes the resampled x keeps the original rows first and
# appends synthetic rows after them -- confirm against the resampler docs.
n,m = df.shape


for i in range(n):

    defect = df.iloc[i,m-1]
    predicted_defective = ag_prediction[i] == defective_cluster

    if defect == buggy and predicted_defective:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_defective:
        TN += 1
    else:
        FP += 1


#Matthews Correlation Coefficient (defined as 0 when any marginal count is zero)

mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
# Guard the ratios so a class that is never predicted yields 0 instead of a crash.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 35244.51015010911
ASFM1 : 231015.90694986438
MCC : 0.1620509308880411
Accuracy : 0.7777777777777778
F-Score : 0.2


In [13]:
from sklearn.cluster import Birch

# BIRCH with a small subcluster threshold, reduced to two final clusters.
birch_model = Birch(threshold=0.01, n_clusters=2)

# Remove the 'cluster' column written by a previous clustering cell, if any,
# so it is not used as a feature (and the cell also runs on its own).
x = x.drop(['cluster'],axis = 'columns', errors = 'ignore')
birch_model.fit(x)
birch_prediction = birch_model.labels_
x['cluster'] = birch_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM value represents defected software and the
# other one non-defected software. The mapping is derived from the values
# instead of being hard-coded, because cluster ids carry no meaning.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

# Ground truth comes from df (the original rows), so only the first n
# predictions are scored.
# NOTE(review): this assumes the resampled x keeps the original rows first and
# appends synthetic rows after them -- confirm against the resampler docs.
n,m = df.shape


for i in range(n):

    defect = df.iloc[i,m-1]
    predicted_defective = birch_prediction[i] == defective_cluster

    if defect == buggy and predicted_defective:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_defective:
        TN += 1
    else:
        FP += 1


#Matthews Correlation Coefficient (defined as 0 when any marginal count is zero)

mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
# Guard the ratios so a class that is never predicted yields 0 instead of a crash.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 35244.51015010911
ASFM1 : 231015.90694986438
MCC : 0.1620509308880411
Accuracy : 0.7777777777777778
F-Score : 0.2


In [14]:
from sklearn.cluster import SpectralClustering

# Spectral clustering into two groups; the seed is fixed so the partition and
# the arbitrary 0/1 cluster ids are reproducible between runs.
SC_model = SpectralClustering(n_clusters = 2, random_state = 42)

# Remove the 'cluster' column written by a previous clustering cell, if any,
# so it is not used as a feature (and the cell also runs on its own).
x = x.drop(['cluster'],axis = 'columns', errors = 'ignore')
SC_model.fit(x)

SC_prediction = SC_model.labels_
x['cluster'] = SC_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM value represents defected software and the
# other one non-defected software. The mapping is derived from the values
# instead of being hard-coded, because cluster ids carry no meaning.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

# Ground truth comes from df (the original rows), so only the first n
# predictions are scored.
# NOTE(review): this assumes the resampled x keeps the original rows first and
# appends synthetic rows after them -- confirm against the resampler docs.
n,m = df.shape


for i in range(n):

    defect = df.iloc[i,m-1]
    predicted_defective = SC_prediction[i] == defective_cluster

    if defect == buggy and predicted_defective:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_defective:
        TN += 1
    else:
        FP += 1


#Matthews Correlation Coefficient (defined as 0 when any marginal count is zero)

mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
# Guard the ratios so a class that is never predicted yields 0 instead of a crash.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F1-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")



ASFM0 : 47409.134853581105
ASFM1 : 456.73835599999995
MCC : 0.12964074471043288
Accuracy : 0.2777777777777778
F-Score : 0.38095238095238093


In [18]:
from sklearn.mixture import GaussianMixture

# Two-component Gaussian mixture; the seed is fixed so the fitted components
# and the arbitrary 0/1 component ids are reproducible between runs.
gm_model = GaussianMixture(n_components = 2, random_state = 42)

# Remove the 'cluster' column written by a previous clustering cell, if any,
# so it is not used as a feature (and the cell also runs on its own).
x = x.drop(['cluster'],axis = 'columns', errors = 'ignore')
gm_model.fit(x)
gm_prediction = gm_model.predict(x)
x['cluster'] = gm_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

# The cluster with the larger ASFM value represents defected software and the
# other one non-defected software. The mapping is derived from the values
# instead of being hard-coded, because cluster ids carry no meaning.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

# Ground truth comes from df (the original rows), so only the first n
# predictions are scored.
# NOTE(review): this assumes the resampled x keeps the original rows first and
# appends synthetic rows after them -- confirm against the resampler docs.
n,m = df.shape


for i in range(n):

    defect = df.iloc[i,m-1]
    predicted_defective = gm_prediction[i] == defective_cluster

    if defect == buggy and predicted_defective:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_defective:
        TN += 1
    else:
        FP += 1


#Matthews Correlation Coefficient (defined as 0 when any marginal count is zero)

mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
# Guard the ratios so a class that is never predicted yields 0 instead of a crash.
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 34669.53439132276
ASFM1 : 189547.74261414836
MCC : 0.3223291856101521
Accuracy : 0.8055555555555556
F-Score : 0.36363636363636365
