In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math

from numpy import unique
from numpy import where
from matplotlib import pyplot

data = arff.loadarff('./../DefectData/inst/extdata/terapromise/mccabe/ar1.arff') 
df = pd.DataFrame(data[0])

In [2]:
df.shape

(121, 30)

In [3]:
df.head()

Unnamed: 0,total_loc,blank_loc,comment_loc,code_and_comment_loc,executable_loc,unique_operands,unique_operators,total_operands,total_operators,halstead_vocabulary,...,condition_count,multiple_condition_count,cyclomatic_complexity,cyclomatic_density,decision_density,design_complexity,design_density,normalized_cyclomatic_complexity,formal_parameters,defects
0,7.0,0.0,4.0,0.0,3.0,8.0,6.0,10.0,12.0,14.0,...,0.0,0.0,1.0,0.33,0.0,0.0,0.0,0.14,0.0,b'false'
1,9.0,0.0,1.0,0.0,8.0,7.0,8.0,15.0,20.0,15.0,...,0.0,0.0,2.0,0.25,0.0,0.0,0.0,0.22,0.0,b'false'
2,21.0,0.0,14.0,1.0,7.0,15.0,12.0,21.0,36.0,27.0,...,4.0,1.0,4.0,0.57,1.0,1.0,0.25,0.19,0.0,b'false'
3,30.0,0.0,11.0,0.0,19.0,16.0,18.0,50.0,70.0,34.0,...,9.0,2.0,9.0,0.47,1.11,1.0,0.11,0.3,0.0,b'true'
4,8.0,0.0,2.0,0.0,6.0,4.0,5.0,5.0,10.0,9.0,...,1.0,0.0,2.0,0.33,1.0,1.0,0.5,0.25,0.0,b'false'


In [4]:
buggy = b'true'
clean = b'false'
x = df.drop(['defects'],axis = 'columns')

for i in range(121):
    if df.iloc[i,29] == buggy:
        df.iloc[i,29] = 1
    else:
        df.iloc[i,29] = 0

y = df['defects']
y = y.astype(int)
y.head()

0    0
1    0
2    0
3    1
4    0
Name: defects, dtype: int64

In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy = 'minority')
x, y = smote.fit_resample(x,y)

print(y.value_counts())

0    112
1    112
Name: defects, dtype: int64


In [6]:
buggy = 1
clean = 0

In [7]:
from sklearn.cluster import KMeans

kmeans_model = KMeans(n_clusters = 2)
kmeans_model.fit(x)

kmeans_prediction = kmeans_model.predict(x)
x['cluster'] = kmeans_prediction

kmeans_prediction[:5]

array([0, 0, 0, 0, 0], dtype=int32)

In [8]:
def ASFM(DataFrame):
    SFM = 0
    
    x,y = DataFrame.shape
    
    for i in range(x):
        for j in range(y-1):
            SFM += DataFrame.iloc[i,j]
    
    return SFM/x

In [9]:
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

In [10]:
ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

ASFM0 : 3541.9309168718905
ASFM1 : 23532.830814374243


In [11]:
#ASFM0 value is greater than ASFM1, so label 0 will represent defected software and 1 represents non-defected 
#software



TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape

for i in range(n):
    
    defect = df.iloc[i,m-1]
    
    if defect == buggy and kmeans_prediction[i]==1:
        TP += 1
    elif defect == buggy and kmeans_prediction[i]==0:
        FN += 1
    elif defect == clean and kmeans_prediction[i]==0:
        TN += 1
    else:
        FP += 1

In [12]:
#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.2254254980353404
Accuracy : 0.9090909090909091
F-Score : 0.26666666666666666


In [161]:
from sklearn.cluster import MiniBatchKMeans

miniBatchKmeans_model = MiniBatchKMeans(n_clusters = 2)

x = x.drop(['cluster'],axis = 'columns')
miniBatchKmeans_model.fit(x)
miniBatchKmeans_prediction = miniBatchKmeans_model.labels_
x['cluster'] = miniBatchKmeans_prediction

miniBatchKmeans_prediction[:5]

array([1, 1, 1, 1, 1], dtype=int32)

In [162]:
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

ASFM0 : 32786.260750196096
ASFM1 : 3832.5396591691892


In [163]:
#ASFM1 value is greater than ASFM0, so label 1 will represent defected software and 0 represents non-defected 
#software



TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape


for i in range(n):
    
    defect = df.iloc[i,m-1]
    
    if defect == buggy and miniBatchKmeans_prediction[i]==0:
        TP += 1
    elif defect == buggy and miniBatchKmeans_prediction[i]==1:
        FN += 1
    elif defect == clean and miniBatchKmeans_prediction[i]==1:
        TN += 1
    else:
        FP += 1

In [164]:
#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.15736058322268526
Accuracy : 0.9173553719008265
F-Score : 0.16666666666666666


In [165]:
from sklearn.cluster import AgglomerativeClustering

ag_model = AgglomerativeClustering(n_clusters = 2)

x = x.drop(['cluster'],axis = 'columns')
ag_model.fit(x)
ag_prediction = ag_model.labels_
x['cluster'] = ag_prediction

ag_prediction[:5]


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

ASFM0 : 3993.696692976133
ASFM1 : 35370.558518369624


In [166]:
#ASFM1 value is greater than ASFM0, so label 1 will represent defected software and 0 represents non-defected 
#software



TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape


for i in range(n):
    
    defect = df.iloc[i,m-1]
    
    if defect == buggy and ag_prediction[i]==1:
        TP += 1
    elif defect == buggy and ag_prediction[i]==0:
        FN += 1
    elif defect == clean and ag_prediction[i]==0:
        TN += 1
    else:
        FP += 1
        
        
        

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.21028993178188585
Accuracy : 0.9256198347107438
F-Score : 0.1818181818181818


In [167]:
from sklearn.cluster import Birch

birch_model = Birch(threshold=0.01, n_clusters=2)

x = x.drop(['cluster'],axis = 'columns')
birch_model.fit(x)
birch_prediction = birch_model.labels_
x['cluster'] = birch_prediction

birch_prediction[:5]


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

ASFM0 : 17340.733640453775
ASFM1 : 2045.0018901348421


In [168]:
#ASFM1 value is greater than ASFM0, so label 1 will represent defected software and 0 represents non-defected 
#software



TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape


for i in range(n):
    
    defect = df.iloc[i,m-1]
    
    if defect == buggy and birch_prediction[i]==0:
        TP += 1
    elif defect == buggy and birch_prediction[i]==1:
        FN += 1
    elif defect == clean and birch_prediction[i]==1:
        TN += 1
    else:
        FP += 1
        
        
        

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.32745656883854213
Accuracy : 0.8925619834710744
F-Score : 0.380952380952381


In [169]:
from sklearn.cluster import SpectralClustering

SC_model = SpectralClustering(n_clusters = 2)

x = x.drop(['cluster'],axis = 'columns')
SC_model.fit(x)

SC_prediction = SC_model.labels_
x['cluster'] = SC_prediction

SC_prediction[:5]


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")



ASFM0 : 5298.198225763355
ASFM1 : 2026.0025871649589


In [170]:
#ASFM1 value is greater than ASFM0, so label 1 will represent defected software and 0 represents non-defected 
#software



TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape


for i in range(n):
    
    defect = df.iloc[i,m-1]
    
    if defect == buggy and SC_prediction[i]==0:
        TP += 1
    elif defect == buggy and SC_prediction[i]==1:
        FN += 1
    elif defect == clean and SC_prediction[i]==1:
        TN += 1
    else:
        FP += 1
        
        
        

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F1-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.0367496968162519
Accuracy : 0.09090909090909091
F-Score : 0.140625


In [171]:
from sklearn.mixture import GaussianMixture

gm_model = GaussianMixture(n_components = 2)

x = x.drop(['cluster'],axis = 'columns')
gm_model.fit(x)
gm_prediction = gm_model.predict(x)
x['cluster'] = gm_prediction

gm_prediction[:5]


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

ASFM0 : 1860.2472253178464
ASFM1 : 11863.989819844122


In [172]:
#ASFM1 value is greater than ASFM0, so label 1 will represent defected software and 0 represents non-defected 
#software



TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape


for i in range(n):
    
    defect = df.iloc[i,m-1]
    
    if defect == buggy and gm_prediction[i]==1:
        TP += 1
    elif defect == buggy and gm_prediction[i]==0:
        FN += 1
    elif defect == clean and gm_prediction[i]==0:
        TN += 1
    else:
        FP += 1
        
        
        

#Matthew Correlation Coefficient

MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
#F-Score
F = 2*(precision*recall)/(precision+recall)

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

MCC : 0.586744587362836
Accuracy : 0.9256198347107438
F-Score : 0.6086956521739131
