In [5]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math

from numpy import unique
from numpy import where
from matplotlib import pyplot

# Read the ar6 ARFF file. The full loadarff result stays available as `data`;
# its first element is what the DataFrame is built from.
data = arff.loadarff(
    './../DefectData/inst/extdata/terapromise/mccabe/ar6.arff'
)
df = pd.DataFrame(data=data[0])

In [6]:
# Show the dimensions of the loaded frame as (rows, columns).
df.shape

(101, 30)

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,total_loc,blank_loc,comment_loc,code_and_comment_loc,executable_loc,unique_operands,unique_operators,total_operands,total_operators,halstead_vocabulary,...,condition_count,multiple_condition_count,cyclomatic_complexity,cyclomatic_density,decision_density,design_complexity,design_density,normalized_cyclomatic_complexity,formal_parameters,defects
0,8.0,0.0,4.0,0.0,4.0,8.0,6.0,10.0,12.0,14.0,...,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.13,0.0,b'false'
1,13.0,0.0,1.0,0.0,12.0,10.0,8.0,21.0,26.0,18.0,...,0.0,0.0,2.0,0.17,0.0,0.0,0.0,0.15,0.0,b'false'
2,20.0,0.0,12.0,0.0,8.0,14.0,12.0,21.0,34.0,26.0,...,3.0,0.0,4.0,0.5,1.0,0.0,0.0,0.2,0.0,b'false'
3,40.0,0.0,17.0,0.0,23.0,20.0,18.0,64.0,90.0,38.0,...,11.0,2.0,11.0,0.48,1.09,1.0,0.09,0.28,0.0,b'false'
4,8.0,0.0,3.0,0.0,5.0,7.0,13.0,13.0,21.0,20.0,...,1.0,0.0,3.0,0.6,2.0,0.0,0.0,0.38,1.0,b'false'


In [8]:
# Raw class labels as they appear in the 'defects' column of the loaded data.
buggy = b'true'
clean = b'false'

# Feature matrix: every column except the target.
x = df.drop(['defects'],axis = 'columns')


def _encode_defect(label):
    """Map a raw 'defects' value to 1 (buggy) or 0 (anything else).

    Byte-string labels are compared against `buggy`; values that are not
    bytes are treated as already encoded, so 1 stays 1 and everything else
    becomes 0. This keeps the cell safe to run more than once.
    """
    if isinstance(label, bytes):
        return int(label == buggy)
    return int(label == 1)


# Encode the target in place, addressing the column by name instead of a
# hard-coded row count and column position.
df['defects'] = df['defects'].map(_encode_defect)

y = df['defects'].astype(int)
y.value_counts()

0    86
1    15
Name: defects, dtype: int64

In [9]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class so both classes end up with the same number
# of rows. The seed is fixed because SMOTE generates synthetic rows randomly;
# without it every run yields different data and different metrics below.
smote = SMOTE(sampling_strategy = 'minority', random_state = 42)
x, y = smote.fit_resample(x,y)

# Class balance after resampling.
print(y.value_counts())

0    86
1    86
Name: defects, dtype: int64


In [10]:
# Integer class labels used by the evaluation cells below. These rebind the
# names that held the byte-string labels while the target was being encoded.
buggy = 1
clean = 0


def ASFM(DataFrame):
    """Return the sum of all feature values divided by the number of rows.

    The last column is excluded from the sum (callers pass frames whose last
    column is the cluster label).

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Numeric feature columns followed by one trailing column to ignore.

    Returns
    -------
    float
        Total of every value in all columns but the last, divided by the
        row count. An empty frame yields 0.0 instead of dividing by zero.
    """
    row_count = DataFrame.shape[0]

    # An empty cluster has nothing to average.
    if row_count == 0:
        return 0.0

    # One vectorised sum replaces a scalar .iloc lookup per cell.
    feature_total = DataFrame.iloc[:, :-1].to_numpy(dtype=float).sum()

    return float(feature_total) / row_count

In [11]:
from sklearn.cluster import KMeans

# Re-run safety: an earlier execution of a clustering cell leaves a 'cluster'
# column in x; drop it so the model is never fitted on stale labels.
x = x.drop(columns=['cluster'], errors='ignore')

# Fixed seed so the cluster assignment and the metrics below are reproducible.
kmeans_model = KMeans(n_clusters = 2, random_state = 42)
kmeans_model.fit(x)

kmeans_prediction = kmeans_model.predict(x)
x['cluster'] = kmeans_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# Cluster ids are arbitrary, so the defective cluster is chosen from the data:
# the cluster with the larger ASFM represents defected software, the other
# one non-defected software.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape

# NOTE(review): the predictions cover every row of x, which was oversampled in
# an earlier cell, while df only holds the original rows. zip stops after
# len(df) pairs, so this presumably scores only the original samples --
# confirm the resampler keeps them first and in the same order.
for defect, cluster_id in zip(df.iloc[:, m-1], kmeans_prediction):

    predicted_buggy = cluster_id == defective_cluster

    if defect == buggy and predicted_buggy:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_buggy:
        TN += 1
    else:
        FP += 1


#Matthew Correlation Coefficient
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# no sample is predicted buggy) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F1-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 2040.7923054872813
ASFM1 : 13006.922613103752
MCC : 0.1387596899224806
Accuracy : 0.7821782178217822
F-Score : 0.26666666666666666


In [18]:
from sklearn.cluster import MiniBatchKMeans

# Fixed seed so the cluster assignment and the metrics below are reproducible.
miniBatchKmeans_model = MiniBatchKMeans(n_clusters = 2, random_state = 42)

# Remove the labels written by a previous clustering cell; errors='ignore'
# keeps the cell runnable when no such column exists yet.
x = x.drop(columns=['cluster'], errors='ignore')
miniBatchKmeans_model.fit(x)
miniBatchKmeans_prediction = miniBatchKmeans_model.labels_
x['cluster'] = miniBatchKmeans_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# Cluster ids are arbitrary, so the defective cluster is chosen from the data:
# the cluster with the larger ASFM represents defected software, the other
# one non-defected software.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape

# NOTE(review): the predictions cover every row of x, which was oversampled in
# an earlier cell, while df only holds the original rows. zip stops after
# len(df) pairs, so this presumably scores only the original samples --
# confirm the resampler keeps them first and in the same order.
for defect, cluster_id in zip(df.iloc[:, m-1], miniBatchKmeans_prediction):

    predicted_buggy = cluster_id == defective_cluster

    if defect == buggy and predicted_buggy:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_buggy:
        TN += 1
    else:
        FP += 1


#Matthew Correlation Coefficient
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# no sample is predicted buggy) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 2040.7923054872813
ASFM1 : 13006.922613103752
MCC : 0.1387596899224806
Accuracy : 0.7821782178217822
F-Score : 0.26666666666666666


In [19]:
from sklearn.cluster import AgglomerativeClustering

ag_model = AgglomerativeClustering(n_clusters = 2)

# Remove the labels written by a previous clustering cell; errors='ignore'
# keeps the cell runnable when no such column exists yet.
x = x.drop(columns=['cluster'], errors='ignore')
ag_model.fit(x)
ag_prediction = ag_model.labels_
x['cluster'] = ag_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# Cluster ids are arbitrary, so the defective cluster is chosen from the data:
# the cluster with the larger ASFM represents defected software, the other
# one non-defected software.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape

# NOTE(review): the predictions cover every row of x, which was oversampled in
# an earlier cell, while df only holds the original rows. zip stops after
# len(df) pairs, so this presumably scores only the original samples --
# confirm the resampler keeps them first and in the same order.
for defect, cluster_id in zip(df.iloc[:, m-1], ag_prediction):

    predicted_buggy = cluster_id == defective_cluster

    if defect == buggy and predicted_buggy:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_buggy:
        TN += 1
    else:
        FP += 1


#Matthew Correlation Coefficient
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# no sample is predicted buggy) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 2194.3121112697686
ASFM1 : 13991.82883011853
MCC : 0.1047911556755288
Accuracy : 0.7920792079207921
F-Score : 0.22222222222222224


In [20]:
from sklearn.cluster import Birch

birch_model = Birch(threshold=0.01, n_clusters=2)

# Remove the labels written by a previous clustering cell; errors='ignore'
# keeps the cell runnable when no such column exists yet.
x = x.drop(columns=['cluster'], errors='ignore')
birch_model.fit(x)
birch_prediction = birch_model.labels_
x['cluster'] = birch_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# Cluster ids are arbitrary, so the defective cluster is chosen from the data:
# the cluster with the larger ASFM represents defected software, the other
# one non-defected software.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape

# NOTE(review): the predictions cover every row of x, which was oversampled in
# an earlier cell, while df only holds the original rows. zip stops after
# len(df) pairs, so this presumably scores only the original samples --
# confirm the resampler keeps them first and in the same order.
for defect, cluster_id in zip(df.iloc[:, m-1], birch_prediction):

    predicted_buggy = cluster_id == defective_cluster

    if defect == buggy and predicted_buggy:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_buggy:
        TN += 1
    else:
        FP += 1


#Matthew Correlation Coefficient
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# no sample is predicted buggy) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 2194.3121112697686
ASFM1 : 13991.82883011853
MCC : 0.1047911556755288
Accuracy : 0.7920792079207921
F-Score : 0.22222222222222224


In [21]:
from sklearn.cluster import SpectralClustering

# Fixed seed so the cluster assignment and the metrics below are reproducible.
SC_model = SpectralClustering(n_clusters = 2, random_state = 42)

# Remove the labels written by a previous clustering cell; errors='ignore'
# keeps the cell runnable when no such column exists yet.
x = x.drop(columns=['cluster'], errors='ignore')
SC_model.fit(x)

SC_prediction = SC_model.labels_
x['cluster'] = SC_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# Cluster ids are arbitrary, so the defective cluster is chosen from the data:
# the cluster with the larger ASFM represents defected software, the other
# one non-defected software.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape

# NOTE(review): the predictions cover every row of x, which was oversampled in
# an earlier cell, while df only holds the original rows. zip stops after
# len(df) pairs, so this presumably scores only the original samples --
# confirm the resampler keeps them first and in the same order.
for defect, cluster_id in zip(df.iloc[:, m-1], SC_prediction):

    predicted_buggy = cluster_id == defective_cluster

    if defect == buggy and predicted_buggy:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_buggy:
        TN += 1
    else:
        FP += 1


#Matthew Correlation Coefficient
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# no sample is predicted buggy) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F1-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")



ASFM0 : 3673.312234137398
ASFM1 : 353.22721543357784
MCC : 0.04176345347922784
Accuracy : 0.15841584158415842
F-Score : 0.2608695652173913


In [22]:
from sklearn.mixture import GaussianMixture

# Fixed seed so the component assignment and the metrics below are reproducible.
gm_model = GaussianMixture(n_components = 2, random_state = 42)

# Remove the labels written by a previous clustering cell; errors='ignore'
# keeps the cell runnable when no such column exists yet.
x = x.drop(columns=['cluster'], errors='ignore')
gm_model.fit(x)
gm_prediction = gm_model.predict(x)
x['cluster'] = gm_prediction


df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

# Cluster ids are arbitrary, so the defective cluster is chosen from the data:
# the cluster with the larger ASFM represents defected software, the other
# one non-defected software.
defective_cluster = 1 if ASFM1 > ASFM0 else 0


TP = 0
FN = 0
TN = 0
FP = 0

n,m = df.shape

# NOTE(review): the predictions cover every row of x, which was oversampled in
# an earlier cell, while df only holds the original rows. zip stops after
# len(df) pairs, so this presumably scores only the original samples --
# confirm the resampler keeps them first and in the same order.
for defect, cluster_id in zip(df.iloc[:, m-1], gm_prediction):

    predicted_buggy = cluster_id == defective_cluster

    if defect == buggy and predicted_buggy:
        TP += 1
    elif defect == buggy:
        FN += 1
    elif defect == clean and not predicted_buggy:
        TN += 1
    else:
        FP += 1


#Matthew Correlation Coefficient
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# no sample is predicted buggy) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 2027.1522527851382
ASFM1 : 12661.742512599765
MCC : 0.20006981225702639
Accuracy : 0.7920792079207921
F-Score : 0.3225806451612903
