In [9]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math

from numpy import unique
from numpy import where
from matplotlib import pyplot

# Read the AR4 McCabe-metrics ARFF file; loadarff returns a tuple whose first
# element holds the records, which becomes the working DataFrame.
ARFF_PATH = './../DefectData/inst/extdata/terapromise/mccabe/ar4.arff'
data = arff.loadarff(ARFF_PATH)
df = pd.DataFrame(data[0])

In [10]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(107, 30)

In [11]:
# Preview the first five rows to check the metric columns and the raw 'defects' labels.
df.head()

Unnamed: 0,total_loc,blank_loc,comment_loc,code_and_comment_loc,executable_loc,unique_operands,unique_operators,total_operands,total_operators,halstead_vocabulary,...,condition_count,multiple_condition_count,cyclomatic_complexity,cyclomatic_density,decision_density,design_complexity,design_density,normalized_cyclomatic_complexity,formal_parameters,defects
0,103.0,61.0,3.0,0.0,39.0,26.0,19.0,81.0,111.0,45.0,...,7.0,1.0,8.0,0.20513,1.1429,2.0,0.25,0.07767,0.0,b'false'
1,53.0,22.0,5.0,0.0,26.0,18.0,16.0,38.0,54.0,34.0,...,5.0,2.0,5.0,0.19231,1.2,0.0,0.0,0.09434,1.0,b'false'
2,25.0,10.0,1.0,0.0,14.0,12.0,14.0,42.0,58.0,26.0,...,1.0,0.0,2.0,0.14286,1.0,0.0,0.0,0.08,2.0,b'false'
3,73.0,22.0,8.0,1.0,43.0,25.0,12.0,67.0,97.0,37.0,...,15.0,7.0,9.0,0.2093,1.0,0.0,0.0,0.12329,0.0,b'false'
4,69.0,21.0,17.0,0.0,31.0,16.0,9.0,28.0,48.0,25.0,...,13.0,6.0,8.0,0.25806,1.0,0.0,0.0,0.11594,0.0,b'false'


In [12]:
# Raw class labels are byte strings in the loaded frame (see the df.head() preview).
buggy = b'true'
clean = b'false'

# Encode the target as 1 (defective) / 0 (anything else) in one vectorised step
# rather than looping over a hard-coded 107 rows and column index 29, so the
# cell keeps working if the dataset's shape changes.
# The dtype guard makes the cell safe to re-run: once the column holds integers
# it is left alone instead of being compared to b'true' again (which would
# silently turn every label into 0).
if df['defects'].dtype == object:
    df['defects'] = df['defects'].eq(buggy).astype(int)

# Feature matrix: every column except the target.
x = df.drop(columns=['defects'])

# Target vector as integers; the class balance is shown as the cell output.
y = df['defects'].astype(int)
y.value_counts()

0    87
1    20
Name: defects, dtype: int64

In [13]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class until both classes have the same number of rows.
# A fixed random_state makes the synthetic rows (and every metric computed
# downstream from them) reproducible across kernel restarts; 42 is an
# arbitrary seed.
smote = SMOTE(sampling_strategy = 'minority', random_state = 42)
x, y = smote.fit_resample(x,y)

print(y.value_counts())

0    87
1    87
Name: defects, dtype: int64


In [14]:
# From here on the class labels are the integer codes stored in df['defects']:
# 1 marks a defective module, 0 a clean one.
buggy, clean = 1, 0

In [15]:
from sklearn.cluster import KMeans

# Cluster on the metric columns only. If this cell (or a later clustering
# cell) has already run, x carries a 'cluster' column; drop it so stale labels
# are never fed back in as a feature. errors='ignore' keeps the first run working.
x = x.drop(columns = ['cluster'], errors = 'ignore')

# Fixed seed so the centroid initialisation, and therefore the labels, are
# reproducible; 42 is an arbitrary seed.
kmeans_model = KMeans(n_clusters = 2, random_state = 42)
kmeans_model.fit(x)

# Store the assigned cluster id per row next to the features; later cells
# split x on this column.
kmeans_prediction = kmeans_model.predict(x)
x['cluster'] = kmeans_prediction

kmeans_prediction[:5]

array([0, 0, 0, 0, 0], dtype=int32)

In [16]:
def ASFM(DataFrame):
    """Return the average per-row sum of every column except the last.

    All values in all columns but the last one are added together and the
    total is divided by the number of rows. The last column is skipped; the
    callers in this notebook pass frames whose last column is the cluster id.
    (ASFM presumably stands for "average sum of feature metrics" - confirm.)

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Numeric frame; the final column is ignored.

    Returns
    -------
    float
        Total of the included values divided by the row count, or 0.0 when
        the frame has no rows (previously a ZeroDivisionError).
    """
    row_count = DataFrame.shape[0]

    # An empty cluster has no average; report 0.0 instead of dividing by zero.
    if row_count == 0:
        return 0.0

    # One vectorised sum replaces the nested per-cell iloc loop.
    feature_total = DataFrame.iloc[:, :-1].to_numpy().sum()

    return float(feature_total / row_count)

In [17]:
# Split the clustered rows by assigned cluster id and score each cluster.
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM is treated as the defective one. Cluster ids
# are arbitrary and can swap between runs, so the mapping is derived from the
# two scores instead of being hard-coded.
defect_cluster = 0 if ASFM0 > ASFM1 else 1


# Only the first n predictions are scored against df's labels, as before; this
# assumes the resampled x keeps df's original rows first, in order - TODO confirm.
n,m = df.shape
actual = df.iloc[:, m-1]
actual_buggy = actual.eq(buggy).to_numpy()
actual_clean = actual.eq(clean).to_numpy()
predicted_buggy = kmeans_prediction[:n] == defect_cluster

# Confusion-matrix counts; any row that is not TP, FN or TN is counted as FP.
TP = int(np.sum(actual_buggy & predicted_buggy))
FN = int(np.sum(actual_buggy & ~predicted_buggy))
TN = int(np.sum(actual_clean & ~predicted_buggy))
FP = n - TP - FN - TN


#Matthew Correlation Coefficient
# Every ratio falls back to 0.0 when its denominator is zero (for example when
# no row is predicted defective) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F1-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 12571.072671992635
ASFM1 : 153418.66320445781
MCC : 0.40414495377177917
Accuracy : 0.8504672897196262
F-Score : 0.3846153846153846


In [19]:
from sklearn.cluster import MiniBatchKMeans

# Fixed seed so the mini-batch sampling and initialisation are reproducible;
# 42 is an arbitrary seed.
miniBatchKmeans_model = MiniBatchKMeans(n_clusters = 2, random_state = 42)

# Fit on the metric columns only: remove the cluster ids left by the previous
# model (errors='ignore' lets the cell run even when that column is absent).
x = x.drop(columns = ['cluster'], errors = 'ignore')
miniBatchKmeans_model.fit(x)
miniBatchKmeans_prediction = miniBatchKmeans_model.labels_
x['cluster'] = miniBatchKmeans_prediction


# Split the clustered rows by assigned cluster id and score each cluster.
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM is treated as the defective one. Cluster ids
# are arbitrary and can swap between runs, so the mapping is derived from the
# two scores instead of being hard-coded.
defect_cluster = 0 if ASFM0 > ASFM1 else 1


# Only the first n predictions are scored against df's labels, as before; this
# assumes the resampled x keeps df's original rows first, in order - TODO confirm.
n,m = df.shape
actual = df.iloc[:, m-1]
actual_buggy = actual.eq(buggy).to_numpy()
actual_clean = actual.eq(clean).to_numpy()
predicted_buggy = miniBatchKmeans_prediction[:n] == defect_cluster

# Confusion-matrix counts; any row that is not TP, FN or TN is counted as FP.
TP = int(np.sum(actual_buggy & predicted_buggy))
FN = int(np.sum(actual_buggy & ~predicted_buggy))
TN = int(np.sum(actual_clean & ~predicted_buggy))
FP = n - TP - FN - TN


#Matthew Correlation Coefficient
# Every ratio falls back to 0.0 when its denominator is zero (for example when
# no row is predicted defective) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 12571.072671992635
ASFM1 : 153418.66320445781
MCC : 0.40414495377177917
Accuracy : 0.8504672897196262
F-Score : 0.3846153846153846


In [21]:
from sklearn.cluster import AgglomerativeClustering

ag_model = AgglomerativeClustering(n_clusters = 2)

# Fit on the metric columns only: remove the cluster ids left by the previous
# model (errors='ignore' lets the cell run even when that column is absent).
x = x.drop(columns = ['cluster'], errors = 'ignore')
ag_model.fit(x)
ag_prediction = ag_model.labels_
x['cluster'] = ag_prediction


# Split the clustered rows by assigned cluster id and score each cluster.
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM is treated as the defective one. Cluster ids
# are arbitrary, so the mapping is derived from the two scores instead of
# being hard-coded (the previous comment and code disagreed on which id it was).
defect_cluster = 0 if ASFM0 > ASFM1 else 1


# Only the first n predictions are scored against df's labels, as before; this
# assumes the resampled x keeps df's original rows first, in order - TODO confirm.
n,m = df.shape
actual = df.iloc[:, m-1]
actual_buggy = actual.eq(buggy).to_numpy()
actual_clean = actual.eq(clean).to_numpy()
predicted_buggy = ag_prediction[:n] == defect_cluster

# Confusion-matrix counts; any row that is not TP, FN or TN is counted as FP.
TP = int(np.sum(actual_buggy & predicted_buggy))
FN = int(np.sum(actual_buggy & ~predicted_buggy))
TN = int(np.sum(actual_clean & ~predicted_buggy))
FP = n - TP - FN - TN


#Matthew Correlation Coefficient
# Every ratio falls back to 0.0 when its denominator is zero (for example when
# no row is predicted defective) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 153418.66320445781
ASFM1 : 12571.072671992635
MCC : 0.40414495377177917
Accuracy : 0.8504672897196262
F-Score : 0.3846153846153846


In [22]:
from sklearn.cluster import Birch

birch_model = Birch(threshold=0.01, n_clusters=2)

# Fit on the metric columns only: remove the cluster ids left by the previous
# model (errors='ignore' lets the cell run even when that column is absent).
x = x.drop(columns = ['cluster'], errors = 'ignore')
birch_model.fit(x)
birch_prediction = birch_model.labels_
x['cluster'] = birch_prediction


# Split the clustered rows by assigned cluster id and score each cluster.
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM is treated as the defective one. Cluster ids
# are arbitrary, so the mapping is derived from the two scores instead of
# being hard-coded (the previous comment and code disagreed on which id it was).
defect_cluster = 0 if ASFM0 > ASFM1 else 1


# Only the first n predictions are scored against df's labels, as before; this
# assumes the resampled x keeps df's original rows first, in order - TODO confirm.
n,m = df.shape
actual = df.iloc[:, m-1]
actual_buggy = actual.eq(buggy).to_numpy()
actual_clean = actual.eq(clean).to_numpy()
predicted_buggy = birch_prediction[:n] == defect_cluster

# Confusion-matrix counts; any row that is not TP, FN or TN is counted as FP.
TP = int(np.sum(actual_buggy & predicted_buggy))
FN = int(np.sum(actual_buggy & ~predicted_buggy))
TN = int(np.sum(actual_clean & ~predicted_buggy))
FP = n - TP - FN - TN


#Matthew Correlation Coefficient
# Every ratio falls back to 0.0 when its denominator is zero (for example when
# no row is predicted defective) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 153418.66320445781
ASFM1 : 12571.072671992635
MCC : 0.40414495377177917
Accuracy : 0.8504672897196262
F-Score : 0.3846153846153846


In [23]:
from sklearn.cluster import SpectralClustering

# Fixed seed so the clustering result is reproducible; 42 is an arbitrary seed.
SC_model = SpectralClustering(n_clusters = 2, random_state = 42)

# Fit on the metric columns only: remove the cluster ids left by the previous
# model (errors='ignore' lets the cell run even when that column is absent).
x = x.drop(columns = ['cluster'], errors = 'ignore')
SC_model.fit(x)

SC_prediction = SC_model.labels_
x['cluster'] = SC_prediction


# Split the clustered rows by assigned cluster id and score each cluster.
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")


# The cluster with the larger ASFM is treated as the defective one. Cluster ids
# are arbitrary, so the mapping is derived from the two scores instead of
# being hard-coded (the previous comment and code disagreed on which id it was).
defect_cluster = 0 if ASFM0 > ASFM1 else 1


# Only the first n predictions are scored against df's labels, as before; this
# assumes the resampled x keeps df's original rows first, in order - TODO confirm.
n,m = df.shape
actual = df.iloc[:, m-1]
actual_buggy = actual.eq(buggy).to_numpy()
actual_clean = actual.eq(clean).to_numpy()
predicted_buggy = SC_prediction[:n] == defect_cluster

# Confusion-matrix counts; any row that is not TP, FN or TN is counted as FP.
TP = int(np.sum(actual_buggy & predicted_buggy))
FN = int(np.sum(actual_buggy & ~predicted_buggy))
TN = int(np.sum(actual_clean & ~predicted_buggy))
FP = n - TP - FN - TN


#Matthew Correlation Coefficient
# Every ratio falls back to 0.0 when its denominator is zero (for example when
# no row is predicted defective) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F1-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")



ASFM0 : 30724.703775075275
ASFM1 : 682.2936639999999
MCC : 0.06617222226472869
Accuracy : 0.205607476635514
F-Score : 0.32


In [24]:
from sklearn.mixture import GaussianMixture

# Fixed seed so the mixture initialisation, and therefore the component ids,
# are reproducible; 42 is an arbitrary seed.
gm_model = GaussianMixture(n_components = 2, random_state = 42)

# Fit on the metric columns only: remove the cluster ids left by the previous
# model (errors='ignore' lets the cell run even when that column is absent).
x = x.drop(columns = ['cluster'], errors = 'ignore')
gm_model.fit(x)
gm_prediction = gm_model.predict(x)
x['cluster'] = gm_prediction


# Split the clustered rows by assigned component id and score each group.
df0 = x[x.cluster==0]
df1 = x[x.cluster==1]

ASFM0 = ASFM(df0)
ASFM1 = ASFM(df1)

print(f"ASFM0 : {ASFM0}")
print(f"ASFM1 : {ASFM1}")

# The cluster with the larger ASFM is treated as the defective one. Component
# ids are arbitrary and can swap between runs, so the mapping is derived from
# the two scores instead of being hard-coded.
defect_cluster = 0 if ASFM0 > ASFM1 else 1


# Only the first n predictions are scored against df's labels, as before; this
# assumes the resampled x keeps df's original rows first, in order - TODO confirm.
n,m = df.shape
actual = df.iloc[:, m-1]
actual_buggy = actual.eq(buggy).to_numpy()
actual_clean = actual.eq(clean).to_numpy()
predicted_buggy = gm_prediction[:n] == defect_cluster

# Confusion-matrix counts; any row that is not TP, FN or TN is counted as FP.
TP = int(np.sum(actual_buggy & predicted_buggy))
FN = int(np.sum(actual_buggy & ~predicted_buggy))
TN = int(np.sum(actual_clean & ~predicted_buggy))
FP = n - TP - FN - TN


#Matthew Correlation Coefficient
# Every ratio falls back to 0.0 when its denominator is zero (for example when
# no row is predicted defective) instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC = (TP*TN-FP*FN)/mcc_denominator if mcc_denominator else 0.0
accuracy = (TP+TN)/len(df)
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
#F-Score
F = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0

print(f"MCC : {MCC}")
print(f"Accuracy : {accuracy}")
print(f"F-Score : {F}")

ASFM0 : 12571.072671992635
ASFM1 : 153418.66320445781
MCC : 0.40414495377177917
Accuracy : 0.8504672897196262
F-Score : 0.3846153846153846
