In [1]:
#import lib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [11]:
df=pd.read_csv('../input/software-defect-prediction/jm1.csv')

df['uniq_Op'] = pd.to_numeric(df['uniq_Op'], errors='coerce') ## convert to number, make NaNs from ?s 
df['uniq_Opnd'] = pd.to_numeric(df['uniq_Opnd'], errors='coerce') ## convert to number, make NaNs from ?s 
df['total_Op'] = pd.to_numeric(df['total_Op'], errors='coerce') ## convert to number, make NaNs from ?s 
df['total_Opnd'] = pd.to_numeric(df['total_Opnd'], errors='coerce') ## convert to number, make NaNs from ?s 
df['branchCount'] = pd.to_numeric(df['branchCount'], errors='coerce') ## convert to number, make NaNs from ?s 
df['defects'] = df['defects'].astype(int)

df = df.dropna()
df = df.reset_index(drop=True)


In [12]:
from sklearn import model_selection

# Split-out validation dataset
array = df.values
X = array[:,0:21]
Y = array[:,21]


# Get Training and Validation sets
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=0.2, random_state=7)


In [13]:
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_validation=sc.fit_transform(X_validation)

In [14]:
from sklearn.svm import SVC
classifier=SVC(kernel='rbf',random_state=0)
%time classifier.fit(X_train,Y_train)

#predicting
y_pred=classifier.predict(X_validation)


CPU times: user 3.39 s, sys: 249 ms, total: 3.64 s
Wall time: 3.64 s


In [16]:
from sklearn.metrics import accuracy_score
y_pred_train=classifier.predict(X_train)

train_acc = accuracy_score(Y_train,y_pred_train)
test_acc = accuracy_score(Y_validation,y_pred)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))


Train: 0.822, Test: 0.830


In [17]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
#print("1. Accuracy: {}".format(accuracy_score(Y_validation, y_pred)))
#print("2. Confusion Matrix:n{}".format(pd.crosstab(Y_validation, y_pred, rownames=['True'], colnames=['Predicted'])))
#f1_score(Y_validation, y_pred, average='weighted')
print('Accuracy Score : ' + str(accuracy_score(Y_validation,y_pred)))
print('Precision Score : ' + str(precision_score(Y_validation,y_pred)))
print('Recall Score : ' + str(recall_score(Y_validation,y_pred)))
print('F1 Score : ' + str(f1_score(Y_validation,y_pred)))

from sklearn.metrics import confusion_matrix
print('Confusion Matrix : \n' + str(confusion_matrix(Y_validation,y_pred)))


Accuracy Score : 0.8299632352941176
Precision Score : 0.5789473684210527
Recall Score : 0.14285714285714285
F1 Score : 0.22916666666666666
Confusion Matrix : 
[[1751   40]
 [ 330   55]]


In [18]:
pred_decision = classifier.decision_function(X_validation)

In [20]:
from sklearn.metrics import hinge_loss
hloss = hinge_loss(Y_validation,pred_decision)
print(hloss)

0.3562357963117244


In [21]:
from sklearn.metrics import mean_squared_error
mean_squared_error(Y_validation,y_pred) 

0.17003676470588236

Differential Evolution implementation

In [36]:
from scipy.optimize import differential_evolution

svc_alpha_lim = (0.1,10.0)
svc_gamma_lim = (0.1,10.0)
boundaries = [svc_alpha_lim,svc_gamma_lim] #[svc_alpha_lim] + [svc_gamma_lim]

#c_vector = [1e-2,1e-1,1,1e1,0.009,0.01,0.09,5,10,25,100]
#gamma_vector = [0.1,0.01,1e-5,1e-4,1e-3]
#hyperparams = [c_vector,gamma_vector]

def svc_function(hyperparams):
    alpha_value,gamma_value = hyperparams
    # Split data into test and train: random state fixed for reproducibility
   # kf = KFold(n_splits=10,shuffle=True,random_state=2020)
    svc_temp=SVC(kernel='rbf',random_state=0,C=alpha_value,gamma = gamma_value)
    svc_temp.fit(X_train,Y_train)
    y_pred_temp=svc_temp.predict(X_validation)
    
    rmse = 1-accuracy_score(Y_validation, y_pred_temp)
    return rmse
   


solver = differential_evolution(svc_function,boundaries,strategy='best1bin', workers=-1)
# Calculate best hyperparameters and resulting rmse
best_hyperparams = solver.x
best_rmse = solver.fun
# Print final results
print("Converged hyperparameters: alpha= %.6f, gamma= %.6f" %(best_hyperparams[0],best_hyperparams[1]))    
print("Minimum rmse: %.6f" %(best_rmse))



Converged hyperparameters: alpha= 1.462757, gamma= 9.334614
Minimum rmse: 0.176471


In [34]:
# when c ranegs from 1 to 2 and gamma ranges from 1 to 2, C=1.052822,gamma = 1.742512
svc_after_de=SVC(kernel='rbf',random_state=0,C=1.462757,gamma = 9.334614)
svc_after_de.fit(X_train,Y_train)
y_pred_after_de=svc_after_de.predict(X_validation)

previous one when range from 1 to 2 for both c and gamma
Accuracy Score : 0.8230698529411765
Precision Score : 0.5
Recall Score : 0.04155844155844156
F1 Score : 0.07673860911270983
Confusion Matrix : 
[[1775   16]
[ 369   16]]

In [35]:
print('Accuracy Score : ' + str(accuracy_score(Y_validation,y_pred_after_de)))
print('Precision Score : ' + str(precision_score(Y_validation,y_pred_after_de)))
print('Recall Score : ' + str(recall_score(Y_validation,y_pred_after_de)))
print('F1 Score : ' + str(f1_score(Y_validation,y_pred_after_de)))

from sklearn.metrics import confusion_matrix
print('Confusion Matrix : \n' + str(confusion_matrix(Y_validation,y_pred_after_de)))


Accuracy Score : 0.8230698529411765
Precision Score : 0.5
Recall Score : 0.04155844155844156
F1 Score : 0.07673860911270983
Confusion Matrix : 
[[1775   16]
 [ 369   16]]
