In [2]:
#import lib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
df = pd.read_csv('../input/software-defect-prediction/jm1.csv')

# Per the original note, these columns contain '?' placeholders; coerce them to
# numbers so that anything unparseable becomes NaN.
for column in ('uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Cast the label column to integers
df['defects'] = df['defects'].astype(int)

# Drop every row that still has a NaN, then renumber the index from 0
df = df.dropna().reset_index(drop=True)


In [7]:
from sklearn import model_selection

# Flatten the frame to one array: the first 21 columns become the inputs and
# column 21 the target (presumably the `defects` label - confirm column order).
array = df.to_numpy()
X, Y = array[:, :21], array[:, 21]

# Hold out 20% of the rows for validation; the seed fixes the shuffle
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=7,
)

In [8]:
from sklearn.svm import SVC
classifier=SVC(kernel='rbf',random_state=0)
%time classifier.fit(X_train,Y_train)

#predicting
y_pred=classifier.predict(X_validation)


CPU times: user 2.06 s, sys: 101 ms, total: 2.16 s
Wall time: 2.16 s


In [9]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix

# Score the validation predictions with each metric in turn; every label is
# the exact prefix that gets printed in front of the value.
for label, metric in (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('Recall Score : ', recall_score),
    ('F1 Score : ', f1_score),
):
    print(label + str(metric(Y_validation, y_pred)))

# Rows are the true classes, columns the predicted classes
print('Confusion Matrix : \n' + str(confusion_matrix(Y_validation, y_pred)))


Accuracy Score : 0.8226102941176471
Precision Score : 0.45454545454545453
Recall Score : 0.012987012987012988
F1 Score : 0.025252525252525252
Confusion Matrix : 
[[1785    6]
 [ 380    5]]


Grid Search Implementation:

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

classifier=SVC(kernel='rbf',random_state=0)
parameters={'C':[1e-2,1e-1,1,1e1,0.009,0.01,0.09,5,10,25],'kernel':['rbf'],'gamma':[0.1,0.01,1e-5,1e-4,1e-3]}
grid = GridSearchCV(classifier,parameters,n_jobs=-1,cv=3,scoring='accuracy')
%time grid = grid.fit(X_train,Y_train)
best_acc = grid.best_score_
best_param = grid.best_params_

CPU times: user 8.94 s, sys: 135 ms, total: 9.08 s
Wall time: 3min 16s


In [21]:
# Read the results straight from the fitted `grid` object. The shared names
# `best_acc` / `best_param` are reassigned by the random-search cell, so printing
# them here would report whichever search happened to run last.
print(grid.best_score_)
print(grid.best_params_)

# Per-candidate timings and fold scores
pd.DataFrame(grid.cv_results_)

0.8059514400200886
{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,3.674464,0.50931,1.501023,0.387501,0.01,0.1,rbf,"{'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
1,2.881842,0.225188,1.151705,0.024304,0.01,0.01,rbf,"{'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
2,1.802663,0.317237,0.775572,0.128992,0.01,1e-05,rbf,"{'C': 0.01, 'gamma': 1e-05, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
3,1.830529,0.075404,0.786479,0.010908,0.01,0.0001,rbf,"{'C': 0.01, 'gamma': 0.0001, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
4,2.735884,0.582229,1.157212,0.260447,0.01,0.001,rbf,"{'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
5,3.3337,0.041869,1.552585,0.312747,0.1,0.1,rbf,"{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
6,3.81557,0.406086,1.31198,0.192325,0.1,0.01,rbf,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
7,2.277103,0.344021,0.873853,0.193365,0.1,1e-05,rbf,"{'C': 0.1, 'gamma': 1e-05, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4
8,2.759888,0.522684,1.059011,0.235076,0.1,0.0001,rbf,"{'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.80255,0.802827,0.800414,0.80193,0.001078,28
9,3.056679,0.092893,1.129583,0.141846,0.1,0.001,rbf,"{'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}",0.80255,0.802827,0.802482,0.802619,0.000149,4


Random Search Implementation:

In [22]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVC
classifier=SVC(kernel='rbf',random_state=0)
parameters={'C':[1e-2,1e-1,1,1e1,0.009,0.01,0.09,5,10,25],'kernel':['rbf'],'gamma':[0.1,0.01,1e-5,1e-4,1e-3]}
random =RandomizedSearchCV(classifier,parameters,n_iter=10,cv=3,scoring='accuracy',n_jobs=-1)
%time random=random.fit(X_train,Y_train)
best_acc=random.best_score_
best_param=random.best_params_

CPU times: user 8.61 s, sys: 69.9 ms, total: 8.68 s
Wall time: 22.6 s


In [19]:
# Read the results straight from the fitted `random` search object. The shared
# names `best_acc` / `best_param` are also assigned by the grid-search cell, so
# printing them here would report whichever search happened to run last.
print(random.best_score_)
print(random.best_params_)

# Per-candidate timings and fold scores
pd.DataFrame(random.cv_results_)

0.8059514400200886


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_gamma,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,3.078562,0.129283,0.594812,0.003686,rbf,1e-05,10.0,"{'kernel': 'rbf', 'gamma': 1e-05, 'C': 10.0}",0.788766,0.790762,0.78697,0.788833,0.001549,4
1,1.981578,0.075742,0.796105,0.006507,rbf,0.0001,0.1,"{'kernel': 'rbf', 'gamma': 0.0001, 'C': 0.1}",0.80255,0.802827,0.800414,0.80193,0.001078,3
2,3.250725,0.375014,1.163609,0.010201,rbf,0.01,1.0,"{'kernel': 'rbf', 'gamma': 0.01, 'C': 1.0}",0.804618,0.807653,0.805584,0.805951,0.001266,1
3,2.672245,0.073355,0.865898,0.100789,rbf,0.001,1.0,"{'kernel': 'rbf', 'gamma': 0.001, 'C': 1.0}",0.804962,0.805929,0.804895,0.805262,0.000472,2
4,4.760247,0.291469,0.959716,0.004096,rbf,0.001,10.0,"{'kernel': 'rbf', 'gamma': 0.001, 'C': 10.0}",0.782908,0.785591,0.78697,0.785157,0.001686,5
