In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('voice.csv')

In [3]:
data.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


#### SVM is a distence based model- scaling will be performed.
#### Target value is categorical so we will have to encode it.

In [4]:
data.shape

(3168, 21)

In [5]:
data['label'].unique()

array(['male', 'female'], dtype=object)

In [6]:
print('Total no of rows: %d'%data.shape[0])
print('Total no of male: %d'%data[data['label']=='male'].shape[0])
print('Total no of female: %d'%data[data['label']=='female'].shape[0])

Total no of rows: 3168
Total no of male: 1584
Total no of female: 1584


## FEATURE EXTRACTION

In [7]:
x=data.iloc[:,:-1]
y=data.iloc[:,-1:]

## Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder
gender_encoder=LabelEncoder()
y=gender_encoder.fit_transform(y)

### Train_test_split

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=5)

In [10]:
# SCale the data from -1 to 1
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

#### SVM with default hyperparameter

In [12]:
from sklearn.svm import SVC  # support vector classifier
from sklearn import metrics  # default kernel is rbf- radial basis function
svc=SVC(random_state=5)
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print("Accuracy Score:")
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.9810725552050473


### Identify the default value

In [13]:
svc.gamma

# scale=1/(number of features*ariance)

'scale'

In [14]:
svc.kernel

'rbf'

In [15]:
svc.C

1.0

### Optimaize Parameter from GridSearchCV

In [17]:
tuned_parameter={'C':[0.1,1,10,100,1000],
                'gamma':[1,0.1,0.01,0.001,0.0001],
                'kernel':['linear','rbf','ploy']}

In [18]:
from sklearn.model_selection import GridSearchCV
model_svm=GridSearchCV(svc,tuned_parameter,cv=10,scoring='accuracy',
                      verbose=3, n_jobs= -1)

In [20]:
model_svm.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


In [21]:
print(model_svm.best_params_)

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


In [22]:
# when kernel is ploy, we also can also mention the degree of polynomial we want
# svc= SVC(kernel='poly', C=0.99,degree=3, gamma=0.05)
# svc.fit(X_train,y_train)
# y_pred=svc.predict(X_test)
# print("Accuracy Score:")
# print(metrics.accuracy_score(y_test,y_pred)) 

svc=SVC(kernel='rbf',C=1, gamma=0.1)
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print("Accuracy Score:")
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.9842271293375394


In [23]:
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       314
           1       0.99      0.98      0.98       320

    accuracy                           0.98       634
   macro avg       0.98      0.98      0.98       634
weighted avg       0.98      0.98      0.98       634



In [24]:
auc=roc_auc_score(y_test,y_pred)

In [25]:
auc

0.9842555732484076