In [None]:
# Prepare a classification model using SVM for salary data 

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

In [2]:
# SVM lib
from sklearn import svm
from sklearn.svm import SVC

In [3]:
# split the data

from sklearn.model_selection import train_test_split

# Gridsearch
from sklearn.model_selection import GridSearchCV

In [4]:
# metrics

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [5]:
from sklearn.preprocessing import LabelEncoder


In [6]:
import warnings
warnings.filterwarnings('ignore')

## Preprocessing the Data

In [7]:
test_data = pd.read_csv('SalaryData_Test(1).csv')
train_data = pd.read_csv('SalaryData_Train(1).csv')

In [8]:
test_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [9]:
train_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [11]:
label=['workclass','education','maritalstatus','occupation','relationship','race','sex','native']

In [12]:
# changing the categorical data in numerical
label_encode = LabelEncoder()

for i in label:
    train_data[i] = label_encode.fit_transform(train_data[i])
    test_data[i] = label_encode.fit_transform(test_data[i])

In [None]:
# lalbeling done

In [13]:
# splitting the data - 

x_train = train_data.iloc[:1000,0:13]
y_train = train_data.iloc[:1000,13]

x_test = test_data.iloc[:500,0:13]
y_test = test_data.iloc[:500,13]

## SVM - Linear Model

In [14]:
# creating the model
lin_model = SVC(kernel='linear')

#fiting the model
lin_model.fit(x_train,y_train)

# predicting on training Data
pred_train_linear = lin_model.predict(x_train)

# Accuracy & Confusion Matix over the Training data

print(f'The Accuracy of Linear Model over Training data is = {accuracy_score(y_train,pred_train_linear)*100}%')

print(f'Confusion Matrix is \n {confusion_matrix(y_train,pred_train_linear)}')


The Accuracy of Linear Model over Training data is = 80.2%
Confusion Matrix is 
 [[709  47]
 [151  93]]


In [15]:
# predicting on testing Data
pred_test_linear = lin_model.predict(x_test)

# Accuracy & Confusion Matix over the Testing data

print(f'The Accuracy of Linear Model over Testing data is = {accuracy_score(y_test,pred_test_linear)*100}%')

print(f'Confusion Matrix is \n {confusion_matrix(y_test,pred_test_linear)}')


The Accuracy of Linear Model over Testing data is = 80.0%
Confusion Matrix is 
 [[354  28]
 [ 72  46]]


## SVM - Polynomial

In [16]:
# creating the model
poly_model = SVC(kernel='poly')

#fiting the model
poly_model.fit(x_train,y_train)

# predicting on training Data
pred_train_poly = poly_model.predict(x_train)

# Accuracy & Confusion Matix over the Training data

print(f'The Accuracy of Polynomial Model over Training data is = {accuracy_score(y_train,pred_train_poly)*100}%')

print(f'Confusion Matrix is \n {confusion_matrix(y_train,pred_train_poly)}')


The Accuracy of Polynomial Model over Training data is = 79.0%
Confusion Matrix is 
 [[749   7]
 [203  41]]


In [17]:
# predicting on testing Data
pred_test_poly = poly_model.predict(x_test)

# Accuracy & Confusion Matix over the Testing data

print(f'The Accuracy of Polynomial Model over Testing data is = {accuracy_score(y_test,pred_test_poly)*100}%')

print(f'Confusion Matrix is \n {confusion_matrix(y_test,pred_test_poly)}')

The Accuracy of Polynomial Model over Testing data is = 80.60000000000001%
Confusion Matrix is 
 [[381   1]
 [ 96  22]]


## RBF - Exponential

In [18]:
# finding optimal no of dimension

rbf_model = SVC()

param = [{'kernel':['rbf'],'C':[15,14,13, 12, 11, 10,5, 0.1],'gamma':[5, 50, 10, 0.5]}]

grid = GridSearchCV(estimator=rbf_model,param_grid=param)

grid.fit(x_train,y_train)

print(f'The best Score is {np.round(grid.best_score_,4)}')
print(f'The Best Parameter for RBF is = {grid.best_params_}')

The best Score is 0.756
The Best Parameter for RBF is = {'C': 15, 'gamma': 5, 'kernel': 'rbf'}


In [19]:
# Model of Expontial form - e^-x
svm_rbf_model = SVC(kernel='rbf',C =15, gamma=5)

# fitting the Data
svm_rbf_model.fit(x_train,y_train)

# prediction on train Data
pred_rbf = svm_rbf_model.predict(x_train)

# Accuracy & Confusion Matix over the Training data

print(f'The Accuracy over Training data is = {accuracy_score(y_train,pred_rbf)*100}%')

print(f'Confusion Matrix is \n {confusion_matrix(y_train,pred_rbf)}')

The Accuracy over Training data is = 99.6%
Confusion Matrix is 
 [[756   0]
 [  4 240]]


In [20]:
# prediction on testing Data
pred_rbf_test = svm_rbf_model.predict(x_test)

# Accuracy & Confusion Matix over the Testing data

print(f'The Accuracy over Testing data is = {accuracy_score(y_test,pred_rbf_test)*100}%')

print(f'Confusion Matrix is \n {confusion_matrix(y_test,pred_rbf_test)}')

The Accuracy over Testing data is = 76.4%
Confusion Matrix is 
 [[382   0]
 [118   0]]
