In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv("SalaryData_Train (1).csv")

In [3]:
df.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.shape

(30161, 14)

In [5]:
df.describe()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
count,30161.0,30161.0,30161.0,30161.0,30161.0
mean,38.438115,10.121316,1092.044064,88.302311,40.931269
std,13.13483,2.550037,7406.466611,404.121321,11.980182
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,47.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [7]:
df.nunique()

age               72
workclass          7
education         16
educationno       16
maritalstatus      7
occupation        14
relationship       6
race               5
sex                2
capitalgain      118
capitalloss       90
hoursperweek      94
native            40
Salary             2
dtype: int64

In [8]:
df.columns

Index(['age', 'workclass', 'education', 'educationno', 'maritalstatus',
       'occupation', 'relationship', 'race', 'sex', 'capitalgain',
       'capitalloss', 'hoursperweek', 'native', 'Salary'],
      dtype='object')

#### Checking the duplicates from data :

In [9]:
df.duplicated().sum()

3258

data has 930 duplicate values 

#### Removing this duplicates 

In [10]:
df.drop_duplicates(inplace = True)

In [11]:
df.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [12]:
df['Salary'].unique()

array([' <=50K', ' >50K'], dtype=object)

#### Splitting Data Into Independent and Target Features 

In [13]:
x = df.drop('Salary', axis = 'columns')
y = df["Salary"]

Seperating x into categorical and Numerical features :

In [14]:
cat_col = df.select_dtypes(include = 'object')
num_col = df.select_dtypes(exclude = 'object')

In [15]:
cat_col.head()

Unnamed: 0,workclass,education,maritalstatus,occupation,relationship,race,sex,native,Salary
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


#### Encoding the Categorical Features 

In [16]:
ohe = OneHotEncoder(drop = 'first')

In [17]:
data1 = ohe.fit_transform(cat_col).toarray()

In [18]:
data1 = pd.DataFrame(data1, columns = ohe.get_feature_names_out(cat_col.columns))

Encoded Data 

In [19]:
data1.head()

Unnamed: 0,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,...,native_ Puerto-Rico,native_ Scotland,native_ South,native_ Taiwan,native_ Thailand,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia,Salary_ >50K
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
num_col.head()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
0,39,13,2174,0,40
1,50,13,0,0,13
2,38,9,0,0,40
3,53,7,0,0,40
4,28,13,0,0,40


In [21]:
sc = StandardScaler()

In [22]:
data2 = sc.fit_transform(num_col)

In [23]:
data2

array([[-0.00372591,  1.08991936,  0.12280968, -0.23132996, -0.09432316],
       [ 0.83032903,  1.08991936, -0.15572182, -0.23132996, -2.28112073],
       [-0.07954909, -0.43673666, -0.15572182, -0.23132996, -0.09432316],
       ...,
       [-0.91360403,  0.70825535, -0.15572182, -0.23132996, -0.25630817],
       [ 1.43691444, -0.43673666, -0.15572182, -0.23132996, -0.09432316],
       [ 0.98197538, -0.43673666,  1.76914352, -0.23132996, -0.09432316]])

In [24]:
data2 = pd.DataFrame(data2, columns = sc.get_feature_names_out(num_col.columns))

In [25]:
data2.head()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
0,-0.003726,1.089919,0.12281,-0.23133,-0.094323
1,0.830329,1.089919,-0.155722,-0.23133,-2.281121
2,-0.079549,-0.436737,-0.155722,-0.23133,-0.094323
3,1.057799,-1.200065,-0.155722,-0.23133,-0.094323
4,-0.837781,1.089919,-0.155722,-0.23133,-0.094323


In [26]:
X = pd.concat([data1, data2], axis ='columns')

In [27]:
X.head()

Unnamed: 0,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,...,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia,Salary_ >50K,age,educationno,capitalgain,capitalloss,hoursperweek
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,-0.003726,1.089919,0.12281,-0.23133,-0.094323
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.830329,1.089919,-0.155722,-0.23133,-2.281121
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,-0.079549,-0.436737,-0.155722,-0.23133,-0.094323
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.057799,-1.200065,-0.155722,-0.23133,-0.094323
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.837781,1.089919,-0.155722,-0.23133,-0.094323


Too many columns are there so we will use Dimensionality Reducation

Converting 95 columsn to 4 

In [28]:
dr = PCA(n_components=4)

In [29]:
X = dr.fit_transform(X)

In [30]:
X

array([[ 0.45467932, -1.01871553,  0.06777819,  0.34568787],
       [ 0.18137605,  0.09456985,  0.10953597,  2.22122315],
       [-0.57513355,  0.2980728 ,  0.15889976, -0.24257943],
       ...,
       [-0.19644719, -0.93990156, -0.09192347,  0.08229911],
       [-0.00291227,  1.42509511,  0.37435268,  0.30523498],
       [ 1.09254596,  0.85255593,  1.42097247,  0.76025237]])

Now Independent Features are ready 

#### For dependent features

In [31]:
enc = LabelEncoder()

In [32]:
Y = enc.fit_transform(y)

In [33]:
Y

array([0, 0, 0, ..., 0, 0, 1])

Both Dependent and independent features are ready Now

Splitting the dataset into training and testing data 

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25)

Data is ready 

#### Modelling 

In [35]:
model = SVC()

In [36]:
model.fit(X_train, y_train)

In [37]:
pred = model.predict(X_test)

Accuracy Of Model :

In [38]:
accuracy_score(pred, y_test)

0.840321141837645

In [39]:
confusion_matrix(pred, y_test)

array([[4735,  813],
       [ 261,  917]], dtype=int64)