In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv("SalaryData_Test (1).csv")

In [3]:
df.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [40]:
df.isna().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [41]:
df.isnull().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [4]:
df.shape

(15060, 14)

In [5]:
df.describe()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
count,15060.0,15060.0,15060.0,15060.0,15060.0
mean,38.768327,10.112749,1120.301594,89.041899,40.951594
std,13.380676,2.558727,7703.181842,406.283245,12.062831
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,3770.0,99.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15060 entries, 0 to 15059
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            15060 non-null  int64 
 1   workclass      15060 non-null  object
 2   education      15060 non-null  object
 3   educationno    15060 non-null  int64 
 4   maritalstatus  15060 non-null  object
 5   occupation     15060 non-null  object
 6   relationship   15060 non-null  object
 7   race           15060 non-null  object
 8   sex            15060 non-null  object
 9   capitalgain    15060 non-null  int64 
 10  capitalloss    15060 non-null  int64 
 11  hoursperweek   15060 non-null  int64 
 12  native         15060 non-null  object
 13  Salary         15060 non-null  object
dtypes: int64(5), object(9)
memory usage: 1.6+ MB


In [7]:
df.nunique()

age               73
workclass          7
education         16
educationno       16
maritalstatus      7
occupation        14
relationship       6
race               5
sex                2
capitalgain      110
capitalloss       79
hoursperweek      89
native            40
Salary             2
dtype: int64

In [8]:
df.columns

Index(['age', 'workclass', 'education', 'educationno', 'maritalstatus',
       'occupation', 'relationship', 'race', 'sex', 'capitalgain',
       'capitalloss', 'hoursperweek', 'native', 'Salary'],
      dtype='object')

#### Checking the duplicates from data :

In [9]:
df.duplicated().sum()

930

data has 930 duplicate values 

#### Removing this duplicates 

In [10]:
df.drop_duplicates(inplace = True)

In [11]:
df.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [12]:
df['Salary'].unique()

array([' <=50K', ' >50K'], dtype=object)

#### Splitting Data Into Independent and Target Features 

In [13]:
x = df.drop('Salary', axis = 'columns')
y = df["Salary"]

Seperating x into categorical and Numerical features :

In [14]:
cat_col = df.select_dtypes(include = 'object')
num_col = df.select_dtypes(exclude = 'object')

In [15]:
cat_col.head()

Unnamed: 0,workclass,education,maritalstatus,occupation,relationship,race,sex,native,Salary
0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,<=50K
1,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,<=50K
2,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,>50K
3,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States,>50K
4,Private,10th,Never-married,Other-service,Not-in-family,White,Male,United-States,<=50K


#### Encoding the Categorical Features 

In [16]:
ohe = OneHotEncoder(drop = 'first')

In [17]:
data1 = ohe.fit_transform(cat_col).toarray()

In [18]:
data1 = pd.DataFrame(data1, columns = ohe.get_feature_names_out(cat_col.columns))

Encoded Data 

In [19]:
data1.head()

Unnamed: 0,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,...,native_ Puerto-Rico,native_ Scotland,native_ South,native_ Taiwan,native_ Thailand,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia,Salary_ >50K
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
num_col.head()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
0,25,7,0,0,40
1,38,9,0,0,50
2,28,12,0,0,40
3,44,10,7688,0,40
4,34,6,0,0,30


In [21]:
sc = StandardScaler()

In [22]:
data2 = sc.fit_transform(num_col)

In [23]:
data2

array([[-1.05408752, -1.19914608, -0.15005852, -0.22610763, -0.08753833],
       [-0.08603828, -0.43064399, -0.15005852, -0.22610763,  0.72761694],
       [-0.83069154,  0.72210915, -0.15005852, -0.22610763, -0.08753833],
       ...,
       [-0.08603828,  1.10636019, -0.15005852, -0.22610763,  0.72761694],
       [ 0.36075367,  1.10636019,  0.54019335, -0.22610763, -0.08753833],
       [-0.30943426,  1.10636019, -0.15005852, -0.22610763,  1.54277221]])

In [24]:
data2 = pd.DataFrame(data2, columns = sc.get_feature_names_out(num_col.columns))

In [25]:
data2.head()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
0,-1.054088,-1.199146,-0.150059,-0.226108,-0.087538
1,-0.086038,-0.430644,-0.150059,-0.226108,0.727617
2,-0.830692,0.722109,-0.150059,-0.226108,-0.087538
3,0.360754,-0.046393,0.822747,-0.226108,-0.087538
4,-0.3839,-1.583397,-0.150059,-0.226108,-0.902694


In [26]:
X = pd.concat([data1, data2], axis ='columns')

In [27]:
X.head()

Unnamed: 0,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,...,native_ Trinadad&Tobago,native_ United-States,native_ Vietnam,native_ Yugoslavia,Salary_ >50K,age,educationno,capitalgain,capitalloss,hoursperweek
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,-1.054088,-1.199146,-0.150059,-0.226108,-0.087538
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,-0.086038,-0.430644,-0.150059,-0.226108,0.727617
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,-0.830692,0.722109,-0.150059,-0.226108,-0.087538
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.360754,-0.046393,0.822747,-0.226108,-0.087538
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,-0.3839,-1.583397,-0.150059,-0.226108,-0.902694


Too many columns are there so we will use Dimensionality Reducation

Converting 95 columsn to 4 

In [28]:
dr = PCA(n_components=4)

In [29]:
X = dr.fit_transform(X)

In [30]:
X

array([[-1.63139662, -0.15367389,  0.14565882,  0.69560445],
       [ 0.06577769,  0.62893992,  0.13940582,  0.93419241],
       [ 0.3036335 , -0.80121269, -0.01125137,  0.1674419 ],
       ...,
       [ 1.04766802, -0.69555925, -0.03433937,  0.47144708],
       [ 0.69640244, -0.74955353,  0.41113076, -0.54074368],
       [ 1.65029458, -0.7002191 , -0.04262679,  1.23476514]])

Now Independent Features are ready 

#### For dependent features

In [31]:
enc = LabelEncoder()

In [32]:
Y = enc.fit_transform(y)

In [33]:
Y

array([0, 0, 1, ..., 0, 0, 1])

Both Dependent and independent features are ready Now

Splitting the dataset into training and testing data 

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25)

Data is ready 

#### Modelling 

In [35]:
model = SVC()

In [36]:
model.fit(X_train, y_train)

In [37]:
pred = model.predict(X_test)

Accuracy Of Model :

In [38]:
accuracy_score(pred, y_test)

0.855080667987546

In [39]:
confusion_matrix(pred, y_test)

array([[2562,  394],
       [ 118,  459]], dtype=int64)