In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('adult_income.csv')

In [3]:
df.shape

(19240, 15)

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0.0,40.0,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0.0,50.0,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0.0,40.0,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0.0,40.0,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0.0,30.0,United-States,<=50K


In [5]:
df.duplicated().sum()

6

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


In [8]:
df.dropna(inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19233 entries, 0 to 19238
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              19233 non-null  int64  
 1   workclass        19233 non-null  object 
 2   fnlwgt           19233 non-null  int64  
 3   education        19233 non-null  object 
 4   educational-num  19233 non-null  int64  
 5   marital-status   19233 non-null  object 
 6   occupation       19233 non-null  object 
 7   relationship     19233 non-null  object 
 8   race             19233 non-null  object 
 9   gender           19233 non-null  object 
 10  capital-gain     19233 non-null  int64  
 11  capital-loss     19233 non-null  float64
 12  hours-per-week   19233 non-null  float64
 13  native-country   19233 non-null  object 
 14  income           19233 non-null  object 
dtypes: float64(2), int64(4), object(9)
memory usage: 2.3+ MB


In [10]:
cat_col = df.select_dtypes(include='object').columns

In [11]:
for col in cat_col:
    print(f'{col} : {df[col].unique()}')
    print('-----'*5)

workclass : ['Private' 'Local-gov' '?' 'Self-emp-not-inc' 'Federal-gov' 'State-gov'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
-------------------------
education : ['11th' 'HS-grad' 'Assoc-acdm' 'Some-college' '10th' 'Prof-school'
 '7th-8th' 'Bachelors' 'Masters' 'Doctorate' '5th-6th' 'Assoc-voc' '9th'
 '12th' '1st-4th' 'Preschool']
-------------------------
marital-status : ['Never-married' 'Married-civ-spouse' 'Widowed' 'Divorced' 'Separated'
 'Married-spouse-absent' 'Married-AF-spouse']
-------------------------
occupation : ['Machine-op-inspct' 'Farming-fishing' 'Protective-serv' '?'
 'Other-service' 'Prof-specialty' 'Craft-repair' 'Adm-clerical'
 'Exec-managerial' 'Tech-support' 'Sales' 'Priv-house-serv'
 'Transport-moving' 'Handlers-cleaners' 'Armed-Forces']
-------------------------
relationship : ['Own-child' 'Husband' 'Not-in-family' 'Unmarried' 'Wife' 'Other-relative']
-------------------------
race : ['Black' 'White' 'Asian-Pac-Islander' 'Other' 'Amer-Indian-Eskimo']
---

In [12]:
index_to_drop = df[(df['workclass'] == '?') | (df['occupation'] == '?') | (df['native-country'] == '?')].index

In [13]:
df.drop(index_to_drop, inplace=True)

In [14]:
from sklearn.preprocessing import LabelEncoder
for col in cat_col:
    if col != 'education':
      le = LabelEncoder()
      df[col] = le.fit_transform(df[col])

In [15]:
df['education'].unique()

array(['11th', 'HS-grad', 'Assoc-acdm', 'Some-college', '10th',
       'Prof-school', '7th-8th', 'Bachelors', 'Masters', '5th-6th',
       'Assoc-voc', '9th', 'Doctorate', '12th', '1st-4th', 'Preschool'],
      dtype=object)

In [16]:
from sklearn.preprocessing import OrdinalEncoder
df['education'] = OrdinalEncoder(categories=[[
    'Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th',
    'HS-grad', 'Some-college', 'Assoc-acdm', 'Assoc-voc',
    'Bachelors', 'Masters', 'Prof-school', 'Doctorate'
]]).fit_transform(df[['education']])

In [17]:
df.head()
#income '<=50k'-0, '>50k'-1

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,2,226802,6.0,7,4,6,3,2,1,0,0.0,40.0,37,0
1,38,2,89814,8.0,9,2,4,0,4,1,0,0.0,50.0,37,0
2,28,1,336951,10.0,12,2,10,0,4,1,0,0.0,40.0,37,1
3,44,2,160323,9.0,10,2,6,0,2,1,7688,0.0,40.0,37,1
5,34,2,198693,5.0,6,4,7,1,4,1,0,0.0,30.0,37,0


In [18]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train.shape, X_test.shape

((14215, 14), (3554, 14))

In [21]:
from sklearn.svm import SVC

In [22]:
model = SVC()

In [23]:
model.fit(X_train, y_train)

In [36]:
y_pred = model.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score

In [38]:
print(accuracy_score(y_test, y_pred))

0.775
