In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
df = pd.read_csv('income_evaluation.csv')  # Or use your actual DataFrame if already loaded
print(df.head())
print(df.info())

   age          workclass   fnlwgt   education   education-num  \
0   39          State-gov    77516   Bachelors              13   
1   50   Self-emp-not-inc    83311   Bachelors              13   
2   38            Private   215646     HS-grad               9   
3   53            Private   234721        11th               7   
4   28            Private   338409   Bachelors              13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

    capital-gain   capital-loss   hours-per-week  native-country  income  
0           2174              0               40   Un

In [3]:
print(df.columns)

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')


In [4]:
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces from column names
np.unique(df['income'])


array([' <=50K', ' >50K'], dtype=object)

In [5]:
df.drop("fnlwgt", axis=1, inplace=True, errors='ignore')

In [6]:
col_name=['age', ' workclass', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income']
df.colums=col_name
df.columns

  df.colums=col_name


Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income'],
      dtype='object')

In [7]:
df.loc[df['workclass']=='?']

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [8]:
df.loc[df['workclass']=='?','workclass'] = np.nan
df.loc[df['occupation']=='?','occupation'] = np.nan 
df.loc[df['native-country']=='?','native-country'] = np.nan

In [9]:
df.isnull().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [10]:
df.dropna(inplace=True) #dropping null values

In [11]:
#to change our income column datatype to int
df['income'] = df['income'].map({' <=50K':0, ' >50K':1})
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [12]:
df.income.value_counts()

income
0    24720
1     7841
Name: count, dtype: int64

In [13]:
#balance income column with equal number of 0 and 1
df = df.groupby('income').apply(lambda x: x.sample(df.income.value_counts().min(), replace=False)).reset_index(drop=True)
df.income.value_counts()


  df = df.groupby('income').apply(lambda x: x.sample(df.income.value_counts().min(), replace=False)).reset_index(drop=True)


income
0    7841
1    7841
Name: count, dtype: int64

In [14]:
cat_col=df.select_dtypes(include=['object']).columns #selecting categorical columns
cat_col

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

In [15]:
cat_col = df.select_dtypes(include="object")
cat_col.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,Private,HS-grad,Never-married,Other-service,Unmarried,Black,Female,United-States
1,Private,Masters,Divorced,Prof-specialty,Not-in-family,White,Male,United-States
2,Local-gov,Bachelors,Married-civ-spouse,Prof-specialty,Wife,White,Female,United-States
3,Private,HS-grad,Never-married,Transport-moving,Other-relative,White,Male,United-States
4,Private,Assoc-voc,Divorced,Craft-repair,Unmarried,White,Male,United-States


In [16]:
num_col=df.select_dtypes(exclude='object') #selecting numerical columns
num_col.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income
0,37,9,0,0,30,0
1,45,14,0,0,40,0
2,39,13,0,0,42,0
3,20,9,0,0,40,0
4,46,11,0,0,40,0


In [17]:
#label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_col_encoded = cat_col.apply(le.fit_transform)
cat_col_encoded

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,4,11,4,8,4,2,0,39
1,4,12,0,10,1,4,1,39
2,2,9,2,10,5,4,0,39
3,4,11,4,14,2,4,1,39
4,4,8,0,3,4,4,1,39
...,...,...,...,...,...,...,...,...
15677,4,15,2,3,0,4,1,39
15678,5,7,2,12,0,4,1,39
15679,4,11,2,3,0,4,1,39
15680,6,14,2,10,0,4,1,39


In [18]:
final_df = pd.concat([cat_col_encoded, num_col], axis=1)

In [19]:
x = final_df.drop("income",axis=1)
y = final_df["income"]

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [21]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [23]:
#svc
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_scaled,y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [24]:
svc.score(x_test_scaled, y_test)

0.8182977366911062

In [26]:
svc.score(x_train_scaled, y_train)

0.8332403347947389

In [None]:
from sklearn.model_selection import GridSearchCV
grid = {
    "C" : [0.01,0.1,1,10],
    "kernel" : ["linear","rbf","poly","sigmoid"],
    "degree" : [1,3,5,7],
    "gamma" : [0.01,1]
}
svm = SVC()
svm_cv = GridSearchCV(svm, grid, cv=5)
svm_cv.fit(x_train_scaled,y_train)