1.Importing the necessary libraries

In [None]:
from sklearn import datasets
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt
import numpy as np
import openpyxl


2.Load the dataset, the dataset used here is of the adult census data 

In [2]:
df = pd.read_csv("adult.csv")
print(df)

       age workclass  fnlwgt     education  education.num      marital.status  \
0       90         ?   77053       HS-grad              9             Widowed   
1       82   Private  132870       HS-grad              9             Widowed   
2       66         ?  186061  Some-college             10             Widowed   
3       54   Private  140359       7th-8th              4            Divorced   
4       41   Private  264663  Some-college             10           Separated   
...    ...       ...     ...           ...            ...                 ...   
32556   22   Private  310152  Some-college             10       Never-married   
32557   27   Private  257302    Assoc-acdm             12  Married-civ-spouse   
32558   40   Private  154374       HS-grad              9  Married-civ-spouse   
32559   58   Private  151910       HS-grad              9             Widowed   
32560   22   Private  201490       HS-grad              9       Never-married   

              occupation   

3. Removing the duplicate entries

In [3]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

4. Checking for missing values

In [4]:
missing_values = df.isnull().sum()
print(missing_values)

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64


5.Encoding categorical variables is a crucial step in preparing data for machine learning models. Machine learning algorithms typically require numerical input, so categorical data must be converted into a numerical format.

In [27]:

label_encoders = {}
df_cleaned=df.copy()
for column in df_cleaned.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_cleaned[column] = le.fit_transform(df_cleaned[column])
    label_encoders[column] = le
    


6. Here the dataset is divided into training and testing set with 80% of data assigned for training and 20% for testing .

In [6]:

X=df_cleaned.drop('income',axis=1)
y=df_cleaned['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

7. The code initializes a StandardScaler to standardize the features by removing the mean and scaling to unit variance. It fits the scaler on the training data and transforms it, then uses the same scaler to transform the testing data, ensuring consistency and preventing data leakage. This process is crucial for optimal performance of many machine learning algorithms.

In [7]:

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

8. Define the SVM model and the hyperparameters

In [21]:
from sklearn.svm import SVC


model_svm = SVC()
params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}



9.GridSearchCV tries different sets of hyperparameters for a model (e.g., SVM) using cross-validation to find the best-performing set. In the code below, it attempts different parameter settings in param_grid on 5-fold cross-validation and selects the setting that gives the highest accuracy.

In [23]:

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model_svm, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1, error_score='raise')
grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_




In [24]:

y_pred = grid_search.predict(X_test)

In [26]:

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print("Best parameters: ", grid_search.best_params_)
print("Classification report: ", grid_search.best_estimator_)
print("Classification report: ", classification_report(y_test, y_pred))



Best parameters:  {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Classification report:  SVC(C=10)
Classification report:                precision    recall  f1-score   support

           0       0.88      0.94      0.91      4988
           1       0.73      0.57      0.64      1520

    accuracy                           0.85      6508
   macro avg       0.81      0.76      0.78      6508
weighted avg       0.84      0.85      0.84      6508

