In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/adult.csv')

In [5]:
def whitespace_remover(dataframe):

    # iterating over the columns
    for i in dataframe.columns:

        # checking datatype of each columns
        if dataframe[i].dtype == 'object':

            # applying strip function on column
            dataframe[i] = dataframe[i].map(str.strip)
        else:

            # if condition is False then it will do nothing.
            pass

# applying whitespace_remover function on dataframe
whitespace_remover(df)

In [6]:
df.drop_duplicates(keep='first',inplace=True)

In [7]:
income_map = {'<=50K':0,'>50K':1}

df['income'] = df['income'].map(income_map)

In [8]:
df.replace('?',np.nan,inplace=True)

In [9]:
df.drop(labels=['fnlwgt','capital-loss'],axis=1,inplace=True)

In [10]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,Cuba,0


In [11]:
X = df.iloc[:,0:12]
Y = df['income']

In [12]:
X.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,country
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,40,United-States
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,United-States
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,United-States
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,United-States
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,Cuba


In [13]:
Y

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: income, Length: 32537, dtype: int64

In [14]:
## Define which column shoulbe be ordinal-encoded and which should be scaled

categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

print(f'Categorical Features: {categorical_cols}')
print(f'Numerical Features: {numerical_cols}')

Categorical Features: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'country'],
      dtype='object')
Numerical Features: Index(['age', 'education-num', 'capital-gain', 'hours-per-week'], dtype='object')


In [15]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler ## Handling feature Scaling
from sklearn.preprocessing import OneHotEncoder ## Encoding
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
## Numerical Pipelines

num_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

## Categorical Pipeline

cat_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(sparse_output=False)),
    ('scaler',StandardScaler())
    ]
)

preprossor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.20,random_state=42)

In [18]:
X_train = pd.DataFrame(preprossor.fit_transform(X_train),columns=preprossor.get_feature_names_out())

In [19]:
X_test = pd.DataFrame(preprossor.transform(X_test),columns=preprossor.get_feature_names_out())

In [20]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,cat_pipeline__workclass_Self-emp-not-inc,...,cat_pipeline__country_Portugal,cat_pipeline__country_Puerto-Rico,cat_pipeline__country_Scotland,cat_pipeline__country_South,cat_pipeline__country_Taiwan,cat_pipeline__country_Thailand,cat_pipeline__country_Trinadad&Tobago,cat_pipeline__country_United-States,cat_pipeline__country_Vietnam,cat_pipeline__country_Yugoslavia
0,-0.18882,-0.421694,-0.143089,-0.193977,-0.173543,-0.261919,-0.013861,0.571243,-0.190335,-0.289359,...,-0.032816,-0.06116,-0.019604,-0.048864,-0.041615,-0.024013,-0.022354,0.310217,-0.046016,-0.023198
1,0.984862,-0.421694,0.322647,-0.031915,-0.173543,-0.261919,-0.013861,0.571243,-0.190335,-0.289359,...,-0.032816,-0.06116,-0.019604,-0.048864,-0.041615,-0.024013,-0.022354,0.310217,-0.046016,-0.023198
2,-0.33553,-0.032368,-0.143089,-0.031915,-0.173543,-0.261919,-0.013861,0.571243,-0.190335,-0.289359,...,-0.032816,-0.06116,-0.019604,-0.048864,-0.041615,-0.024013,-0.022354,0.310217,-0.046016,-0.023198
3,1.131572,-0.421694,-0.143089,-0.031915,-0.173543,-0.261919,-0.013861,0.571243,-0.190335,-0.289359,...,-0.032816,-0.06116,-0.019604,-0.048864,-0.041615,-0.024013,-0.022354,0.310217,-0.046016,-0.023198
4,-0.62895,-0.421694,-0.143089,1.183544,-0.173543,-0.261919,-0.013861,0.571243,-0.190335,-0.289359,...,-0.032816,-0.06116,-0.019604,-0.048864,-0.041615,-0.024013,-0.022354,0.310217,-0.046016,-0.023198


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [22]:
parameter = {'penalty':['l1', 'l2', 'elasticnet', 'None'],'C':[1,2,0.1,0.001,0.05,0.5],'max_iter':[100,200,300]}

In [23]:
classifier = LogisticRegression()

In [24]:
classifier_regressor=GridSearchCV(classifier,param_grid=parameter,scoring='accuracy',cv=5)

In [25]:
classifier_regressor.fit(X_train,y_train)

270 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

------------------------------------

In [26]:
print(classifier_regressor.best_params_)

{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}


In [27]:
print(classifier_regressor.best_score_)

0.8477466958799849


In [28]:
y_pred = classifier_regressor.predict(X_test)

In [29]:
accuracy_score(y_test,y_pred)

0.8527965580823602

In [30]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      4905
           1       0.74      0.62      0.67      1603

    accuracy                           0.85      6508
   macro avg       0.81      0.77      0.79      6508
weighted avg       0.85      0.85      0.85      6508



In [31]:
confusion_matrix(y_test,y_pred)

array([[4557,  348],
       [ 610,  993]])

# RandomForest Classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
model = RandomForestClassifier()

In [33]:
model.fit(X_train,y_train)

In [34]:
model.score(X_train,y_train)

0.9751431096085136

In [35]:
model_pred = model.predict(X_test)

In [36]:
accuracy_score(y_test,model_pred)

0.8403503380454825

In [37]:
params = dict(criterion=['gini', 'entropy'],max_depth=[1,10,5],min_samples_split=[1,10])

In [38]:
best_ran_model = RandomizedSearchCV(model,params,random_state=7)
best_ran_model.fit(X_train,y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
skl

In [39]:
best_ran_model.best_params_

{'min_samples_split': 10, 'max_depth': 10, 'criterion': 'gini'}

In [40]:
best_model = RandomForestClassifier(min_samples_split= 5, max_depth= 10, criterion= 'entropy').fit(X_train,y_train)

In [41]:
best_model.score(X_train,y_train)

0.8548926197702562

In [42]:
best_pred = best_model.predict(X_test)

In [43]:
accuracy_score(y_test,best_pred)

0.8551014136447449

# Naive Bayes

In [44]:
from sklearn.naive_bayes import GaussianNB
model  = GaussianNB().fit(X_train,y_train)

In [45]:
model.score(X_train,y_train)

0.4217219255445849

In [46]:
model_pred = model.predict(X_test)

In [47]:
accuracy_score(y_test,model_pred)

0.4248617086662569

# SVC

In [48]:
from sklearn.svm import SVC
svc = SVC().fit(X_train,y_train)

In [49]:
svc.score(X_train,y_train)

0.8587729071420339

In [54]:
svc_pred = svc.predict(X_test)
accuracy_score(y_test,svc_pred)

0.8455746773202213

##### Hyperparameter Tunning

In [55]:
params = dict(kernel=['linear', 'poly'],C=[1,0.1,0.01])

In [56]:
Grid_model = GridSearchCV(svc,params,n_jobs=-1)

In [57]:
Grid_model.fit(X_train,y_train)

In [58]:
Grid_model.best_params_

{'C': 0.01, 'kernel': 'linear'}

In [60]:
best_model = SVC(kernel='linear',gamma='scale',C=0.01).fit(X_train,y_train)

In [61]:
best_model.score(X_train,y_train)

0.8463636712897153

In [62]:
best_pred = best_model.predict(X_test)

In [63]:
accuracy_score(y_test,best_pred)

0.8488014751075599

#### Conclusion

we got best accuracy with random forest model so we use that model in our pipeline.