In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler , OrdinalEncoder, LabelEncoder 
from sklearn.model_selection import train_test_split , RandomizedSearchCV , cross_val_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score , confusion_matrix

# Read the data

In [2]:
adult = pd.read_csv(r'C:\Users\divij\OneDrive\Documents\adult.csv' , header=None , na_values=' ?')

In [3]:
col = ['age', 'workclass','fnlwgt','education' ,'education-num','marital-status','occupation' ,'relationship' ,'race' ,'sex' ,'capital-gain','capital-loss','hours-per-week','native-country', 'Income']

In [4]:
adult.columns = col

In [5]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K


In [6]:
adult.tail(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
48833,32,Private,34066,10th,6,Married-civ-spouse,Handlers-cleaners,Husband,Amer-Indian-Eskimo,Male,0,0,40,United-States,<=50K
48834,43,Private,84661,Assoc-voc,11,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,<=50K
48835,32,Private,116138,Masters,14,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Male,0,0,11,Taiwan,<=50K
48836,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
48837,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
48842,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [7]:
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48843 entries, 0 to 48842
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48843 non-null  object
 1   workclass       48843 non-null  object
 2   fnlwgt          48843 non-null  object
 3   education       48843 non-null  object
 4   education-num   48843 non-null  object
 5   marital-status  48843 non-null  object
 6   occupation      48843 non-null  object
 7   relationship    48843 non-null  object
 8   race            48843 non-null  object
 9   sex             48843 non-null  object
 10  capital-gain    48843 non-null  object
 11  capital-loss    48843 non-null  object
 12  hours-per-week  48843 non-null  object
 13  native-country  48843 non-null  object
 14  Income          48843 non-null  object
dtypes: object(15)
memory usage: 5.6+ MB


# Find null values

In [8]:
adult.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Income            0
dtype: int64

In [9]:
adult.shape

(48843, 15)

In [10]:
adult.workclass.value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
workclass               1
Name: workclass, dtype: int64

In [11]:
adult.workclass.fillna('No_Info', inplace=True)

In [12]:
adult.occupation.value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
occupation              1
Name: occupation, dtype: int64

In [13]:
adult[(adult.occupation.isnull()) & (adult.workclass=='No_Info')]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income


In [14]:
adult.occupation.fillna('Unemployed',inplace=True)

In [15]:
adult['native-country'].value_counts()

United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Nicaragua                        49
Greece                           49
Peru                        

In [16]:
adult[adult['native-country'].isnull()]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income


In [17]:
adult['native-country'].fillna('other',inplace=True)

In [18]:
adult.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Income            0
dtype: int64

In [19]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K


# Replace the - by _ in all columns

In [20]:
adult.columns = list(map(lambda x : x.replace('-','_') ,adult.columns))

In [21]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Income
0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K


In [22]:
adult.workclass.unique()

array(['workclass', 'Private', 'Local-gov', '?', 'Self-emp-not-inc',
       'Federal-gov', 'State-gov', 'Self-emp-inc', 'Without-pay',
       'Never-worked'], dtype=object)

In [23]:
adult.workclass = adult.workclass.apply(lambda x : x.replace('-','_'))

In [24]:
adult.workclass.unique()

array(['workclass', 'Private', 'Local_gov', '?', 'Self_emp_not_inc',
       'Federal_gov', 'State_gov', 'Self_emp_inc', 'Without_pay',
       'Never_worked'], dtype=object)

# Convert the target variable to numerical

In [25]:
adult.Income.value_counts() 

<=50K     37155
>50K      11687
income        1
Name: Income, dtype: int64

In [26]:
adult.Income.value_counts() / len(adult)

<=50K     0.760703
>50K      0.239277
income    0.000020
Name: Income, dtype: float64

In [27]:
adult[adult.Income == ' <=50K']

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Income


In [28]:
# the data is imbalanced 
adult.Income = adult.Income.apply(lambda x : 0 if x ==' <=50K' else 1)

In [29]:
adult.Income.value_counts()

1    48843
Name: Income, dtype: int64

In [30]:
adult.age.max()

'age'

In [31]:
adult.age.min()

'17'

In [32]:
adult = pd.get_dummies(data= adult, columns=['workclass'] , drop_first=True)

In [33]:
adult.head()

Unnamed: 0,age,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,...,Income,workclass_Federal_gov,workclass_Local_gov,workclass_Never_worked,workclass_Private,workclass_Self_emp_inc,workclass_Self_emp_not_inc,workclass_State_gov,workclass_Without_pay,workclass_workclass
0,age,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,...,1,0,0,0,0,0,0,0,0,1
1,25,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,...,1,0,0,0,1,0,0,0,0,0
2,38,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,...,1,0,0,0,1,0,0,0,0,0
3,28,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,...,1,0,1,0,0,0,0,0,0,0
4,44,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,...,1,0,0,0,1,0,0,0,0,0


In [34]:
adult.education.unique()

array(['education', '11th', 'HS-grad', 'Assoc-acdm', 'Some-college',
       '10th', 'Prof-school', '7th-8th', 'Bachelors', 'Masters',
       'Doctorate', '5th-6th', 'Assoc-voc', '9th', '12th', '1st-4th',
       'Preschool'], dtype=object)

In [35]:
val = [[' Preschool',1] ,[' 1st-4th' ,2],[ ' 5th-6th' ,3] , [' 7th-8th' ,4] ,[ ' 9th',5] , [' 10th',6] ,[' 11th' ,7],
       [ ' 12th',8],[ ' Some-college',9] , [' Bachelors',10],
      [' HS-grad' ,11] ,[ ' Masters',12] ,[ ' Doctorate' ,13] , [' Prof-school' ,14] ,[' Assoc-acdm',15], [' Assoc-voc' ,16]]

In [36]:
Ord_enc = OrdinalEncoder()

In [37]:
adult.education = Ord_enc.fit_transform(np.array(adult.education).reshape(-1,1))

In [38]:
adult.head()

Unnamed: 0,age,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,...,Income,workclass_Federal_gov,workclass_Local_gov,workclass_Never_worked,workclass_Private,workclass_Self_emp_inc,workclass_Self_emp_not_inc,workclass_State_gov,workclass_Without_pay,workclass_workclass
0,age,fnlwgt,16.0,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,...,1,0,0,0,0,0,0,0,0,1
1,25,226802,1.0,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,...,1,0,0,0,1,0,0,0,0,0
2,38,89814,11.0,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,...,1,0,0,0,1,0,0,0,0,0
3,28,336951,7.0,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,...,1,0,1,0,0,0,0,0,0,0
4,44,160323,15.0,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,...,1,0,0,0,1,0,0,0,0,0


In [39]:
adult.marital_status.unique()

array(['marital-status', 'Never-married', 'Married-civ-spouse', 'Widowed',
       'Divorced', 'Separated', 'Married-spouse-absent',
       'Married-AF-spouse'], dtype=object)

# Remove the space from each value of categorical column

In [40]:
adult.marital_status = adult.marital_status.str.replace(' ',"")

In [41]:
adult.marital_status.unique()

array(['marital-status', 'Never-married', 'Married-civ-spouse', 'Widowed',
       'Divorced', 'Separated', 'Married-spouse-absent',
       'Married-AF-spouse'], dtype=object)

In [42]:
adult.marital_status = adult.marital_status.str.replace('-', '_')

In [43]:
adult = pd.get_dummies(data = adult , columns=['marital_status'], drop_first=True)

In [44]:
adult.occupation = Ord_enc.fit_transform(np.array(adult.occupation).reshape(-1,1))

In [45]:
adult.relationship.unique()

array(['relationship', 'Own-child', 'Husband', 'Not-in-family',
       'Unmarried', 'Wife', 'Other-relative'], dtype=object)

In [46]:
adult.relationship = adult.relationship.str.replace('-', '_')

In [47]:
le = LabelEncoder()
adult.relationship = le.fit_transform(np.array(adult.relationship).reshape(-1,1))
adult.race = le.fit_transform(np.array(adult.race).reshape(-1,1))

  y = column_or_1d(y, warn=True)


In [48]:
adult.race.unique()

array([5, 2, 4, 1, 3, 0])

In [49]:
adult = pd.get_dummies(adult , columns=['sex'], drop_first=True)

In [50]:
adult.head()

Unnamed: 0,age,fnlwgt,education,education_num,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,...,workclass_workclass,marital_status_Married_AF_spouse,marital_status_Married_civ_spouse,marital_status_Married_spouse_absent,marital_status_Never_married,marital_status_Separated,marital_status_Widowed,marital_status_marital_status,sex_Male,sex_gender
0,age,fnlwgt,16.0,educational-num,15.0,6,5,capital-gain,capital-loss,hours-per-week,...,1,0,0,0,0,0,0,1,0,1
1,25,226802,1.0,7,7.0,3,2,0,0,40,...,0,0,0,0,1,0,0,0,1,0
2,38,89814,11.0,9,5.0,0,4,0,0,50,...,0,0,1,0,0,0,0,0,1,0
3,28,336951,7.0,12,11.0,0,4,0,0,40,...,0,0,1,0,0,0,0,0,1,0
4,44,160323,15.0,10,7.0,0,2,7688,0,40,...,0,0,1,0,0,0,0,0,1,0


In [51]:
adult.native_country.unique()

array(['native-country', 'United-States', '?', 'Peru', 'Guatemala',
       'Mexico', 'Dominican-Republic', 'Ireland', 'Germany',
       'Philippines', 'Thailand', 'Haiti', 'El-Salvador', 'Puerto-Rico',
       'Vietnam', 'South', 'Columbia', 'Japan', 'India', 'Cambodia',
       'Poland', 'Laos', 'England', 'Cuba', 'Taiwan', 'Italy', 'Canada',
       'Portugal', 'China', 'Nicaragua', 'Honduras', 'Iran', 'Scotland',
       'Jamaica', 'Ecuador', 'Yugoslavia', 'Hungary', 'Hong', 'Greece',
       'Trinadad&Tobago', 'Outlying-US(Guam-USVI-etc)', 'France',
       'Holand-Netherlands'], dtype=object)

In [52]:
adult.native_country = le.fit_transform(np.ravel(adult.native_country))

In [53]:
adult.native_country.value_counts()

39    43832
26      951
0       857
30      295
11      206
33      184
2       182
8       155
19      151
5       138
9       127
3       122
35      115
23      106
22      105
6       103
24       92
13       88
31       87
40       86
4        85
14       75
32       67
36       65
20       59
27       49
12       49
29       46
7        45
10       38
21       37
17       30
37       30
1        28
38       27
25       23
41       23
28       23
34       21
16       20
18       19
42        1
15        1
Name: native_country, dtype: int64

In [54]:
adult.head(10)

Unnamed: 0,age,fnlwgt,education,education_num,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,...,workclass_workclass,marital_status_Married_AF_spouse,marital_status_Married_civ_spouse,marital_status_Married_spouse_absent,marital_status_Never_married,marital_status_Separated,marital_status_Widowed,marital_status_marital_status,sex_Male,sex_gender
0,age,fnlwgt,16.0,educational-num,15.0,6,5,capital-gain,capital-loss,hours-per-week,...,1,0,0,0,0,0,0,1,0,1
1,25,226802,1.0,7,7.0,3,2,0,0,40,...,0,0,0,0,1,0,0,0,1,0
2,38,89814,11.0,9,5.0,0,4,0,0,50,...,0,0,1,0,0,0,0,0,1,0
3,28,336951,7.0,12,11.0,0,4,0,0,40,...,0,0,1,0,0,0,0,0,1,0
4,44,160323,15.0,10,7.0,0,2,7688,0,40,...,0,0,1,0,0,0,0,0,1,0
5,18,103497,15.0,10,0.0,3,4,0,0,30,...,0,0,0,0,1,0,0,0,0,0
6,34,198693,0.0,6,8.0,1,4,0,0,30,...,0,0,0,0,1,0,0,0,1,0
7,29,227026,11.0,9,0.0,4,2,0,0,40,...,0,0,0,0,1,0,0,0,1,0
8,63,104626,14.0,15,10.0,0,4,3103,0,32,...,0,0,1,0,0,0,0,0,1,0
9,24,369667,15.0,10,8.0,4,4,0,0,40,...,0,0,0,0,1,0,0,0,0,0


In [55]:
#sns.boxplot(adult.fnlwgt)

In [56]:
#sns.boxplot(adult.capital_gain)

In [57]:
adult.hours_per_week.max()

'hours-per-week'

In [58]:
adult.hours_per_week.min()

'1'

In [59]:
min_trans = MinMaxScaler(feature_range=(0,100))

In [None]:
adult.head()

# perform feature selection

In [None]:
plt.figure(figsize=(25,18))
sns.heatmap(adult.corr() , annot=True)

In [None]:
# find the feature importance using Random Forest model

In [None]:
RFC = RandomForestClassifier(n_estimators=200)

In [None]:
RFC.fit(adult.drop('Income',axis=1) , adult.Income )

In [None]:
RFC.feature_importances_

In [None]:
feature_score = pd.DataFrame(RFC.feature_importances_ , index= adult.drop('Income',axis=1).columns )

In [None]:
feature_score.columns = ['score']

In [None]:
feature_score.sort_values(by='score', ascending=False)

In [None]:
x = adult.drop('Income' , axis=1)
y = adult.Income

In [None]:
mut_info = mutual_info_classif(x,y)

In [None]:
pd.DataFrame(mut_info , index=x.columns).sort_values(0,ascending=False)

In [None]:
pipeline ('impute ', 'labelencoder','minmaxscalar','feature_selection','logistic regression')
pipe.fit(x,y)