In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# Column names for the census-income files. Both files are read with
# header=None below, so these names are applied positionally.
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_file = "../../input/census_income/adult.data"
train_data = pd.read_csv(train_data_file, header=None, names=CSV_HEADER)

test_data_file = "../../input/census_income/adult.test"
test_data = pd.read_csv(test_data_file, header=None, names=CSV_HEADER)

# Remove every "." from the test labels with a vectorised, literal (non-regex)
# replacement instead of a per-row Python lambda; a missing label stays missing
# rather than raising AttributeError.
# NOTE(review): presumably the test file writes its labels with a trailing
# period, unlike the training file -- verify against the raw data.
test_data["income_bracket"] = test_data["income_bracket"].str.replace(
    ".", "", regex=False
)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (32561, 15)
Test dataset shape: (16281, 15)


In [3]:
# Preview the first rows of the training frame to check how the columns were parsed.
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Preview the first rows of the test frame to check how the columns were parsed.
test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
# Look at the raw label strings; the displayed values carry a leading space (e.g. ' <=50K').
train_data['income_bracket'].values

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [6]:
# Encode the label as 0/1 (1 = '>50K'), ignoring surrounding whitespace in the raw string.
# A frame whose column is already numeric is left untouched: re-running this cell would
# otherwise compare integers with a string and silently set every label to 0.
for _frame in (train_data, test_data):
    if not pd.api.types.is_numeric_dtype(_frame['income_bracket']):
        _frame['income_bracket'] = 1 * (_frame['income_bracket'].str.strip() == '>50K')

In [7]:
# Write both frames out as CSV without a header row and without the index column.
# Note that train_data_file / test_data_file are rebound here: from this point on
# they name these output files, not the files the data was read from.
train_data_file, test_data_file = "../input/train_data.csv", "../input/test_data.csv"

_csv_kwargs = dict(header=False, index=False)
train_data.to_csv(train_data_file, **_csv_kwargs)
test_data.to_csv(test_data_file, **_csv_kwargs)

In [8]:
# Mean of the 0/1 label on the training split, i.e. the share of rows labelled 1.
train_data.income_bracket.mean()

0.2408095574460244

In [9]:
# Mean of the 0/1 label on the test split, i.e. the share of rows labelled 1.
test_data.income_bracket.mean()

0.23622627602727106

In [10]:
# Baseline: logistic regression on the five numeric columns only, scored by test accuracy.
# NOTE(review): no max_iter is passed here (later cells pass 1000 or 5000), so the
# solver's default iteration cap applies -- check for a convergence warning.
_numeric_features = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week']

lr = LogisticRegression(C=1)
lr.fit(train_data[_numeric_features].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[_numeric_features].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8124808058473066

In [11]:
# List the distinct raw gender labels before encoding them.
np.unique(train_data['gender'].values)

array([' Female', ' Male'], dtype=object)

In [12]:
# Encode gender as a 0/1 flag (1 = Male), ignoring surrounding whitespace in the raw label.
# A frame whose column is already numeric is left untouched: re-running this cell would
# otherwise compare integers with a string and silently set every row to 0.
for _frame in (train_data, test_data):
    if not pd.api.types.is_numeric_dtype(_frame['gender']):
        _frame['gender'] = 1 * (_frame['gender'].str.strip() == 'Male')

In [13]:
# Mean of the 0/1 gender flag on the training split, i.e. the share of rows encoded as 1.
train_data['gender'].mean()

0.6692054912318418

In [14]:
# Logistic regression on the five numeric columns plus the binary gender flag,
# scored by accuracy on the test split.
_features = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender']

lr = LogisticRegression(max_iter=1000, C=1)
lr.fit(train_data[_features].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[_features].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8228610036238561

In [15]:
# List the distinct relationship categories in the training split.
np.unique(train_data['relationship'].values)

array([' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
       ' Unmarried', ' Wife'], dtype=object)

In [16]:
# List the distinct race categories in the training split.
np.unique(train_data['race'].values)

array([' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
       ' White'], dtype=object)

In [17]:
# Fit a binarizer on the training race labels, then encode those same labels with it.
_train_race_labels = train_data['race'].values
lb = LabelBinarizer().fit(_train_race_labels)
race_train = lb.transform(_train_race_labels)

In [18]:
# Encode the test races with the binarizer that was fitted on the training split,
# so both splits are encoded by the same fitted object.
race_test = lb.transform(test_data['race'].values)

In [19]:
# Check the shape of the encoded training array (rows, encoded columns).
race_train.shape

(32561, 5)

In [20]:
# Wrap both encoded race arrays in DataFrames whose columns are named race0 .. race4.
_race_columns = [f'race{i}' for i in range(5)]
race_train = pd.DataFrame(race_train, columns=_race_columns)
race_test = pd.DataFrame(race_test, columns=_race_columns)

In [21]:
# Append the one-hot race columns to both splits (concat with axis=1 aligns on the row index).
# Any race columns left over from an earlier run of this cell are dropped first, so
# re-running it cannot create duplicate column names that would later make
# train_data[columns] return the same feature more than once.
train_data = pd.concat(
    [train_data.drop(columns=race_train.columns, errors='ignore'), race_train], axis=1
)
test_data = pd.concat(
    [test_data.drop(columns=race_test.columns, errors='ignore'), race_test], axis=1
)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket,race0,race1,race2,race3,race4
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0,0,0,0,0,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0,0,0,0,0,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0,0,0,0,0,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0,0,0,1,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0,0,0,1,0,0


In [22]:
# Feature set: numeric columns, the gender flag, and the five one-hot race columns.
columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender']
columns.extend(f'race{_i}' for _i in range(5))

lr = LogisticRegression(max_iter=1000, C=1)
lr.fit(train_data[columns].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8242736932620847

In [23]:
# One-hot encode `relationship`: fit a fresh binarizer on the training split, apply the
# same fitted encoder to the test split, and name the columns relationship0 .. relationship5.
lb = LabelBinarizer()
_relationship_columns = [f'relationship{i}' for i in range(6)]

relationship_train = pd.DataFrame(
    lb.fit_transform(train_data['relationship'].values),
    columns=_relationship_columns,
)
relationship_test = pd.DataFrame(
    lb.transform(test_data['relationship'].values),
    columns=_relationship_columns,
)

In [24]:
# Append the one-hot relationship columns to both splits (concat with axis=1 aligns on the row index).
# Columns left over from an earlier run of this cell are dropped first, so re-running
# it cannot create duplicate column names and thereby duplicate features downstream.
train_data = pd.concat(
    [train_data.drop(columns=relationship_train.columns, errors='ignore'), relationship_train],
    axis=1,
)
test_data = pd.concat(
    [test_data.drop(columns=relationship_test.columns, errors='ignore'), relationship_test],
    axis=1,
)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,...,race1,race2,race3,race4,relationship0,relationship1,relationship2,relationship3,relationship4,relationship5
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,...,0,0,0,1,0,1,0,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,...,0,0,0,1,1,0,0,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,...,0,0,0,1,0,1,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,...,0,1,0,0,1,0,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,...,0,1,0,0,0,0,0,0,0,1


In [25]:
# Feature set: numeric columns, the gender flag, and the one-hot race and relationship columns.
columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender']
for _prefix, _width in (('race', 5), ('relationship', 6)):
    columns.extend(f'{_prefix}{_i}' for _i in range(_width))

lr = LogisticRegression(max_iter=5000, C=1)
lr.fit(train_data[columns].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8441127694859039

In [26]:
# List the distinct workclass categories in the training split (the output includes a ' ?' entry).
np.unique(train_data['workclass'].values)

array([' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
       ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
      dtype=object)

In [27]:
# One-hot encode `workclass`. `lb` is the binarizer object created in an earlier cell;
# fit_transform refits it on the training workclass values before the test split is encoded.
_workclass_columns = [f'workclass{i}' for i in range(9)]

workclass_train = pd.DataFrame(
    lb.fit_transform(train_data['workclass'].values),
    columns=_workclass_columns,
)
workclass_test = pd.DataFrame(
    lb.transform(test_data['workclass'].values),
    columns=_workclass_columns,
)

In [28]:
# Append the one-hot workclass columns to both splits (concat with axis=1 aligns on the row index).
# Columns left over from an earlier run of this cell are dropped first, so re-running
# it cannot create duplicate column names and thereby duplicate features downstream.
train_data = pd.concat(
    [train_data.drop(columns=workclass_train.columns, errors='ignore'), workclass_train],
    axis=1,
)
test_data = pd.concat(
    [test_data.drop(columns=workclass_test.columns, errors='ignore'), workclass_test],
    axis=1,
)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,...,relationship5,workclass0,workclass1,workclass2,workclass3,workclass4,workclass5,workclass6,workclass7,workclass8
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,...,0,0,0,0,0,0,0,0,1,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,...,0,0,0,0,0,0,0,1,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,...,0,0,0,0,0,1,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,...,0,0,0,0,0,1,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,...,1,0,0,0,0,1,0,0,0,0


In [29]:
# Feature set: numeric columns, the gender flag, and the one-hot race, relationship
# and workclass columns.
# NOTE(review): C changes from cell to cell while only test-set accuracy is computed;
# if C was chosen by looking at that score, the reported accuracy is optimistic -- confirm.
columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender']
for _prefix, _width in (('race', 5), ('relationship', 6), ('workclass', 9)):
    columns.extend(f'{_prefix}{_i}' for _i in range(_width))

lr = LogisticRegression(max_iter=5000, C=0.3)
lr.fit(train_data[columns].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8462625145875561

In [30]:
# List the distinct marital_status categories in the training split.
np.unique(train_data['marital_status'].values)

array([' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
       ' Married-spouse-absent', ' Never-married', ' Separated',
       ' Widowed'], dtype=object)

In [31]:
# One-hot encode `marital_status`. `lb` is the binarizer object created in an earlier cell;
# fit_transform refits it on the training values before the test split is encoded.
_marital_status_columns = [f'marital_status{i}' for i in range(7)]

marital_status_train = pd.DataFrame(
    lb.fit_transform(train_data['marital_status'].values),
    columns=_marital_status_columns,
)
marital_status_test = pd.DataFrame(
    lb.transform(test_data['marital_status'].values),
    columns=_marital_status_columns,
)

In [32]:
# Append the one-hot marital_status columns to both splits (concat with axis=1 aligns on the row index).
# Columns left over from an earlier run of this cell are dropped first, so re-running
# it cannot create duplicate column names and thereby duplicate features downstream.
train_data = pd.concat(
    [train_data.drop(columns=marital_status_train.columns, errors='ignore'), marital_status_train],
    axis=1,
)
test_data = pd.concat(
    [test_data.drop(columns=marital_status_test.columns, errors='ignore'), marital_status_test],
    axis=1,
)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,...,workclass6,workclass7,workclass8,marital_status0,marital_status1,marital_status2,marital_status3,marital_status4,marital_status5,marital_status6
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,...,0,1,0,0,0,0,0,1,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,...,1,0,0,0,0,1,0,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,...,0,0,0,1,0,0,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,...,0,0,0,0,0,1,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,...,0,0,0,0,0,1,0,0,0,0


In [33]:
# Feature set: numeric columns, the gender flag, and the one-hot race, relationship,
# workclass and marital_status columns.
columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender']
for _prefix, _width in (
    ('race', 5),
    ('relationship', 6),
    ('workclass', 9),
    ('marital_status', 7),
):
    columns.extend(f'{_prefix}{_i}' for _i in range(_width))

lr = LogisticRegression(max_iter=5000, C=0.25)
lr.fit(train_data[columns].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.846446778453412

In [34]:
# List the distinct occupation categories in the training split (the output includes a ' ?' entry).
np.unique(train_data['occupation'].values)

array([' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair',
       ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
       ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
       ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',
       ' Transport-moving'], dtype=object)

In [35]:
# One-hot encode `occupation`. `lb` is the binarizer object created in an earlier cell;
# fit_transform refits it on the training values before the test split is encoded.
occupation_train = lb.fit_transform(train_data['occupation'].values)
occupation_test = lb.transform(test_data['occupation'].values)

# Name the columns occupation0, occupation1, ... using the encoded width rather than a
# hand-typed list, so the names always match the number of encoded columns.
_occupation_columns = [f'occupation{i}' for i in range(occupation_train.shape[1])]
occupation_train = pd.DataFrame(occupation_train, columns=_occupation_columns)
occupation_test = pd.DataFrame(occupation_test, columns=_occupation_columns)

# Append the encoded columns (concat with axis=1 aligns on the row index). Columns left
# over from an earlier run of this cell are dropped first, so re-running it cannot
# create duplicate column names and thereby duplicate features downstream.
train_data = pd.concat(
    [train_data.drop(columns=_occupation_columns, errors='ignore'), occupation_train],
    axis=1,
)
test_data = pd.concat(
    [test_data.drop(columns=_occupation_columns, errors='ignore'), occupation_test],
    axis=1,
)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,...,occupation5,occupation6,occupation7,occupation8,occupation9,occupation10,occupation11,occupation12,occupation13,occupation14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,...,0,0,0,0,0,0,0,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,...,0,0,0,0,0,0,0,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,...,0,1,0,0,0,0,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,...,0,1,0,0,0,0,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,...,0,0,0,0,0,1,0,0,0,0


In [36]:
# Feature set: numeric columns, the gender flag, and the one-hot race, relationship,
# workclass, marital_status and occupation columns.
columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender']
for _prefix, _width in (
    ('race', 5),
    ('relationship', 6),
    ('workclass', 9),
    ('marital_status', 7),
    ('occupation', 15),
):
    columns.extend(f'{_prefix}{_i}' for _i in range(_width))

lr = LogisticRegression(max_iter=5000, C=0.2)
lr.fit(train_data[columns].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8524046434494196

In [37]:
# List the distinct education categories in the training split.
np.unique(train_data['education'].values)

array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
       ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
       ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
       ' Some-college'], dtype=object)

In [38]:
# Disabled experiment: the whole cell body is one triple-quoted string, so none of the
# code inside it runs; the notebook only echoes the text back as the cell output.
# The text describes one-hot encoding `education` into education0 .. education15 and
# appending those columns to both splits.
'''education_train = lb.fit_transform(train_data['education'].values)
education_test = lb.transform(test_data['education'].values)
education_train = pd.DataFrame(education_train, columns = ['education0', 'education1', 'education2', 'education3', 'education4', 'education5', 
                                                        'education6', 'education7', 'education8', 'education9', 'education10', 'education11', 
                                                        'education12', 'education13', 'education14' , 'education15'] )
education_test = pd.DataFrame(education_test, columns = ['education0', 'education1', 'education2', 'education3', 'education4', 'education5', 
                                                        'education6', 'education7', 'education8', 'education9', 'education10', 'education11', 
                                                        'education12', 'education13', 'education14' , 'education15'])

train_data = pd.concat([train_data, education_train], axis=1)
test_data = pd.concat([test_data, education_test], axis=1)
train_data.head()'''

"education_train = lb.fit_transform(train_data['education'].values)\neducation_test = lb.transform(test_data['education'].values)\neducation_train = pd.DataFrame(education_train, columns = ['education0', 'education1', 'education2', 'education3', 'education4', 'education5', \n                                                        'education6', 'education7', 'education8', 'education9', 'education10', 'education11', \n                                                        'education12', 'education13', 'education14' , 'education15'] )\neducation_test = pd.DataFrame(education_test, columns = ['education0', 'education1', 'education2', 'education3', 'education4', 'education5', \n                                                        'education6', 'education7', 'education8', 'education9', 'education10', 'education11', \n                                                        'education12', 'education13', 'education14' , 'education15'])\n\ntrain_data = pd.concat([train_data, education_train]

In [39]:
# Disabled experiment: the whole cell body is one triple-quoted string, so none of the
# code inside it runs; the notebook only echoes the text back as the cell output.
# The text describes fitting the logistic regression with education0 .. education15
# added to the feature list (C=0.1).
'''columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender', 
           'race0', 'race1', 'race2', 'race3', 'race4', 'relationship0', 'relationship1', 'relationship2', 
            'relationship3', 'relationship4', 'relationship5', 'workclass0', 'workclass1', 'workclass2', 
            'workclass3', 'workclass4', 'workclass5', 'workclass6', 'workclass7', 'workclass8',
          'marital_status0', 'marital_status1', 'marital_status2', 'marital_status3', 'marital_status4', 'marital_status5', 'marital_status6',
          'occupation0', 'occupation1', 'occupation2',  'occupation3', 'occupation4', 'occupation5', 'occupation6', 'occupation7', 'occupation8',
          'occupation9', 'occupation10', 'occupation11', 'occupation12', 'occupation13', 'occupation14',
          'education0', 'education1', 'education2', 'education3', 'education4', 'education5', 
         'education6', 'education7', 'education8', 'education9', 'education10', 'education11', 
        'education12', 'education13', 'education14' , 'education15']
lr = LogisticRegression(C=0.1, max_iter=5000)
lr.fit(train_data[columns].values, train_data['income_bracket'])
test_preds = lr.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)'''

"columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender', \n           'race0', 'race1', 'race2', 'race3', 'race4', 'relationship0', 'relationship1', 'relationship2', \n            'relationship3', 'relationship4', 'relationship5', 'workclass0', 'workclass1', 'workclass2', \n            'workclass3', 'workclass4', 'workclass5', 'workclass6', 'workclass7', 'workclass8',\n          'marital_status0', 'marital_status1', 'marital_status2', 'marital_status3', 'marital_status4', 'marital_status5', 'marital_status6',\n          'occupation0', 'occupation1', 'occupation2',  'occupation3', 'occupation4', 'occupation5', 'occupation6', 'occupation7', 'occupation8',\n          'occupation9', 'occupation10', 'occupation11', 'occupation12', 'occupation13', 'occupation14',\n          'education0', 'education1', 'education2', 'education3', 'education4', 'education5', \n         'education6', 'education7', 'education8', 'education9', 'education10', 'education11', \

In [40]:
# List the distinct native_country values in the training split (the output includes a ' ?' entry).
np.unique(train_data['native_country'].values)

array([' ?', ' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba',
       ' Dominican-Republic', ' Ecuador', ' El-Salvador', ' England',
       ' France', ' Germany', ' Greece', ' Guatemala', ' Haiti',
       ' Holand-Netherlands', ' Honduras', ' Hong', ' Hungary', ' India',
       ' Iran', ' Ireland', ' Italy', ' Jamaica', ' Japan', ' Laos',
       ' Mexico', ' Nicaragua', ' Outlying-US(Guam-USVI-etc)', ' Peru',
       ' Philippines', ' Poland', ' Portugal', ' Puerto-Rico',
       ' Scotland', ' South', ' Taiwan', ' Thailand', ' Trinadad&Tobago',
       ' United-States', ' Vietnam', ' Yugoslavia'], dtype=object)

In [41]:
# Count the distinct native_country values present in the test split.
len(np.unique(test_data['native_country'].values))

41

In [42]:
# Collapse native_country into a 0/1 flag (1 = United-States, 0 = anything else),
# ignoring surrounding whitespace in the raw label.
# A frame whose column is already numeric is left untouched: re-running this cell would
# otherwise compare integers with a string and silently set every flag to 0.
for _frame in (train_data, test_data):
    if not pd.api.types.is_numeric_dtype(_frame['native_country']):
        _frame['native_country'] = 1 * (_frame['native_country'].str.strip() == 'United-States')
train_data['native_country'].mean()

0.895857006848684

In [43]:
# Mean of the 0/1 native_country flag on the test split, i.e. the share of rows encoded as 1.
test_data['native_country'].mean()

0.9005589337264296

In [44]:
# Feature set: numeric columns, the gender and native_country flags, and the one-hot
# race, relationship, workclass, marital_status and occupation columns.
# NOTE(review): C changes from cell to cell while only test-set accuracy is computed;
# if C was chosen by looking at that score, the reported accuracy is optimistic -- confirm.
columns = [
    'education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week',
    'gender', 'native_country',
]
for _prefix, _width in (
    ('race', 5),
    ('relationship', 6),
    ('workclass', 9),
    ('marital_status', 7),
    ('occupation', 15),
):
    columns.extend(f'{_prefix}{_i}' for _i in range(_width))

lr = LogisticRegression(max_iter=5000, C=0.22)
lr.fit(train_data[columns].values, train_data['income_bracket'])

test_preds = lr.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8535716479331736