# Logistic regression  for Census dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import pandas_profiling

In [4]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'y']

In [5]:
adult_train = pd.read_csv('train.data', header = None)
adult_test  = pd.read_csv('test.data', header = None)
adult_train.columns = cols
adult_test.columns = cols
adult_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Missing value imputation

In [6]:
# remove whitespaces from the beginning of categorical values
for col in cols:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [7]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
adult_test[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_test[['capital_gain', 'capital_loss']])
adult_train[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_train[['capital_gain', 'capital_loss']])

In [8]:
# categorical imputation
categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', 'native_country']])
adult_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', 'native_country']])

## Transformation of categorical values to numerical ones

In [9]:
train = pd.get_dummies(adult_train, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])
test  = pd.get_dummies(adult_test, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])

In [10]:
train.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,y,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,39,77516,13,2174.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,7298.0,1887.0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,7298.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,7298.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,7298.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# transform y

train.loc[train['y'] == '>50K', 'y'] = 1
train.loc[train['y'] == '<=50K', 'y'] = 0

test.loc[test['y'] == '>50K', 'y'] = 1
test.loc[test['y'] == '<=50K', 'y'] = 0

In [12]:
# add missing column to test dataset
test['native_country_Holand-Netherlands'] = 0

In [13]:
# now no difference
set(train.columns).difference(set(test.columns))

set()

## Prepare datasets for training

In [14]:
y_train = train['y']
x_train = train.loc[:, train.columns != 'y']

In [15]:
y_test = test['y']
x_test = test.loc[:, test.columns != 'y']

## Train model

In [16]:
model = LogisticRegression(solver='lbfgs', max_iter = 1000)
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## Model evaluation

In [18]:
print("Training accuracy: {}".format(model.score(x_train, y_train)))

predicted = model.predict(x_test)

print("Testing accuracy:  {}".format(metrics.accuracy_score(y_test, predicted)))

Training accuracy: 0.819630846718
Testing accuracy:  0.819630846718


In [19]:
probabilities = model.predict_proba(x_test)

probabilities

array([[0.94965613, 0.05034387],
       [0.41252398, 0.58747602],
       [0.94832952, 0.05167048],
       ...,
       [0.94165548, 0.05834452],
       [0.97615635, 0.02384365],
       [0.3946278 , 0.6053722 ]])

## Model evaluation using cross-validation

In [20]:
scores = cross_val_score(LogisticRegression(solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=10)

In [21]:
print(scores)
# mean accuracy using cross-validation
print(scores.mean())

[0.77230155 0.80500537 0.77687346 0.80697174 0.77272727 0.79914005
 0.76504914 0.77933047 0.78209459 0.77272727]
0.7832220929526319


In [52]:
def print_accuracy(x_train, y_train, x_test, y_test):
        model = LogisticRegression(solver='lbfgs', max_iter = 1000)
        model.fit(x_train, y_train)
        print("Training accuracy: {}".format(model.score(x_train, y_train)))

        predicted = model.predict(x_test)

        print("Testing accuracy:  {}".format(metrics.accuracy_score(y_test, predicted)))
        print()

# 1 - Misclassification noise

In [28]:
percentages = [0.01, 0.05, 0.1, 0.2]
subsets_len = [int(len(y_train) * percentage) for percentage in percentages]

Y_train_001 = y_train.copy(deep='all')
Y_train_005 = y_train.copy(deep='all')
Y_train_010 = y_train.copy(deep='all')
Y_train_020 = y_train.copy(deep='all')

subsets = [Y_train_001, Y_train_005, Y_train_010, Y_train_020]

In [41]:
flipped_subsets = []
for length, subset in zip(subsets_len, subsets):
    curr_indexes = y_train.sample(length).index
    subset = [subset[element_indx] ^ 1 if element_indx in curr_indexes else subset[element_indx] for element_indx in range(len(subset))  ]
    print(sum(subset))
    flipped_subsets.append(subset)

8016
8675
9581
11195


## Check accuracy again

In [53]:
print_accuracy(x_train, subset, x_test, y_test)

Training accuracy: 0.795890789595
Testing accuracy:  0.801940972329
()
Training accuracy: 0.743435398176
Testing accuracy:  0.770277325635
()
Training accuracy: 0.711403212432
Testing accuracy:  0.7657012991
()
Training accuracy: 0.65569239274
Testing accuracy:  0.760848868278
()
Training accuracy: 0.795890789595
Testing accuracy:  0.801940972329
()
Training accuracy: 0.743435398176
Testing accuracy:  0.770277325635
()
Training accuracy: 0.711403212432
Testing accuracy:  0.7657012991
()
Training accuracy: 0.65569239274
Testing accuracy:  0.760848868278
()
Training accuracy: 0.795890789595
Testing accuracy:  0.801940972329
()
Training accuracy: 0.743435398176
Testing accuracy:  0.770277325635
()
Training accuracy: 0.711403212432
Testing accuracy:  0.7657012991
()
Training accuracy: 0.65569239274
Testing accuracy:  0.760848868278
()
Training accuracy: 0.795890789595
Testing accuracy:  0.801940972329
()
Training accuracy: 0.743435398176
Testing accuracy:  0.770277325635
()
Training accura

# 2 - Attribute noise

In [46]:
adult_train.head(5)
new_adult_train = adult_train.drop(columns=["y"])

In [47]:
new_adult_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,1887.0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7298.0,1887.0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,7298.0,1887.0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,7298.0,1887.0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,7298.0,1887.0,40,Cuba


In [48]:
percentages = [0.01, 0.05, 0.1, 0.2]
subsets_len = [int(len(new_adult_train) * percentage) for percentage in percentages]

x_train_001 = new_adult_train.copy(deep='all')
x_train_005 = new_adult_train.copy(deep='all')
x_train_010 = new_adult_train.copy(deep='all')
x_train_020 = new_adult_train.copy(deep='all')

subsets = [x_train_001, x_train_005, x_train_010, x_train_020]

In [50]:
negated_subsets = []
for length, subset in zip(subsets_len, subsets):
    curr_indexes = y_train.sample(length).index
    subset.loc[curr_indexes, 'age'] = [-el for el in subset.loc[curr_indexes, 'age']]
    print(sum(subset['age']))
    negated_subsets.append(subset)

1205919
1129989
1005843
754629
