In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

adult_df = pd.read_csv("Data/adult.csv")
county_df = pd.read_csv("Data/acs2017_county_data.csv")

### Preprocessing

In [2]:
adult_df.workclass = adult_df.workclass.apply(lambda x: 'unknown' if (x == '?') else x)
adult_df.occupation = adult_df.occupation.apply(lambda x: 'unknown' if (x == '?') else x)

# Imputation of nulls using Simple Imputer
imputer = SimpleImputer(strategy='most_frequent')
county_df['ChildPoverty'] = imputer.fit_transform(county_df[['ChildPoverty']])

# Convert Income to binary in county_df
county_df.Income = county_df.Income.apply(lambda x: '<=50K' if (x <= 50000) else '>50K')

In [3]:
# Encode categorical variables in the adult_df
le = LabelEncoder()
adult_df['workclass'] = le.fit_transform(adult_df['workclass'])
adult_df['education'] = le.fit_transform(adult_df['education'])
adult_df['marital.status'] = le.fit_transform(adult_df['marital.status'])
adult_df['occupation'] = le.fit_transform(adult_df['occupation'])
adult_df['relationship'] = le.fit_transform(adult_df['relationship'])
adult_df['race'] = le.fit_transform(adult_df['race'])
adult_df['sex'] = le.fit_transform(adult_df['sex'])
adult_df['native.country'] = le.fit_transform(adult_df['native.country'])
adult_df['income'] = le.fit_transform(adult_df['income'])

# Encode categorical variables in the county_df
le_county = LabelEncoder()
county_df['State'] = le_county.fit_transform(county_df['State'])
county_df['County'] = le_county.fit_transform(county_df['County'])
county_df['Income'] = le_county.fit_transform(county_df['Income'])

In [4]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression model for adult_df
X_adult = pd.DataFrame(adult_df)
y_adult = X_adult.pop('income')
adult_model = LogisticRegression()
adult_model.fit(X_adult, y_adult)

# Train logistic regression model for county_df
X_county = pd.DataFrame(county_df)
y_county = X_county.pop('Income')
county_model = LogisticRegression()
county_model.fit(X_county, y_county)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
from sklearn.metrics import accuracy_score

y_pred_adult = adult_model.predict(X_adult)
print("Adults Dataset: ", accuracy_score(y_adult, y_pred_adult))

y_pred_county = county_model.predict(X_county)
print("County Dataset: ", accuracy_score(y_county, y_pred_county))

Adults Dataset:  0.7876907957372317
County Dataset:  0.7838509316770186


In [6]:
adult_df.insert(loc=0, column='score', value=y_pred_adult)

adult_df = adult_df.rename(columns={"income": "label_value"})
cols = adult_df.columns.tolist()
cols.remove('label_value')
cols.insert(1, 'label_value')
adult_df = adult_df[cols]

adult_df.to_csv('adult_processed.csv', index=False)


county_df.insert(loc=0, column='score', value=y_pred_county)

county_df = county_df.rename(columns={"Income": "label_value"})
cols = county_df.columns.tolist()
cols.remove('label_value')
cols.insert(1, 'label_value')
county_df = county_df[cols]

county_df.to_csv('acs2017_county_data_processed.csv', index=False)