In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import tensorflow as tf
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K
from tensorflow.keras.metrics import BinaryAccuracy

In [None]:
df = pd.read_csv("../input/censusincomedata/census-income-training.csv")

In [None]:
df.head()

Descriptions of Data (From original dataset: https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29)

age AAGE

class of worker ACLSWKR

industry code ADTIND

occupation code ADTOCC

adjusted gross income AGI

education AHGA

wage per hour AHRSPAY

enrolled in edu inst last wk AHSCOL

marital status AMARITL

major industry code AMJIND

major occupation code AMJOCC

race ARACE

hispanic Origin AREORGN

sex ASEX

member of a labor union AUNMEM

reason for unemployment AUNTYPE

full or part time employment stat AWKSTAT

capital gains CAPGAIN

capital losses CAPLOSS

dividends from stocks DIVVAL

federal income tax liability FEDTAX

tax filer status FILESTAT

region of previous residence GRINREG

state of previous residence GRINST

detailed household and family stat HHDFMX

detailed household summary in household HHDREL

instance weight MARSUPWT

migration code-change in msa MIGMTR1

migration code-change in reg MIGMTR3

migration code-move within reg MIGMTR4

live in this house 1 year ago MIGSAME

migration prev res in sunbelt MIGSUN

num persons worked for employer NOEMP

family members under 18 PARENT

total person earnings PEARNVAL

country of birth father PEFNTVTY

country of birth mother PEMNTVTY

country of birth self PENATVTY

citizenship PRCITSHP

total person income PTOTVAL

own business or self employed SEOTR

taxable income amount TAXINC

fill inc questionnaire for veteran's admin VETQVA

veterans benefits VETYN

weeks worked in year WKSWORK

In [None]:
print(df.shape)

In [None]:
# Rotate the column order so that the last column of `df` is displayed first.
column_names = list(df.columns)
cols = [*column_names[-1:], *column_names[:-1]]
train = df[cols]
train.head()

In [None]:
# Collect the object-dtype columns of the training frame and show the distinct
# values each of them takes.
object_list = [name for name in df.columns if df.dtypes[name] == "object"]
for name in object_list:
    print(f"{name}: {df[name].unique()}")

In [None]:
# Descriptive stat for non object columns
df.describe()

Abnormal responses on object type columns:

AREORGN (hispanic Origin) = "Do not know"

GRINREG (region of previous residence) = "?"

MIGMTR1 (migration code-change in msa) = "?"

MIGMTR3 (migration code-change in reg) = "?"

MIGSAME (live in this house 1 year ago) = "?"

PARENT (family members under 18) = Only one value: "Not in universe"

PEFNTVTY (country of birth father) = "?"

PEMNTVTY (country of birth mother) = "?"

PENATVTY (country of birth self) = "?"

Dropping these abnormal responses can reduce the noise. If only a small number of observations contain an abnormal response, we'll delete those observations; otherwise we'll delete the whole column, since the feature can't give much information. We'll also delete the PARENT feature since it has only one value. But before that, we need to check whether test_df has the same problem.

In [None]:
# Load the test set and list the distinct values of each of ITS object-dtype
# columns, so they can be compared with the values seen in the training set.
test_df = pd.read_csv("../input/censusincomedata/census-income-test.csv")
object_list = [col for col in test_df.columns if test_df.dtypes[col] == "object"]
for col in object_list:
    # Read from test_df, not df: the point of this cell is to check whether the
    # test data shows the same abnormal responses as the training data.
    print(f"{col}: {test_df[col].unique()}")

In [None]:
# Share of rows holding a placeholder ("abnormal") response in each affected column.
# The denominator is the actual number of rows in `df` instead of a hard-coded
# count, so the proportions stay correct if the input file or earlier filtering changes.
abnormal_responses = {
    'AREORGN': 'Do not know',
    'GRINREG': '?',
    'MIGMTR1': '?',
    'MIGMTR3': '?',
    'MIGSAME': '?',
    'PEFNTVTY': '?',
    'PEMNTVTY': '?',
    'PENATVTY': '?',
}
n_rows = len(df)
for column, response in abnormal_responses.items():
    n_abnormal = (df[column] == response).sum()
    print(f"Proportion of '{response}' observation in {column}: {n_abnormal / n_rows}")

MIGMTR1, MIGMTR3 and MIGSAME have a very high proportion of abnormal responses (over 0.5). Therefore we'll drop those features, together with the PARENT and Id features.

In [None]:
# Drop the three migration columns with a high share of "?" responses, the
# single-valued PARENT column, and the Id column.
# NOTE(review): this rebinds `df`, so the cell is not idempotent - running it a
# second time raises KeyError because the columns are already gone.
df = df.drop(['MIGMTR1', 'MIGMTR3', 'MIGSAME', 'PARENT', 'Id'], axis=1)
df.head()

In [None]:
# Features are every column but the last; the target is the last column.
# NOTE(review): assumes the label is the final column of `df` - confirm against the CSV header.
X, y = df.iloc[:,:-1] , df.iloc[:,-1]
X.head()

In [None]:
y.head()

In [None]:
# Restrict X to the columns that describe() summarises, then keep rows 3 and 7
# of the summary (min and max in describe()'s default layout).
num_col = X.describe().columns
num_df = X[num_col]
num_data_df = num_df.describe().iloc[[3, 7]]
num_data_df

In [None]:
# Spread of every summarised column: second summary row minus the first.
for position, column_name in enumerate(num_data_df.columns):
    spread = num_data_df.iloc[1, position] - num_data_df.iloc[0, position]
    print(f"Range of variable {column_name} is: {spread}")

The ranges of the variables 'CAPGAIN', 'DIVVAL' and 'HHDREL' are too large, so we first reduce them by applying a log function and then normalize those columns with MinMaxScaler. Since the minimum value of those columns is 0, we add 1 before taking the log to avoid log(0).

In [None]:
# log(x + 1) compresses the wide-ranged columns; the +1 keeps zero values finite.
scale_col = ['CAPGAIN', 'DIVVAL', 'HHDREL']
log_shifted = np.log(X[scale_col] + 1)
# Fit the min-max scaler on the log-transformed training columns and show the
# per-column maxima it learned.
scaler = MinMaxScaler().fit(log_shifted)
print(scaler.data_max_)
data = scaler.transform(log_shifted)


In [None]:
# Wrap the scaled array in a DataFrame that carries X's row index. The column-wise
# concat that follows aligns on the index, so this pairs every scaled row with its
# source row even when X's index is not the default 0..n-1 range.
scale_col_normalized = ['CAPGAIN_LogNorm', 'DIVVAL_LogNorm', 'HHDREL_LogNorm']
data_df = pd.DataFrame(data, columns = scale_col_normalized, index = X.index)

In [None]:
data_df.head()

In [None]:
print(X.shape)
print(data_df.shape)

In [None]:
# Append the three *_LogNorm columns next to the existing features (concat along
# axis=1 aligns rows on the index).
# NOTE(review): rebinding X makes this cell non-idempotent - a second run appends
# the same three columns again, producing duplicate column names.
X = pd.concat([X,data_df], axis = 1)
print(X.shape)

In [None]:
for col in X.columns:
    print(col)

We will try two different datasets:
1. Dataset with Normalization
2. Dataset without Normalization

Since the variables that we log-normalized ('CAPGAIN', 'DIVVAL', 'HHDREL') are extremely skewed, we're not sure whether log-normalizing would yield a better result.

We will see which model performs better.

In [None]:
# List the columns that are not stored with object dtype.
non_object_columns = [name for name in X.columns if X.dtypes[name] != "object"]
for name in non_object_columns:
    print(name)

Since we are going to use XGBoost, we need to encode categorical variables as numerical variables. The two ways to encode categorical variables are (1) one-hot encoding and (2) label encoding. Each method suits a different situation: one-hot encoding when a column has only a few unique values, and label encoding when the values are ordinal. For convenience, we'll apply one-hot encoding to all of the object variables, since the order of the ordinal variables is not defined.

Features with dtype == object

ACLSWKR - One-hot v

AHGA - One-hot v

AHSCOL - One-hot

AMARITL - One-hot v

AMJIND - One-hot 

AMJOCC - One-hot 

ARACE - One-hot v

AREORGN - One-hot

ASEX - One-hot v

AUNMEM - One-hot v

AUNTYPE - One-hot

AWKSTAT - One-hot v

FEDTAX - One-hot

FILESTAT - One-hot

GRINREG - One-hot

GRINST - One-hot

HHDFMX - One-hot

MIGMTR4 - One-hot

PEFNTVTY - One-hot v

PEMNTVTY - One-hot

PENATVTY - One-hot

PRCITSHP - One-hot v

VETQVA - One-hot

----------------------------------------

Feature with dtype != object:

AAGE v

ADTIND v

ADTOCC v

AHRSPAY v

CAPGAIN v

CAPLOSS v 

DIVVAL v

HHDREL

NOEMP v

SEOTR 

VETYN v

WKSWORK v

Since there are many features in this dataset, we'll train one model with all the features and another with only the key features (marked with v).

In [None]:
# Column names used for the all-features models; this list is later used to select
# the matching columns from the test set. It includes the three *_LogNorm columns.
all_variables = ['AAGE', 'ACLSWKR', 'ADTIND', 'ADTOCC', 'AHGA', 'AHRSPAY', 'AHSCOL',
                'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE',
                'AWKSTAT', 'CAPGAIN', 'CAPLOSS', 'DIVVAL', 'FEDTAX', 'FILESTAT', 'GRINREG', 'GRINST',
                'HHDFMX', 'HHDREL', 'MIGMTR4', 'NOEMP', 'PEFNTVTY', 
                'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR', 'year', 'VETQVA', 'VETYN', 
                 'WKSWORK','CAPGAIN_LogNorm', 'DIVVAL_LogNorm', 'HHDREL_LogNorm']
# Reduced feature set: the features marked "v" in the markdown list above, plus the
# log-normalised versions of CAPGAIN and DIVVAL.
key_variables = ['ACLSWKR','AMARITL','ARACE','ASEX','AUNMEM','AWKSTAT',
                 'PEFNTVTY','PRCITSHP','AAGE','ADTIND','ADTOCC','AHRSPAY',
                 'CAPGAIN','CAPLOSS','DIVVAL','NOEMP','VETYN','WKSWORK','CAPGAIN_LogNorm', 'DIVVAL_LogNorm']
# Training frame restricted to the key variables (still un-encoded at this point).
X_key = X[key_variables]
X_key.head()

Convert the categorical variables to numerical variables using one-hot encoding. After that we can drop the original categorical variables.

In [None]:
# One-hot encode every object-dtype column of X in a single call. Dummy columns are
# named "<column>__<value>" (double underscore), the source columns are removed,
# and the dummies are appended after the remaining columns.
object_list = [name for name in X.columns if X.dtypes[name] == "object"]
X = pd.get_dummies(X, columns=object_list, prefix_sep="__")
X.head()


In [None]:
# One-hot encode every object-dtype column of the key-variable frame in a single
# call. Dummy columns are named "<column>__<value>" (double underscore) and the
# source columns are removed.
object_list = [name for name in X_key.columns if X_key.dtypes[name] == "object"]
X_key = pd.get_dummies(X_key, columns=object_list, prefix_sep="__")
X_key.head()

In [None]:
# Two views of each feature set: "Original" keeps the raw columns and removes the
# *_LogNorm ones; "Normalized" keeps the *_LogNorm columns and removes the raw ones.
X_Original = X.drop(columns=['CAPGAIN_LogNorm', 'DIVVAL_LogNorm', 'HHDREL_LogNorm'])
X_Normalized = X.drop(columns=['CAPGAIN', 'DIVVAL', 'HHDREL'])
X_key_Original = X_key.drop(columns=['CAPGAIN_LogNorm', 'DIVVAL_LogNorm'])
X_key_Normalized = X_key.drop(columns=['CAPGAIN', 'DIVVAL'])

Now we need to load test data and preprocess for future modeling.

In [None]:
def check_column_number(A,B):
    """Return the column labels of ``A`` that are absent from ``B``.

    Despite its name, the function compares column *labels*, not counts, and the
    comparison is one-directional: labels present only in ``B`` are not reported.

    Args:
        A: Object exposing a ``columns`` iterable of hashable labels.
        B: Object exposing a ``columns`` iterable of hashable labels.

    Returns:
        list: Labels found in ``A.columns`` but not in ``B.columns``, each listed
        once, in the order they first appear in ``A``.
    """
    known = set(B.columns)
    # dict.fromkeys de-duplicates while keeping A's column order, so the result is
    # the same on every run (a plain set difference has arbitrary order).
    return [col for col in dict.fromkeys(A.columns) if col not in known]

In [None]:
# Apply to the test set the SAME log + min-max transform that was fitted on the
# training data. `scaler` is the MinMaxScaler fitted on the training columns earlier
# in the notebook; refitting a new scaler on the test set would put test values on
# a different scale from the one the models are trained on, and would overwrite
# the training scaler.
scale_col = ['CAPGAIN', 'DIVVAL', 'HHDREL']
data = test_df[scale_col]
data = data.apply(lambda x: x + 1)
data = data.apply(np.log)
# These are the training maxima that define the scale being applied.
print(scaler.data_max_)
data = scaler.transform(data)

In [None]:
# Wrap the scaled array in a DataFrame that carries test_df's row index, so the
# column-wise concat (which aligns on the index) pairs every scaled row with its
# source row even when test_df's index is not the default 0..n-1 range.
# NOTE(review): rebinding test_df makes this cell non-idempotent - a second run
# appends the same three columns again.
scale_col_normalized = ['CAPGAIN_LogNorm', 'DIVVAL_LogNorm', 'HHDREL_LogNorm']
data_df = pd.DataFrame(data, columns = scale_col_normalized, index = test_df.index)
test_df = pd.concat([test_df,data_df], axis = 1)

In [None]:
testmodel_df = test_df[all_variables]
testmodel_df.shape

In [None]:
# One-hot encode the test features the same way as the training features
# ("<column>__<value>" dummy names, source columns removed).
object_list = [col for col in testmodel_df.columns if testmodel_df.dtypes[col] == "object"]
testmodel_df = pd.get_dummies(testmodel_df, columns=object_list, prefix_sep="__")

# Encoding train and test separately yields different dummy columns whenever a
# category occurs in only one of them. Align the test frame to the encoded training
# frame `X`: dummies missing from the test data are added as all-zero columns,
# categories the models never saw are dropped, and the column order matches training.
train_columns = X.columns
added_columns = [c for c in train_columns if c not in testmodel_df.columns]
dropped_columns = [c for c in testmodel_df.columns if c not in train_columns]
print(f"Columns added to test as zeros (absent from test data): {added_columns}")
print(f"Test-only columns dropped (unseen in training): {dropped_columns}")
testmodel_df = testmodel_df.reindex(columns=train_columns, fill_value=0)

In [None]:
# Test-set counterparts of the training views: "original" removes the *_LogNorm
# columns, "norm" removes the raw columns they were derived from.
testmodel_original_df = testmodel_df.drop(columns=['CAPGAIN_LogNorm', 'DIVVAL_LogNorm', 'HHDREL_LogNorm'])
testmodel_norm_df = testmodel_df.drop(columns=['CAPGAIN', 'DIVVAL', 'HHDREL'])

In [None]:
# Columns present in the test frames but absent from the training frames.
# The check is one-directional: columns that exist only in training are not listed.
print(check_column_number(testmodel_original_df, X_Original))
print(check_column_number(testmodel_norm_df, X_Normalized))

Before fitting the model, we need to split df into training, validation, and test data. All three are useful when developing the model:

Train data: Lets the model learn the patterns in the data from a large number of examples.

Test data: Measures how well the model performs by comparing its predictions with the true labels of data it never saw during training.

Validation data: Used during training to monitor performance on held-out data and to adjust the training process (here, to decide when to stop early).

In [None]:
# Define function for model evaluation
def classifier_eval(test, pred, pred_proba=None):
    """Print standard binary-classification metrics for a set of predictions.

    Args:
        test: True binary labels.
        pred: Predicted hard class labels, same length as ``test``.
        pred_proba: Optional scores or positive-class probabilities. When given,
            ROC AUC is computed from them. When omitted, ROC AUC is computed from
            ``pred`` exactly as before, which only reflects the single operating
            point implied by the hard labels rather than the ranking quality.

    Returns:
        None. All results are printed.
    """
    auc_input = pred if pred_proba is None else pred_proba
    print(f"Confusion Matrix: \n {confusion_matrix(test, pred)}")
    print(f"Accuracy: {accuracy_score(test, pred)}")
    print(f"Precision: {precision_score(test, pred)}")
    print(f"Recall: {recall_score(test, pred)}")
    print(f"F1: {f1_score(test, pred)}")
    print(f"ROC_AUC_SCORE: {roc_auc_score(test, auc_input)}")


# XGboosting with Original Dataset

In [None]:
# Hold out 20% of the rows as the test split, then take 25% of the remaining 80%
# as the validation split: 60/20/20 train/validation/test overall.
# The fixed random_state makes both splits repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_Original, y, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)

In [None]:
xgb_model_original= xgb.XGBClassifier(n_estimators = 200, 
                             learning_rate = 0.3, 
                             max_depth = 3,
                             objective='binary:logistic', 
                             use_label_encoder=False)


In [None]:
# Train with early stopping: log-loss on the validation split is evaluated each
# boosting round and training stops once it has not improved for 100 rounds.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric on
# the estimator constructor instead of fit() - verify against the installed version.
xgb_model_original.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])
# Hard class predictions for the held-out test split.
y_pred = xgb_model_original.predict(X_test)

In [None]:
classifier_eval(y_test,y_pred)

In [None]:
# Use the model to make predictions
predicted_result = xgb_model_original.predict(testmodel_original_df)
print(predicted_result)

In [None]:
# Pair every test Id with its predicted label and order the rows by Id.
submission_columns = {'Id': test_df['Id'], 'income_morethan_50K': predicted_result}
my_submission = pd.DataFrame(submission_columns).sort_values(by=['Id'])
# Write the ordered table without the index column.
my_submission.to_csv('XGboost_Original.csv', index=False)

# XGboosting with Normalized dataset

In [None]:
# Same 60/20/20 train/validation/test split as before (20% test, then 25% of the
# remaining 80% as validation), this time on the log-normalised feature view.
X_train, X_test, y_train, y_test = train_test_split(X_Normalized, y, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)

In [None]:
xgb_model_normalized= xgb.XGBClassifier(n_estimators = 200, 
                             learning_rate = 0.3, 
                             max_depth = 3,
                             objective='binary:logistic', 
                             use_label_encoder=False)

In [None]:
# Train with early stopping on validation log-loss (stop after 100 rounds without
# improvement).
# NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric on
# the estimator constructor instead of fit() - verify against the installed version.
xgb_model_normalized.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])
# Hard class predictions for the held-out test split.
y_pred = xgb_model_normalized.predict(X_test)

In [None]:
classifier_eval(y_test,y_pred)

In [None]:
# Use the model to make predictions
predicted_result = xgb_model_normalized.predict(testmodel_norm_df)

In [None]:
# Pair every test Id with its predicted label and order the rows by Id.
submission_columns = {'Id': test_df['Id'], 'income_morethan_50K': predicted_result}
my_submission = pd.DataFrame(submission_columns).sort_values(by=['Id'])
# Write the ordered table without the index column.
my_submission.to_csv('XGboost_Original_Normalized.csv', index=False)

Now we move on to the model that uses only the key variables. We need to preprocess test_df in the same way as we did for the dataset with all variables.

In [None]:
# Test-set counterpart of the key-variable training frame.
testmodel_key_df = test_df[key_variables]

# One-hot encode the object columns ("<column>__<value>" dummy names, source
# columns removed).
object_list = [col for col in testmodel_key_df.columns if testmodel_key_df.dtypes[col] == "object"]
testmodel_key_df = pd.get_dummies(testmodel_key_df, columns=object_list, prefix_sep="__")

# Align to the encoded training frame `X_key`: dummies missing from the test data
# become all-zero columns, categories unseen in training are dropped, and the
# column order matches what the key-variable models are fitted with.
testmodel_key_df = testmodel_key_df.reindex(columns=X_key.columns, fill_value=0)

In [None]:
# Key-variable test views: "original" removes the *_LogNorm columns, "norm" removes
# the raw columns they were derived from.
testmodel_key_original_df = testmodel_key_df.drop(columns=['CAPGAIN_LogNorm', 'DIVVAL_LogNorm'])
testmodel_key_norm_df = testmodel_key_df.drop(columns=['CAPGAIN', 'DIVVAL'])

In [None]:
# Columns present in the key-variable test frames but absent from the matching
# key-variable training frames. (Previously this re-checked the all-variable
# frames, so the key-variable frames were never verified.)
print(check_column_number(testmodel_key_original_df, X_key_Original))
print(check_column_number(testmodel_key_norm_df, X_key_Normalized))

# XGboost with Key Variables

In [None]:
# 60/20/20 train/validation/test split of the key-variable features (20% test, then
# 25% of the remaining 80% as validation). This also rebinds y_train, y_val and y_test.
X_train_key, X_test_key, y_train, y_test = train_test_split(X_key_Original, y, test_size = 0.2, random_state = 3400)
X_train_key, X_val_key, y_train, y_val = train_test_split(X_train_key, y_train, test_size = 0.25, random_state = 3400)

In [None]:
xgb_model_key_original= xgb.XGBClassifier(n_estimators = 600, 
                             learning_rate = 0.15, 
                             max_depth = 3,
                             objective='binary:logistic', 
                             use_label_encoder=False)

In [None]:
# Train with early stopping on validation log-loss (stop after 100 rounds without
# improvement).
# NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric on
# the estimator constructor instead of fit() - verify against the installed version.
xgb_model_key_original.fit(X_train_key, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val_key, y_val)])
# Hard class predictions for the held-out test split.
y_pred_key = xgb_model_key_original.predict(X_test_key)

In [None]:
classifier_eval(y_test, y_pred_key)

In [None]:
# Use the model to make predictions
predicted_result = xgb_model_key_original.predict(testmodel_key_original_df)

print(len(predicted_result))
print(test_df.shape)

In [None]:
# Pair every test Id with its predicted label and order the rows by Id.
submission_columns = {'Id': test_df['Id'], 'income_morethan_50K': predicted_result}
my_submission = pd.DataFrame(submission_columns).sort_values(by=['Id'])
my_submission.head()

# Write the ordered table without the index column.
my_submission.to_csv('XGboost_key.csv', index=False)

# XGboosting with key variables normalized

In [None]:
# 60/20/20 train/validation/test split of the log-normalised key-variable features
# (20% test, then 25% of the remaining 80% as validation).
X_train_key, X_test_key, y_train, y_test = train_test_split(X_key_Normalized, y, test_size = 0.2, random_state = 3400)
X_train_key, X_val_key, y_train, y_val = train_test_split(X_train_key, y_train, test_size = 0.25, random_state = 3400)

In [None]:
xgb_model_key_normalized= xgb.XGBClassifier(n_estimators = 600, 
                             learning_rate = 0.15, 
                             max_depth = 3,
                             objective='binary:logistic', 
                             use_label_encoder=False)

In [None]:
# Train with early stopping on validation log-loss (stop after 100 rounds without
# improvement).
# NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric on
# the estimator constructor instead of fit() - verify against the installed version.
xgb_model_key_normalized.fit(X_train_key, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val_key, y_val)])
# Hard class predictions for the held-out test split.
y_pred_key = xgb_model_key_normalized.predict(X_test_key)

In [None]:
classifier_eval(y_test, y_pred_key)

In [None]:
testmodel_key_norm_df.shape

In [None]:
testmodel_key_original_df.shape

In [None]:
X_train_key.shape

In [None]:
# Use the model to make predictions
predicted_result = xgb_model_key_normalized.predict(testmodel_key_norm_df)

print(len(predicted_result))
print(test_df.shape)

In [None]:
# Pair every test Id with its predicted label and order the rows by Id.
submission_columns = {'Id': test_df['Id'], 'income_morethan_50K': predicted_result}
my_submission = pd.DataFrame(submission_columns).sort_values(by=['Id'])
my_submission.head()

# Write the ordered table without the index column.
my_submission.to_csv('XGboost_Key_Normalized.csv', index=False)

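It turned out that normalization doesn't influence the performance of the model. This is expected: tree-based models such as XGBoost split on feature thresholds, so monotonic transformations like log and min-max scaling do not change the splits they can find. Therefore we try a different method to improve the model.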

In [None]:
# Recreate the 60/20/20 train/validation/test split of the all-features "Original"
# view; X_train / y_train etc. were rebound by the experiments in between.
X_train, X_test, y_train, y_test = train_test_split(X_Original, y, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)

In [None]:
# All-features model with a lower learning rate (0.15) and a larger round budget
# (600) than the first all-features model.
model = xgb.XGBClassifier(n_estimators = 600, 
                             learning_rate = 0.15, 
                             max_depth = 3,
                             objective='binary:logistic', 
                             use_label_encoder=False)
# Early stopping on validation log-loss (stop after 100 rounds without improvement).
# NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric on
# the estimator constructor instead of fit() - verify against the installed version.
model.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])

# make predictions for test data and evaluate
y_pred = model.predict(X_test)

In [None]:
# Optional experiment, disabled by default: refit on progressively smaller feature
# sets, using each feature importance above 0.01 as the selection threshold.
# It is guarded by a flag rather than wrapped in a string literal, so the cell no
# longer echoes the code back as its output and the code stays syntax-checked.
RUN_THRESHOLD_SWEEP = False  # set to True to run; trains one model per threshold

if RUN_THRESHOLD_SWEEP:
    # Thresholds come from the same fitted model that SelectFromModel uses below.
    thresholds = np.sort(model.feature_importances_[model.feature_importances_ > 0.01])
    for thresh in thresholds:
        print(thresh)
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        # train model
        selection_model = xgb.XGBClassifier()
        selection_model.fit(select_X_train, y_train)
        # eval model
        select_X_test = selection.transform(X_test)
        select_y_pred = selection_model.predict(select_X_test)
        # Score the predictions just made (the old code passed y_pred_key, the stale
        # output of the key-variable model).
        classifier_eval(y_test, select_y_pred)

# DNN Model

In order to find the important variables to train on, we use the feature importances of the original model trained with all variables.

In [None]:
# Plot feature importance, limited to as many features as have an importance
# score above 0.01.
importances = xgb_model_original.feature_importances_
n_important = int((importances > 0.01).sum())
xgb.plot_importance(xgb_model_original, max_num_features=n_important)
plt.show()

In [None]:
feature_weight = xgb_model_original.get_booster().get_score(importance_type='weight')

In [None]:
feature_weight

'CAPGAIN': 82,

'DIVVAL': 72,

'WKSWORK': 35,

'ASEX__Female': 24,

'CAPLOSS': 64,

'ADTOCC': 90,

'AAGE': 101,

'AHGA__Masters degree(MA MS MEng MEd MSW MBA)': 16,

'AHGA__Prof school degree (MD DDS DVM LLB JD)': 10,

'NOEMP': 37,

'AHGA__Doctorate degree(PhD EdD)': 12,

'AHGA__Bachelors degree(BA AB BS)': 11,

'ADTIND': 53,

'AHRSPAY': 30,

'AMARITL__Married-civilian spouse present': 16,

'AMJOCC__Precision production craft & repair': 10,

We extracted the features with a weight equal to or larger than 10. We will use these variables to train the deep learning model.

In [None]:
X_Original.columns

In [None]:
# Training frame restricted to the 16 features listed in the markdown above
# (those with an importance weight of at least 10 in the all-features XGBoost model).
X_hfeatures = X_Original[['CAPGAIN','DIVVAL','WKSWORK','ASEX__Female',
                        'CAPLOSS','ADTOCC','AAGE','AHGA__Masters degree(MA MS MEng MEd MSW MBA)',
                        'AHGA__Prof school degree (MD DDS DVM LLB JD)','NOEMP','AHGA__Doctorate degree(PhD EdD)',
                        'AHGA__Bachelors degree(BA AB BS)','ADTIND','AHRSPAY','AMARITL__Married-civilian spouse present',
                        'AMJOCC__Precision production craft & repair']]
X_hfeatures.head()

In [None]:
X_hfeatures.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_hfeatures, y, test_size = 0.2, random_state = 3400)

In [None]:
# Small fully connected binary classifier over the selected features.
# NOTE(review): the inputs are fed unscaled - consider standardising them first.
model = Sequential()
# `input_shape` describes ONE sample and must not include the number of rows; Keras
# adds the batch dimension itself. The feature count is read from the data instead
# of being hard-coded as (n_rows, n_features).
model.add(Dense(16, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=['Precision',
                      'Recall',
                      'AUC',
                       'BinaryAccuracy'])

# Keras holds back 30% of the training rows for per-epoch validation.
model.fit(X_train, y_train, validation_split=0.3, epochs=10, verbose=1)

# evaluate() returns the loss followed by the compiled metrics.
loss_and_metrics = model.evaluate(X_test, y_test, verbose=0)

print(loss_and_metrics)

In [None]:
# Select from the test frame exactly the columns the network was trained on. The
# names are read from the training frame `X_hfeatures`, so the train and test
# feature lists cannot drift apart.
X_hfeatures_test = testmodel_original_df[list(X_hfeatures.columns)]
X_hfeatures_test.shape
# Sigmoid outputs for every test row; thresholded into labels in the next cell.
y_predict = model.predict(X_hfeatures_test)

In [None]:
# Turn the sigmoid outputs into hard labels: 1 where the score exceeds 0.5, else 0.
# The comparison is vectorised and the result flattened to one label per row.
binary_encoded = (np.asarray(y_predict) > 0.5).astype(int).ravel()

In [None]:
# Pair every test Id with its predicted label and order the rows by Id.
submission_columns = {'Id': test_df['Id'], 'income_morethan_50K': binary_encoded}
my_submission = pd.DataFrame(submission_columns).sort_values(by=['Id'])
my_submission.head()

# Write the ordered table without the index column.
my_submission.to_csv('DNN.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_Original, y, test_size = 0.2, random_state = 3400)

In [None]:
# Fully connected binary classifier over all one-hot encoded features.
# NOTE(review): the inputs are fed unscaled - consider standardising them first.
model = Sequential()
# `input_shape` describes ONE sample and must not include the number of rows; Keras
# adds the batch dimension itself. The feature count is read from the data instead
# of being hard-coded as (n_rows, n_features).
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=['Precision',
                      'Recall',
                      'AUC',
                       'BinaryAccuracy'])

# Keras holds back 30% of the training rows for per-epoch validation.
model.fit(X_train, y_train, validation_split=0.3, epochs=10, verbose=1)

# evaluate() returns the loss followed by the compiled metrics.
loss_and_metrics = model.evaluate(X_test, y_test, verbose=0)

print(loss_and_metrics)

In [None]:
# Score every test row with the network, then turn the sigmoid outputs into hard
# labels: 1 where the score exceeds 0.5, else 0 (one label per row).
y_predict = model.predict(testmodel_original_df)
binary_encoded = (np.asarray(y_predict) > 0.5).astype(int).ravel()

In [None]:
# Pair every test Id with its predicted label and order the rows by Id.
submission_columns = {'Id': test_df['Id'], 'income_morethan_50K': binary_encoded}
my_submission = pd.DataFrame(submission_columns).sort_values(by=['Id'])
my_submission.head()

# Write the ordered table without the index column.
my_submission.to_csv('DNN_Original.csv', index=False)