In [389]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix



args = Namespace(                               # Create key, value pairs, and access values later on, e.g., args.seed
    raw_train_dataset_csv="data/adultdata.csv",
    raw_test_dataset_csv="data/adulttest.csv",
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_traindata_csv="data/training_data.csv",
    output_munged_testdata_csv="data/testing_data.csv",
)


train_data = pd.read_csv(args.raw_train_dataset_csv, na_values=['?','NaN', 'Na','N/A'], header=None, skipinitialspace=True)
test_data = pd.read_csv(args.raw_test_dataset_csv, na_values=['?','NaN', 'Na','N/A'], header=None, skipinitialspace=True)
train_data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [390]:
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [391]:
train_unknown_rows = train_data[train_data.isnull().any(axis=1)]
test_unknown_rows = test_data[test_data.isnull().any(axis=1)]
train_unknown_rows.head()
test_unknown_rows.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,|1x3 Cross validator,,,,,,,,,,,,,,
5,18,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.
7,29,,227026.0,HS-grad,9.0,Never-married,,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K.
14,58,,299831.0,HS-grad,9.0,Married-civ-spouse,,Husband,White,Male,0.0,0.0,35.0,United-States,<=50K.
20,40,Private,85019.0,Doctorate,16.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0.0,0.0,45.0,,>50K.


In [392]:
#check for missing data, to see if we can just drop the data rows if not many
print("total number of train data rows:", len(train_data))
print("total number of unknown train data rows:", len(train_unknown_rows))
print("total number of test data rows:", len(test_data))
print("total number of unknown test data rows:", len(test_unknown_rows))

total number of train data rows: 32561
total number of unknown train data rows: 2399
total number of test data rows: 16282
total number of unknown test data rows: 1222


In [393]:
#drop the data rows with missing data
train_data.dropna(axis=0, inplace=True)
print("total number of train data rows after dropping:", len(train_data))
test_data.dropna(axis=0, inplace=True)
print("total number of tesr data rows after dropping:", len(test_data))

total number of train data rows after dropping: 30162
total number of tesr data rows after dropping: 15060


In [394]:
#add column names to the dataset
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
    'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
]
# Create a Pandas DataFrame with the data and column names
train_data.columns = column_names
test_data.columns = column_names
train_data.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [395]:
test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
6,34,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K.


In [396]:
#train data income mapping
income_mapping = {'<=50K': 0, '>50K': 1}
train_data['income'] = train_data['income'].map(income_mapping)
#rename the income column to 'income_over_5k'
train_data.rename(columns={'income': 'income_over_5k'}, inplace=True)

#test data income mapping
income_mapping = {'<=50K.': 0, '>50K.': 1}
test_data['income'] = test_data['income'].map(income_mapping)
#rename the income column to 'income_over_5k'
test_data.rename(columns={'income': 'income_over_5k'}, inplace=True)

#train data country mapping
country_mapping = {'United-States': 1}
train_data['native_country'] = train_data['native_country'].map(country_mapping).fillna(0).astype(int)
# Rename the 'native_country' column to 'united_states'
train_data.rename(columns={'native_country': 'native_country_united_states'}, inplace=True)

#test data country mapping
country_mapping = {'United-States': 1}
test_data['native_country'] = test_data['native_country'].map(country_mapping).fillna(0).astype(int)
# Rename the 'native_country' column to 'united_states'
test_data.rename(columns={'native_country': 'native_country_united_states'}, inplace=True)

# Drop the 'education' column
train_data.drop(columns=['education'], inplace=True)
test_data.drop(columns=['education'], inplace=True)
# Drop the 'fnlwgt' column
train_data.drop(columns=['fnlwgt'], inplace=True)
test_data.drop(columns=['fnlwgt'], inplace=True)

# List of columns to convert from float to integer
columns_to_convert = ['education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
#train_x[columns_to_convert] = X_train_encoded[columns_to_convert].astype(int)
test_data[columns_to_convert] = test_data[columns_to_convert].astype(int)


In [397]:
train_data.head()

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country_united_states,income_over_5k
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,1,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,1,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,1,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,1,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,0,0


In [398]:
# Extract features and labels
train_x = train_data.iloc[:, :-1]  # Features (all columns except the last one)
train_y = train_data.iloc[:, -1]   # Labels (the last column)
test_x = test_data.iloc[:, :-1]  # Features (all columns except the last one)
test_y = test_data.iloc[:, -1]   # Labels (the last column)

# Display the first few rows of train_x (features)
print("Features (train_x):")
print(train_x.head())

# Display the first few rows of train_y (labels)
print("\nLabels (train_y):")
print(train_y.head(20))

# Display the first few rows of test_x (features)
print("Features (test_x):")
print(test_x.head())

# Display the first few rows of test_y (labels)
print("\nLabels (test_y):")
print(test_y.head(20))
"""
# Creating a binary 'is_married' feature based on 'marital_status'
data['is_married'] = np.where(data['marital_status'].str.contains('Married'), 1, 0)

## Encoding categorical variables
#data = pd.get_dummies(data, columns=['age', 'workclass', 'fnlwgt', 'education', 'education number', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native_country'])
data = pd.get_dummies(data, columns=['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country'])

# Scaling numerical features 
scaler = StandardScaler()
numerical_features = ['capital_gain', 'capital_loss']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

"""

Features (train_x):
   age         workclass  education_num      marital_status  \
0   39         State-gov             13       Never-married   
1   50  Self-emp-not-inc             13  Married-civ-spouse   
2   38           Private              9            Divorced   
3   53           Private              7  Married-civ-spouse   
4   28           Private             13  Married-civ-spouse   

          occupation   relationship   race  gender  capital_gain  \
0       Adm-clerical  Not-in-family  White    Male          2174   
1    Exec-managerial        Husband  White    Male             0   
2  Handlers-cleaners  Not-in-family  White    Male             0   
3  Handlers-cleaners        Husband  Black    Male             0   
4     Prof-specialty           Wife  Black  Female             0   

   capital_loss  hours_per_week  native_country_united_states  
0             0              40                             1  
1             0              13                             1  


"\n# Creating a binary 'is_married' feature based on 'marital_status'\ndata['is_married'] = np.where(data['marital_status'].str.contains('Married'), 1, 0)\n\n## Encoding categorical variables\n#data = pd.get_dummies(data, columns=['age', 'workclass', 'fnlwgt', 'education', 'education number', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native_country'])\ndata = pd.get_dummies(data, columns=['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country'])\n\n# Scaling numerical features \nscaler = StandardScaler()\nnumerical_features = ['capital_gain', 'capital_loss']\ndata[numerical_features] = scaler.fit_transform(data[numerical_features])\n\n"

In [399]:
# List of remaining categorical columns to one-hot encode
categorical_columns = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'gender','education_num']

# Apply one-hot encoding to the remaining categorical columns
train_x_encoded = pd.get_dummies(train_x, columns=categorical_columns)
test_x_encoded = pd.get_dummies(test_x, columns=categorical_columns)
train_x_encoded.head()


Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,native_country_united_states,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,education_num_7,education_num_8,education_num_9,education_num_10,education_num_11,education_num_12,education_num_13,education_num_14,education_num_15,education_num_16
0,39,2174,0,40,1,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,50,0,0,13,1,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
2,38,0,0,40,1,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,False
3,53,0,0,40,1,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
4,28,0,0,40,0,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False


In [400]:
test_x_encoded.head()

Unnamed: 0,age,capital_gain,capital_loss,hours_per_week,native_country_united_states,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,education_num_7,education_num_8,education_num_9,education_num_10,education_num_11,education_num_12,education_num_13,education_num_14,education_num_15,education_num_16
1,25,0,0,40,1,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
2,38,0,0,50,1,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,False
3,28,0,0,40,1,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,44,7688,0,40,1,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
6,34,0,0,30,1,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [401]:
# Initialize and train the SVM classifier
svm_classifier = SVC()  # You can customize hyperparameters here
svm_classifier.fit(train_x_encoded, train_y)

# Predict on the test set
svm_pred_y = svm_classifier.predict(test_x_encoded)

# Evaluate the model's performance
svm_accuracy = accuracy_score(test_y, svm_pred_y)
svm_report = classification_report(test_y, svm_pred_y)
svm_confusion = confusion_matrix(test_y, svm_pred_y)

print("svm_Accuracy:", svm_accuracy)
print("svm_Classification Report:")
print(svm_report)
print("svm confusion matrix:")
print(svm_confusion)


svm_Accuracy: 0.7964143426294821
svm_Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.97      0.88     11360
           1       0.73      0.27      0.39      3700

    accuracy                           0.80     15060
   macro avg       0.77      0.62      0.64     15060
weighted avg       0.79      0.80      0.76     15060

svm confusion matrix:
[[10997   363]
 [ 2703   997]]


In [402]:

# Initialize and train the Decision Tree classifier
decision_tree_classifier = DecisionTreeClassifier()  # You can customize hyperparameters here
decision_tree_classifier.fit(train_x_encoded, train_y)

# Predict on the test set
dt_pred_y = decision_tree_classifier.predict(test_x_encoded)

# Evaluate the model's performance
dt_accuracy = accuracy_score(test_y, dt_pred_y)
dt_report = classification_report(test_y, dt_pred_y)
dt_confusion = confusion_matrix(test_y, dt_pred_y)

print("dt_Accuracy:", dt_accuracy)
print("dt_Classification Report:")
print(dt_report)
print("decision tree confusion matrix:")
print(dt_confusion)

dt_Accuracy: 0.8154050464807437
dt_Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88     11360
           1       0.63      0.61      0.62      3700

    accuracy                           0.82     15060
   macro avg       0.75      0.75      0.75     15060
weighted avg       0.81      0.82      0.81     15060

decision tree confusion matrix:
[[10023  1337]
 [ 1443  2257]]
