### Load processed Data

In [1]:
### Load processed Data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df =pd.read_csv('eda_processed_adult.csv')

In [2]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

### Get dummies

In [8]:
df_model = df[['workclass', 'relationship', 'education', 'marital-status', 'occupation', 'race', 'gender', 'native-country', 'income']]

In [9]:
df_dum = pd.get_dummies(df_model)
df_dum.head()

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,relationship_Husband,relationship_Not-in-family,...,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income_<=50K,income_>50K
0,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
1,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,True,False
2,False,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
3,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False


### Train the model

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X = df_dum.drop('income_>50K', axis=1)
y = df_dum['income_>50K']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 1.0


In [12]:
from sklearn.ensemble import RandomForestClassifier

# Train the RandomForest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Calculate the accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')

Random Forest Accuracy: 1.0


In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Train the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gb_model.predict(X_test)

# Calculate the accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {accuracy_gb}')

Gradient Boosting Accuracy: 1.0


In [16]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Rename columns to remove invalid characters
X_train.columns = [col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') for col in X_train.columns]
X_test.columns = [col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') for col in X_test.columns]

# Train the SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'SVM Accuracy: {accuracy_svm}')

# Train the KNN model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'KNN Accuracy: {accuracy_knn}')

# Train the XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'XGBoost Accuracy: {accuracy_xgb}')

SVM Accuracy: 1.0
KNN Accuracy: 0.9718179954908793
XGBoost Accuracy: 1.0


In [17]:
from sklearn.linear_model import Lasso

# Train the Lasso regression model
lasso_model = Lasso(alpha=0.1, max_iter=1000)
lasso_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lasso = lasso_model.predict(X_test)

# Convert predictions to binary outcomes
y_pred_lasso_binary = [1 if pred >= 0.5 else 0 for pred in y_pred_lasso]

# Calculate the accuracy
accuracy_lasso = accuracy_score(y_test, y_pred_lasso_binary)
print(f'Lasso Regression Accuracy: {accuracy_lasso}')

Lasso Regression Accuracy: 1.0


In [21]:
from sklearn.linear_model import LinearRegression

# Train the multiple linear regression model
mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)

# Predict on the test set
y_pred_mlr = mlr_model.predict(X_test)

# Convert predictions to binary outcomes
y_pred_mlr_binary = [1 if pred >= 0.5 else 0 for pred in y_pred_mlr]

# Calculate the accuracy
accuracy_mlr = accuracy_score(y_test, y_pred_mlr_binary)
print(f'Multiple Linear Regression Accuracy: {accuracy_mlr}')

Multiple Linear Regression Accuracy: 1.0


In [22]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Cross-Validation Accuracy: {best_score}')

Best Parameters: {'C': 0.1, 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 1.0


In [23]:
from sklearn.ensemble import VotingClassifier

# Create a voting classifier with the individual models
voting_clf = VotingClassifier(estimators=[
    ('log_reg', model),
    ('rf', rf_model),
    ('gb', gb_model),
    ('svm', svm_model),
    ('knn', knn_model),
    ('xgb', xgb_model)
], voting='hard')

# Train the voting classifier
voting_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_voting = voting_clf.predict(X_test)

# Calculate the accuracy
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f'Ensemble Voting Classifier Accuracy: {accuracy_voting}')

Ensemble Voting Classifier Accuracy: 1.0
