In [1]:
"""
Ensemble Learning on Adult Dataset.

@2020 Created by Vihang Garud.

"""

# Importing required libraries

import os
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

from tabulate import tabulate

from mlens.ensemble import SuperLearner

[MLENS] backend: threading


In [2]:
"""
Importing the datasets

"""

# Column labels, in file order, shared by both CSV files (neither file is
# read with a header row of its own).
column_names = [
    'Age',
    'Work_Class',
    'Fnlwgt',
    'Education',
    'Education_Num',
    'Marital_Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital_Gain',
    'Capital_Loss',
    'Hours_per_week',
    'Native_Country',
    'Salary',
]

# Load the two splits. header=None only spells out what pandas already does
# when `names` is supplied. The first line of adult.test is skipped
# (presumably a non-data line — confirm against the file).
train_data = pd.read_csv('adult.data', header=None, names=column_names)
test_data = pd.read_csv('adult.test', header=None, names=column_names, skiprows=1)

In [3]:
"""
Preprocessing the train dataset

- Dropping unnecessary columns
- Mapping binary valued columns
- Dropping missing values
- One Hot Encoding

"""
# Clean the training frame in a single chain rather than editing the loaded
# frame in place:
#   1. discard the Education_Num and Native_Country columns,
#   2. encode the two binary columns (Salary, Sex) as 0/1,
#   3. turn ' ?' placeholders into NaN and drop every row containing a NaN,
#   4. one-hot encode the categorical columns (pd.get_dummies defaults).
# A label missing from a mapping in step 2 becomes NaN, so that row is also
# removed in step 3.
train_data = (
    train_data
    .drop(columns=['Education_Num', 'Native_Country'])
    .assign(
        Salary=lambda frame: frame['Salary'].map({' <=50K': 0, ' >50K': 1}),
        Sex=lambda frame: frame['Sex'].map({' Female': 0, ' Male': 1}),
    )
    .replace({' ?': np.nan})
    .dropna()
    .pipe(pd.get_dummies)
)

# Move the class label to the last column: pop it out, then append it again.
salary = train_data.pop('Salary')
train_data = train_data.assign(Salary=salary)

In [4]:
"""
Preprocessing the test dataset

- Dropping unnecessary columns
- Mapping binary valued columns
- Dropping missing values
- One Hot Encoding

"""
# Clean the test frame in a single chain rather than editing the loaded
# frame in place:
#   1. discard the Education_Num and Native_Country columns,
#   2. encode the two binary columns (Salary, Sex) as 0/1 — note the Salary
#      labels mapped here carry a trailing '.',
#   3. turn ' ?' placeholders into NaN and drop every row containing a NaN,
#   4. one-hot encode the categorical columns (pd.get_dummies defaults).
# A label missing from a mapping in step 2 becomes NaN, so that row is also
# removed in step 3.
test_data = (
    test_data
    .drop(columns=['Education_Num', 'Native_Country'])
    .assign(
        Salary=lambda frame: frame['Salary'].map({' <=50K.': 0, ' >50K.': 1}),
        Sex=lambda frame: frame['Sex'].map({' Female': 0, ' Male': 1}),
    )
    .replace({' ?': np.nan})
    .dropna()
    .pipe(pd.get_dummies)
)

# Move the class label to the last column: pop it out, then append it again.
salary = test_data.pop('Salary')
test_data = test_data.assign(Salary=salary)

In [5]:
# Readying train and test sets

# Name of the class-label column in both preprocessed frames.
TARGET = 'Salary'

# Select the label by name instead of by position, so the split does not
# depend on where the label column happens to sit.
X_train = train_data.drop(columns=TARGET)
Y_train = train_data[TARGET]

# The two frames were one-hot encoded independently, so nothing guarantees
# that they expose the same dummy columns in the same order. Reindex the test
# features onto the training columns: a category never seen in the test file
# becomes an all-zero column, and a column unknown to the training frame is
# discarded. When the column sets already agree this is a no-op.
X_test = test_data.drop(columns=TARGET).reindex(columns=X_train.columns, fill_value=0)
Y_test = test_data[TARGET]

## Individual Classifiers

In [6]:
"""
Calculating Accuracies for individual classifiers

- KNeighbors Classifier
- Naive Bayes Classifier
- Support Vector Classifier
- Decision Tree Classifier
- Random Forest Classifier
- Adaboost Classifier
- Gradient Boosting Classifier
- Linear Discriminant Analysis Classifier
- Multilayer Perceptron Classifier
- Logistic Regression Classifier

"""

# Each classifier below is fitted on (X_train, Y_train) and then used to
# predict X_test; the *_Y_pred arrays are scored in the next cell.
#
# Estimators with internal randomness are given random_state=0 (the same seed
# the SuperLearner in this notebook is built with) so that Restart & Run All
# reproduces the accuracy table.
#
# NOTE(review): the features are passed in unscaled. KNN, SVC and MLP are
# sensitive to feature scale — consider standardising before fitting them.

# KNeighbors Classifier Predictions

knn_clf = KNeighborsClassifier(n_neighbors=15, weights='distance', p=1)
knn_clf.fit(X_train, Y_train)
knn_Y_pred = knn_clf.predict(X_test)

# Naive Bayes Classifier Predictions

nb_clf = GaussianNB()
nb_clf.fit(X_train, Y_train)
nb_Y_pred = nb_clf.predict(X_test)

# Support Vector Classifier Predictions

sv_clf = SVC(C=1000)
sv_clf.fit(X_train, Y_train)
sv_Y_pred = sv_clf.predict(X_test)

# Decision Tree Classifier Predictions

dt_clf = DecisionTreeClassifier(criterion='entropy', min_samples_split=300, max_leaf_nodes=200,
                                random_state=0)
dt_clf.fit(X_train, Y_train)
dt_Y_pred = dt_clf.predict(X_test)

# Random Forest Classifier Predictions
# n_jobs=-1 uses all available processors instead of a hard-coded 100 jobs.

rf_clf = RandomForestClassifier(n_estimators=300, min_samples_split=400, n_jobs=-1, random_state=0)
rf_clf.fit(X_train, Y_train)
rf_Y_pred = rf_clf.predict(X_test)

# Adaboost Classifier Predictions

ada_clf = AdaBoostClassifier(n_estimators=400, random_state=0)
ada_clf.fit(X_train, Y_train)
ada_Y_pred = ada_clf.predict(X_test)

# Gradient Boosting Classifier Predictions

gb_clf = GradientBoostingClassifier(learning_rate=0.9, min_samples_split=400, random_state=0)
gb_clf.fit(X_train, Y_train)
gb_Y_pred = gb_clf.predict(X_test)

# Linear Discriminant Analysis Classifier Predictions

lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train, Y_train)
lda_Y_pred = lda_clf.predict(X_test)

# Multilayer Perceptron Classifier Predictions

mlp_clf = MLPClassifier(random_state=0)
mlp_clf.fit(X_train, Y_train)
mlp_Y_pred = mlp_clf.predict(X_test)

# Logistic Regression Classifier Predictions

lr_clf = LogisticRegression(solver='liblinear', penalty='l1', random_state=0)
lr_clf.fit(X_train, Y_train)
lr_Y_pred = lr_clf.predict(X_test)

In [7]:
# Printing all the classifier accuracies

# (display label, predictions on X_test) for every individual classifier,
# in the order the rows appear in the table.
predictions = [
    ('K-Neighbors', knn_Y_pred),
    ('Naive Bayes', nb_Y_pred),
    ('SVM', sv_Y_pred),
    ('Decision Tree', dt_Y_pred),
    ('Random Forest', rf_Y_pred),
    ('Adaboost', ada_Y_pred),
    ('Gradient Boosting', gb_Y_pred),
    ('LDA', lda_Y_pred),
    ('MLP', mlp_Y_pred),
    ('Logistic Regression', lr_Y_pred),
]

# One [label, accuracy in percent] row per classifier, scored against Y_test.
accuracy_rows = [[label, accuracy_score(Y_test, y_pred) * 100] for label, y_pred in predictions]

print(tabulate(accuracy_rows, headers=['Algorithms', 'Accuracy (%)'], tablefmt='psql'))

+---------------------+----------------+
| Algorithms          |   Accuracy (%) |
|---------------------+----------------|
| K-Neighbors         |        79.3275 |
| Naive Bayes         |        78.8116 |
| SVM                 |        79.223  |
| Decision Tree       |        84.9429 |
| Random Forest       |        85.7525 |
| Adaboost            |        86.6732 |
| Gradient Boosting   |        86.3532 |
| LDA                 |        83.8459 |
| MLP                 |        77.016  |
| Logistic Regression |        84.8253 |
+---------------------+----------------+


## Performing Ensemble Learning

In [8]:
"""
Performing ensemble learning

- Creating a model list
- Creating the ensembler

"""

# Base learners for the first layer of the ensemble: fresh, unfitted copies of
# the individual classifiers with the same hyper-parameters.
# Estimators with internal randomness are seeded with random_state=0 (matching
# the SuperLearner's own seed) so repeated runs give the same ensemble, and the
# random forest uses n_jobs=-1 (all available processors) instead of a
# hard-coded 100 jobs.
models = [
    KNeighborsClassifier(n_neighbors=15, weights='distance', p=1),
    GaussianNB(),
    SVC(C=1000),
    DecisionTreeClassifier(criterion='entropy', min_samples_split=300, max_leaf_nodes=200,
                           random_state=0),
    RandomForestClassifier(n_estimators=300, min_samples_split=400, n_jobs=-1, random_state=0),
    AdaBoostClassifier(n_estimators=400, random_state=0),
    GradientBoostingClassifier(learning_rate=0.9, min_samples_split=400, random_state=0),
    LinearDiscriminantAnalysis(),
    MLPClassifier(random_state=0),
    LogisticRegression(solver='liblinear', penalty='l1', random_state=0),
]

# 10-fold, shuffled super learner scored with accuracy; sample_size is set to
# the number of training rows.
ensemble = SuperLearner(scorer=accuracy_score, folds=10, shuffle=True, sample_size=len(X_train), random_state=0)
ensemble.add(models)

# The meta learner combines the base learners' predictions. Left as the last
# expression so the configured ensemble is displayed as the cell output.
ensemble.add_meta(AdaBoostClassifier(n_estimators=400, random_state=0))

SuperLearner(array_check=None, backend=None, folds=10,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=0, shuffle=True,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=10, raise_on_excep...A54E0D0>)],
   n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=0, sample_size=30718,
       scorer=<function accuracy_score at 0x000001A4FA54E0D0>,
       shuffle=True, verbose=False)

In [9]:
"""
Performing ensemble learning

- Fitting the ensemble model
- Predicting the values
- Printing the accuracy of the ensemble model

"""

# The ensemble is fitted and queried with the underlying arrays (.values)
# rather than the pandas objects themselves.
ensemble.fit(X_train.values, Y_train.values)
ensemble_Y_pred = ensemble.predict(X_test.values)

# Test-set accuracy of the stacked model, as a percentage.
ensemble_accuracy = accuracy_score(Y_test, ensemble_Y_pred) * 100
print('Accuracy using Ensemble Learning : {:.2f}'.format(ensemble_accuracy), '%')

Accuracy using Ensemble Learning : 86.78 %
