In [1]:
# Basic Setup
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read Data
training_data = pd.read_csv('adult.data', sep=',', header=None)
attributes_names = ['Age', 'Workclass','Fnlwgt','Education','Education_Num','Marital_Status',
                    'Occupation','Relationship','Race','Sex','Capital_Gain','Capital_Loss',
                    'Hours_Per_Week','Native_Country','Income']
training_data.columns = attributes_names
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       32561 non-null  object
 2   Fnlwgt          32561 non-null  int64 
 3   Education       32561 non-null  object
 4   Education_Num   32561 non-null  int64 
 5   Marital_Status  32561 non-null  object
 6   Occupation      32561 non-null  object
 7   Relationship    32561 non-null  object
 8   Race            32561 non-null  object
 9   Sex             32561 non-null  object
 10  Capital_Gain    32561 non-null  int64 
 11  Capital_Loss    32561 non-null  int64 
 12  Hours_Per_Week  32561 non-null  int64 
 13  Native_Country  32561 non-null  object
 14  Income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


remove useless data from dataset: 

In [3]:
object_columns = ['Workclass', 'Education', 'Marital_Status', 'Occupation',
                          'Relationship', 'Race', 'Sex', 'Native_Country', 'Income']
numerical_training_data = training_data.copy()
for str_col in object_columns:
    # remove unknown value ('?')
    training_data = training_data[training_data[str_col] != ' ?']
    numerical_training_data = numerical_training_data[numerical_training_data[str_col] != ' ?']
    # assign numerical value for categorical value
    numerical_training_data[str_col] = numerical_training_data[str_col].astype('category').cat.codes

print('Number of data in dataset after removing unkown value:\t', len(numerical_training_data))

# record numerical value for categorical value in each object attribute
corresponding_attributes = {}
for j in (1, 3, 5, 6, 7, 8, 9, 13):
    dic, numerical_list = {}, list(numerical_training_data[attributes_names[j]])
    for i, temp_str in enumerate(list(training_data[attributes_names[j]])):
        if temp_str.strip() not in dic:
            dic[temp_str.strip()] = numerical_list[i]
    corresponding_attributes[j] = dic
# print(corresponding_attributes)

Number of data in dataset after removing unkown value:	 30162


Prediction:

In [4]:
def load_test_data(corresponding_attributes):
    inputs, expected_output = [],[]
    # print(corresponding_attributes)
    with open('adult.test', 'r') as myfile:
        myfile.readline()         # flush the first line
        line = myfile.readline()[:-2]      # ignore '.' and '\n'
        while line and line != '\n':
            temp_list = line.split(', ')
            inputs_list, output_label = temp_list[:-1], temp_list[-1]
            check_for_question_mark = False
            for index, dic in corresponding_attributes.items():
                if inputs_list[index].strip() == '?':
                    check_for_question_mark = True
                    break
                if inputs_list[index].strip() in dic:
                    inputs_list[index] = dic[inputs_list[index]]  
            if check_for_question_mark:
                line = myfile.readline()[:-2]
                continue
            if output_label.strip() == '<=50K':
                expected_output.append(0)
            else:
                expected_output.append(1)
            inputs.append(inputs_list)
            line = myfile.readline()[:-2]
    return np.array(inputs, dtype='int'), np.array(expected_output, dtype='int')

training_inputs, training_class_labels = numerical_training_data.drop(columns=['Income']).to_numpy(dtype='int'), numerical_training_data['Income'].to_numpy(dtype='int')
testing_inputs, testing_output = load_test_data(corresponding_attributes)

# normalize data, ensure the range for each attribute are same
training_inputs = StandardScaler().fit_transform(training_inputs)
testing_inputs = StandardScaler().fit_transform(testing_inputs)

Naive Bayes Classifcation

In [5]:
# Naive Bayes Classification
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy
estimated_training_output = naive_bayes_classifier.predict(training_inputs)
print('Accuracy on training dataset (Naive-Bayes):\t', naive_bayes_classifier.score(training_inputs, training_class_labels))

# Testing_data accuracy
estimated_testing_output = naive_bayes_classifier.predict(testing_inputs)
print('Accuracy on testing dataset (Naive-Bayes):\t', naive_bayes_classifier.score(testing_inputs, testing_output))

Accuracy on training dataset (Naive-Bayes):	 0.7982560838140706
Accuracy on testing dataset (Naive-Bayes):	 0.8


Support Vector Machine(SVM)

In [6]:
svm_classifier = SVC() #'RBF' as default model
svm_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy
print('Accuracy on training dataset (SVM):\t', svm_classifier.score(training_inputs, training_class_labels))

# Testing_data accuracy
print('Accuracy on testing dataset (SVM):\t', svm_classifier.score(testing_inputs, testing_output))

Accuracy on training dataset (SVM):	 0.8529938333001791
Accuracy on testing dataset (SVM):	 0.846215139442231


Linear SVM

In [7]:
svm_classifier = LinearSVC(random_state=0, tol=1e-5, max_iter=10000)
svm_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy
print('Accuracy on training dataset (Linear SVM):\t', svm_classifier.score(training_inputs, training_class_labels))

# Testing_data accuracy
print('Accuracy on testing dataset (Linear SVM):\t', svm_classifier.score(testing_inputs, testing_output))

Accuracy on training dataset (Linear SVM):	 0.8192759100855381
Accuracy on testing dataset (Linear SVM):	 0.8204515272244356


Logistic Regression

In [8]:
logistic_classifier = LogisticRegression(random_state=0)
logistic_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy
print('Accuracy on training dataset (Logistic Regression):\t', logistic_classifier.score(training_inputs, training_class_labels))

# Testing_data accuracy
print('Accuracy on testing dataset (Logistic Regression):\t', logistic_classifier.score(testing_inputs, testing_output))

Accuracy on training dataset (Logistic Regression):	 0.820370001989258
Accuracy on testing dataset (Logistic Regression):	 0.8199867197875166


K-nearest_neighboor (KNN)

In [None]:
knn_classifier = KNeighborsClassifier(n_neighbors=100, weights='distance')
knn_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy
print('Accuracy on training dataset (KNN):\t', knn_classifier.score(training_inputs, training_class_labels))

# Testing_data accuracy
print('Accuracy on testing dataset (KNN):\t', knn_classifier.score(testing_inputs, testing_output))

Decision Tree

In [None]:
dt_classifier = DecisionTreeClassifier(random_state=0, splitter='best', criterion='entropy')
dt_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy
print('Accuracy on training dataset (Decision Tree):\t', dt_classifier.score(training_inputs, training_class_labels))

# Testing_data accuracy
print('Accuracy on testing dataset (Decision Tree):\t', dt_classifier.score(testing_inputs, testing_output))