In [1]:
# Basic Setup
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Load the training split. The file is read with header=None, so the
# column names are attached explicitly right after loading.
training_data = pd.read_csv('adult.data', header=None, sep=',')
attributes_names = [
    'Age',
    'Workclass',
    'Fnlwgt',
    'Education',
    'Education_Num',
    'Marital_Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital_Gain',
    'Capital_Loss',
    'Hours_Per_Week',
    'Native_Country',
    'Income',            # target column, consumed by name further down
]
training_data.columns = attributes_names



Prediction:

In [3]:
def rearrange_data(data, corresponding_attributes):
    '''
    Encode a raw data frame as integer arrays for the classifiers.

    Every column except the last one is treated as an input attribute.
    Attributes at positions 1, 3, 5-9 and 13 are categorical: each distinct
    (whitespace-stripped) value is replaced by an integer code assigned in
    order of first appearance. All other attributes are copied as integers.
    Rows holding '?' in any categorical attribute are dropped from the result.

    Parameters:
        data: pandas DataFrame whose last column is the target and which
              has a column named 'Income' holding the class label text.
        corresponding_attributes: dict that is filled in place with
              {attribute position: {category text: integer code}}.

    Return:
        1. inputs                    - int array, one row per kept record
        2. corresponding outputs     - int array, 0 for '<=50K' and 1 otherwise
        3. corresponding_attributes  - the same dict object that was passed in
    '''
    categorical_positions = {1, 3, 5, 6, 7, 8, 9, 13}

    num_data = len(data)
    # Take the layout from the frame itself rather than from a notebook
    # global, so the function works on any frame handed to it.
    num_attributes = len(data.columns) - 1
    inputs = np.zeros((num_data, num_attributes), dtype='int')
    has_unknown = np.zeros(num_data, dtype=bool)     # rows containing '?'

    for j in range(num_attributes):
        # Positional access: independent of the frame's index labels and far
        # cheaper than a label lookup per cell.
        column = data.iloc[:, j].to_numpy()
        if j not in categorical_positions:
            inputs[:, j] = column
            continue

        codes = {}
        for i, raw_value in enumerate(column):
            value = raw_value.strip()
            if value == '?':
                has_unknown[i] = True
                continue
            # Codes are handed out in order of first appearance; rows that
            # end up dropped still take part in that ordering.
            inputs[i, j] = codes.setdefault(value, len(codes))
        if codes:
            corresponding_attributes[j] = codes

    # Anything other than '<=50K' counts as the positive class.
    class_labels = np.array(
        [0 if label.strip() == '<=50K' else 1 for label in data['Income'].to_numpy()],
        dtype='int',
    )

    keep = ~has_unknown                              # drop rows with an unknown value ('?')
    return inputs[keep], class_labels[keep], corresponding_attributes


# Encode the training split. The dict starts empty and comes back holding,
# per categorical attribute position, the {category text: integer code} table.
corresponding_attributes = dict()
(training_inputs,
 training_class_labels,
 corresponding_attributes) = rearrange_data(training_data, corresponding_attributes)
print(corresponding_attributes)

{1: {'State-gov': 0, 'Self-emp-not-inc': 1, 'Private': 2, 'Federal-gov': 3, 'Local-gov': 4, 'Self-emp-inc': 5, 'Without-pay': 6, 'Never-worked': 7}, 3: {'Bachelors': 0, 'HS-grad': 1, '11th': 2, 'Masters': 3, '9th': 4, 'Some-college': 5, 'Assoc-acdm': 6, 'Assoc-voc': 7, '7th-8th': 8, 'Doctorate': 9, 'Prof-school': 10, '5th-6th': 11, '10th': 12, '1st-4th': 13, 'Preschool': 14, '12th': 15}, 5: {'Never-married': 0, 'Married-civ-spouse': 1, 'Divorced': 2, 'Married-spouse-absent': 3, 'Separated': 4, 'Married-AF-spouse': 5, 'Widowed': 6}, 6: {'Adm-clerical': 0, 'Exec-managerial': 1, 'Handlers-cleaners': 2, 'Prof-specialty': 3, 'Other-service': 4, 'Sales': 5, 'Craft-repair': 6, 'Transport-moving': 7, 'Farming-fishing': 8, 'Machine-op-inspct': 9, 'Tech-support': 10, 'Protective-serv': 11, 'Armed-Forces': 12, 'Priv-house-serv': 13}, 7: {'Not-in-family': 0, 'Husband': 1, 'Wife': 2, 'Own-child': 3, 'Unmarried': 4, 'Other-relative': 5}, 8: {'White': 0, 'Black': 1, 'Asian-Pac-Islander': 2, 'Amer-Ind

In [4]:
def load_test_data(corresponding_attributes):
    '''
    Read 'adult.test' and encode it with the category codes learned earlier.

    The first line of the file is skipped. Each following line is split on
    commas; the last field is the class label and the rest are attributes.
    Reading stops at the first blank line or at the end of the file.
    Rows with '?' in any attribute listed in corresponding_attributes are
    skipped.

    Parameters:
        corresponding_attributes: dict of
              {attribute position: {category text: integer code}}.

    Return:
        1. inputs           - int array, one row per kept record
        2. expected outputs - int array, 0 for '<=50K' and 1 otherwise

    Raises:
        ValueError: if a categorical value has no code in
              corresponding_attributes, or a numeric field is not an integer.
    '''
    inputs, expected_output = [], []
    with open('adult.test', 'r') as myfile:
        myfile.readline()                             # skip the first line
        for line_number, raw_line in enumerate(myfile, start=2):
            # Strip the line ending and the trailing '.' explicitly instead of
            # chopping a fixed two characters, which eats into the label when
            # the final newline or the '.' is absent.
            line = raw_line.strip().rstrip('.').rstrip()
            if not line:
                break

            fields = [field.strip() for field in line.split(',')]
            attribute_fields, output_label = fields[:-1], fields[-1]

            if any(attribute_fields[index] == '?' for index in corresponding_attributes):
                continue                              # unknown value: drop the row

            encoded_row = []
            for index, field in enumerate(attribute_fields):
                codes = corresponding_attributes.get(index)
                if codes is None:
                    encoded_row.append(int(field))    # numeric attribute
                elif field in codes:
                    encoded_row.append(codes[field])
                else:
                    # Fail here with a readable message rather than later
                    # inside np.array with "invalid literal for int()".
                    raise ValueError(
                        'adult.test line %d: value %r of attribute %d has no code '
                        'in corresponding_attributes' % (line_number, field, index)
                    )

            inputs.append(encoded_row)
            expected_output.append(0 if output_label == '<=50K' else 1)
    return np.array(inputs, dtype='int'), np.array(expected_output, dtype='int')

# Encode the test split with the code tables produced from the training split,
# so both splits share the same integer codes.
(testing_inputs,
 testing_output) = load_test_data(corresponding_attributes)


Naive Bayes Classification

In [5]:
# Naive Bayes Classification: fit a Gaussian naive Bayes model on the encoded
# training split, then report accuracy on the training and the test split.
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy = 1 - (share of rows whose prediction is wrong)
estimated_training_output = naive_bayes_classifier.predict(training_inputs)
num_misclassified = np.count_nonzero(estimated_training_output != training_class_labels)
print('Accuracy:\t', 1-(num_misclassified / len(estimated_training_output)))

# Testing_data accuracy, computed the same way on the test split
estimated_testing_output = naive_bayes_classifier.predict(testing_inputs)
num_misclassified = np.count_nonzero(estimated_testing_output != testing_output)
print('Accuracy :\t', 1-(num_misclassified / len(estimated_testing_output)))

Accuracy:	 0.7888071082819441
Accuracy :	 0.7887782204515272


Support Vector Machine (SVM)

In [6]:
# Support vector classifier with scikit-learn's default settings, fitted on the
# encoded training split.
# NOTE(review): the attributes are passed in unscaled; consider standardizing
# them before fitting - confirm whether that is intended.
svm_classifier = SVC()
svm_classifier.fit(training_inputs, training_class_labels)

# Training_data accuracy = 1 - (share of rows whose prediction is wrong)
estimated_training_output = svm_classifier.predict(training_inputs)
num_misclassified = np.count_nonzero(estimated_training_output != training_class_labels)
print('Accuracy:\t', 1-(num_misclassified / len(estimated_training_output)))

# Testing_data accuracy, computed the same way on the test split
estimated_testing_output = svm_classifier.predict(testing_inputs)
num_misclassified = np.count_nonzero(estimated_testing_output != testing_output)
print('Accuracy :\t', 1-(num_misclassified / len(estimated_testing_output)))

Accuracy:	 0.7894038856839732
Accuracy :	 0.7908366533864541
