## 1. Import Libraries 

In [59]:
# Import Lib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.decomposition import PCA 
from sklearn.ensemble import RandomForestClassifier
import csv

from xgboost import XGBClassifier

# Importing the dataset
dataset_train = pd.read_csv('train.csv', header=None)
dataset_test = pd.read_csv('test.csv', header=None)

# Take all columns except last one
train = dataset_train.iloc[:, :-1]
test = dataset_test.iloc[:, :]
y = dataset_train.iloc[:, dataset_train.shape[1]-1]


## 2. Data preprocessing

In [121]:
# Encoding categorial data
# Replace string by categories number
drop = []
train_objs_num = len(train)

dataset = pd.concat(objs=[train, test], axis=0)

# Salary Mapping
edu_mapping = {
        ' Preschool':0, 
        ' 1st-4th':1,
        ' 5th-6th':2,
        ' 7th-8th':3,
        ' 9th':4,
        ' 10th':5,
        ' 11th':6,
        ' 12th':7,
        ' HS-grad':15,
        ' Prof-school':70,
        ' Assoc-acdm':25,
        ' Assoc-voc':25,
        ' Some-college':20,
        ' Bachelors':40,
        ' Masters':55,
        ' Doctorate':70
}
gender_mapping = {
    ' Female':10,
    ' Male':30
}
color_mapping = {
    ' Amer-Indian-Eskimo':11,
    ' White':25, 
    ' Asian-Pac-Islander':25, 
    ' Other':10, 
    ' Black':13
}
marital_mapping={
    ' Married-civ-spouse':45,
    ' Divorced':10,
    ' Never-married':5,
    ' Separated':6,
    ' Widowed':8,
    ' Married-spouse-absent':8,
    ' Married-AF-spouse':45
}
dataset[3] = dataset[3].map(edu_mapping)
dataset[9] = dataset[9].map(gender_mapping)
dataset[8] = dataset[8].map(color_mapping)
dataset[5] = dataset[5].map(marital_mapping)

# One hot encoder
'''
for row in range(0, dataset.shape[1]):
    if (isinstance(dataset.values[1][row], str)):        
        print ("Delete row: ", row), 
        one_hot = pd.get_dummies(dataset[row])
        drop.append(row)
        dataset = pd.concat([dataset, one_hot], axis=1)
        dataset = dataset.iloc[:, :-1]
    
# Remove original attributes
drop.sort(reverse=True)
for row in drop:
    dataset = dataset.drop(row, axis = 1)
'''

# Only label encoder
for row in range(0, dataset.shape[1]):
    if (isinstance(dataset.values[1][row], str)):    
        labelencoder = LabelEncoder()
        target = labelencoder.fit_transform(dataset.values[:, row])
        dataset[row] = target

print ("Finish One Hot Enconding")

# PCA
'''
n_com = 30
pca = PCA(n_components=n_com)
dataset_pca = pca.fit_transform(dataset)
print ("Finish PCA preprocess") 
'''

Finish One Hot Enconding


'\nn_com = 30\npca = PCA(n_components=n_com)\ndataset_pca = pca.fit_transform(dataset)\nprint ("Finish PCA preprocess") \n'

## SVM

In [123]:
X = dataset[:train_objs_num]
test_data = dataset[train_objs_num:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

#feature normalize
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

svm = SVC(kernel='rbf')
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)

print ("Misclassified sample %d" % (y_test!=y_pred).sum())
print ("Train Accuracy: ", accuracy_score(y_train, svm.predict(X_train_std)))
print ("Test Accuracy: ", accuracy_score(y_test, y_pred))
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
print ("precision: ", precision)
print ("recall: ", recall)
print ("fscore: ", fscore)


test_std = sc.transform(test_data)
test_std_pred = svm.predict(test_std)

with open('output.csv', 'w+') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ID', 'ans'])
    for i in range(len(test_std_pred)):
        writer.writerow([i, test_std_pred[i]])

print ("Done")

Misclassified sample 487
Train Accuracy:  0.8501825329963494
Test Accuracy:  0.8461781427668983
precision:  0.8461781427668983
recall:  0.8461781427668983
fscore:  0.8461781427668983
Done


## Random Forest Tree

In [124]:
X = dataset[:train_objs_num]
test_data = dataset[train_objs_num:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
'''
max_fscore = 0
target_n_estimators = 0
for i in range(10, 100):
    forest = RandomForestClassifier(criterion='entropy', n_estimators=i)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
    if fscore > max_fscore:
        max_fscore = fscore
        target_n_estimators = i
'''    
forest = RandomForestClassifier(criterion='entropy', n_estimators=20)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print ("Misclassified sample %d" % (y_test!=y_pred).sum())
print ("Train Accuracy: ", accuracy_score(y_train, forest.predict(X_train)))
print ("Test Accuracy: ", accuracy_score(y_test, y_pred))
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
print ("precision: ", precision)
print ("recall: ", recall)
print ("fscore: ", fscore)


test_std_pred = forest.predict(test_data)

with open('output.csv', 'w+') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ID', 'ans'])
    for i in range(len(test_std_pred)):
        writer.writerow([i, test_std_pred[i]])

print ("Done")

Misclassified sample 934
Train Accuracy:  0.9956956126841211
Test Accuracy:  0.8524719633549203
precision:  0.8524719633549203
recall:  0.8524719633549203
fscore:  0.8524719633549203
Done


## XGBOOST

In [122]:
X = dataset[:train_objs_num]
test_data = dataset[train_objs_num:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)


xgbc = XGBClassifier(max_depth=10)
xgbc.fit(X_train, y_train)
y_pred = xgbc.predict(X_test)
print ("Misclassified sample %d" % (y_test!=y_pred).sum())
print ("Train Accuracy: ", accuracy_score(y_train, xgbc.predict(X_train)))
print ("Test Accuracy: ", accuracy_score(y_test, y_pred))
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
print ("precision: ", precision)
print ("recall: ", recall)
print ("fscore: ", fscore)


test_std_pred = xgbc.predict(test_data)

with open('output.csv', 'w+') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ID', 'ans'])
    for i in range(len(test_std_pred)):
        writer.writerow([i, test_std_pred[i]])

print ("Done")

  if diff:


Misclassified sample 425
Train Accuracy:  0.903784049424319
Test Accuracy:  0.8657612128869235
precision:  0.8657612128869235
recall:  0.8657612128869235
fscore:  0.8657612128869235
Done


  if diff:
  if diff:
