In [1]:
#pandas & numpy
import pandas as pd
import numpy as np

#Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

#the classification algorithms
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

#for parameter optimization
from sklearn.model_selection import GridSearchCV

#for evaluation 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn import metrics

#ignore all warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#read data file

#load data to pandas
data = pd.read_csv("../../data/csvData/incomedata.csv")

In [3]:
#deal with categoricial variables 
print ("one-hot encoder for gender")
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(data[['Gender']]).toarray()
genderData= pd.concat([data['Gender'],pd.DataFrame(feature_arr)],axis=1)
genderData

one-hot encoder for gender


Unnamed: 0,Gender,0,1
0,Male,0.0,1.0
1,Male,0.0,1.0
2,Male,0.0,1.0
3,Male,0.0,1.0
4,Female,1.0,0.0
...,...,...,...
32556,Female,1.0,0.0
32557,Male,0.0,1.0
32558,Female,1.0,0.0
32559,Male,0.0,1.0


In [4]:
#deal with categoricial variables 
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(data[['Employ_type', 'Education', 'Marriage', 'Occupation', 'Relationship','Race', 'Gender']]).toarray()

#move income column to the end 
incomeColumn = data['Income']
data.drop(labels=['Income'], axis=1,inplace = True)

#add to original data
convertedData= pd.concat([data, pd.DataFrame(feature_arr),incomeColumn], axis=1)
convertedData.head()

Unnamed: 0,Age,Employ_type,Education,Education_years,Marriage,Occupation,Relationship,Race,Gender,Capital-gain,...,51,52,53,54,55,56,57,58,59,Income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [5]:
#remove the original categorical data 
convertedData=convertedData.drop(columns=['Employ_type', 'Education', 'Marriage', 'Occupation', 'Relationship','Race', 'Gender'])
convertedData

Unnamed: 0,Age,Education_years,Capital-gain,Captial-loss,hours-per-week,0,1,2,3,4,...,51,52,53,54,55,56,57,58,59,Income
0,39,13,2174,0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
1,50,13,0,0,13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
2,38,9,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
3,53,7,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
4,28,13,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,12,0,0,38,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
32557,40,9,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
32558,58,9,0,0,40,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
32559,22,9,0,0,20,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0


In [6]:
scaler=StandardScaler()
convertedData.iloc[:, 0:5]=scaler.fit_transform(convertedData.iloc[:, 0:5])
convertedData

Unnamed: 0,Age,Education_years,Capital-gain,Captial-loss,hours-per-week,0,1,2,3,4,...,51,52,53,54,55,56,57,58,59,Income
0,0.030671,1.134739,0.148453,-0.21666,-0.035429,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
1,0.837109,1.134739,-0.145920,-0.21666,-2.222153,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
2,-0.042642,-0.420060,-0.145920,-0.21666,-0.035429,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
3,1.057047,-1.197459,-0.145920,-0.21666,-0.035429,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
4,-0.775768,1.134739,-0.145920,-0.21666,-0.035429,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849080,0.746039,-0.145920,-0.21666,-0.197409,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
32557,0.103983,-0.420060,-0.145920,-0.21666,-0.035429,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
32558,1.423610,-0.420060,-0.145920,-0.21666,-0.035429,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
32559,-1.215643,-0.420060,-0.145920,-0.21666,-1.655225,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0


In [7]:
X = convertedData.iloc[:, 0:63].values 
Y = convertedData["Income"].values

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train,Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
print(X_train)
print(Y_train)

[[-0.40920495  1.13473876 -0.14592048 ...  0.          0.
   1.        ]
 [-0.18926719  0.35733957 -0.14592048 ...  0.          0.
   1.        ]
 [ 1.42360965 -1.97485801 -0.14592048 ...  0.          0.
   1.        ]
 ...
 [-1.5088937  -1.19745882 -0.14592048 ...  0.          0.
   1.        ]
 [ 0.83710898 -0.42005962 -0.14592048 ...  0.          0.
   1.        ]
 [-0.33589236  1.13473876 -0.14592048 ...  0.          0.
   1.        ]]
[1 1 0 ... 0 0 0]


In [8]:
# ##モデルの宣言
#SVM
svmCLF= SVC(C=1000,kernel="rbf")

#NaiveBayes
NBCLF= GaussianNB()

#Random Forest
RFCLF= RandomForestClassifier(max_depth=80,n_estimators=100)

In [9]:
# ##Cross-Validation の F-score の計算
#SVM
scores = cross_val_score(svmCLF, X_train,Y_train, scoring="f1_macro", cv=5)
print("SVM Score")
print(scores)
print("SVM F1 Macro: %.2f%% (%.2f%%)" % (scores.mean()*100, scores.std()*100))

#Naive Bayes
scores = cross_val_score(NBCLF, X_train,Y_train, scoring="f1_macro", cv=5)
print("NB Score")
print(scores)
print("NB F1 Macro: %.2f%% (%.2f%%)" % (scores.mean()*100, scores.std()*100))

#Random Forest
scores = cross_val_score(RFCLF,X_train,Y_train, scoring="f1_macro", cv=5)
print("RF Score")
print(scores)
print("RF F1 Macro: %.2f%% (%.2f%%)" % (scores.mean()*100, scores.std()*100))

KeyboardInterrupt: 