In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#add headers 
feature_names = ['age','workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','50k']

train_df_nan = pd.read_csv('/Users/suzannezhen/Documents/Data Mining /Project/census-income.data.csv .csv', names = feature_names) 
test_df_nan = pd.read_csv('/Users/suzannezhen/Documents/Data Mining /Project/census-income.test.csv .csv', names = feature_names)


In [3]:
print('The total number of instances in train data is', len(train_df_nan), 'with features', len(train_df_nan.columns))
print('The total number of instances in test data is', len(test_df_nan), 'with features', len(test_df_nan.columns))

The total number of instances in train data is 32561 with features 15
The total number of instances in test data is 16281 with features 15


In [4]:
# replace '?' with nan values 
train_df_nan.replace(' ?', np.nan, inplace=True)
test_df_nan.replace(' ?', np.nan, inplace=True)

#replace the label column with '0' and '1', <=50k = 0; >50k = 1
train_df_nan['50k'] = pd.get_dummies(train_df_nan).iloc[:,-1]
test_df_nan['50k'] = pd.get_dummies(test_df_nan).iloc[:,-1]

counts_train = train_df_nan['50k'].value_counts()
counts_test = test_df_nan['50k'].value_counts()

print(counts_train)
print('***less than 50k',round(counts_train[0]/len(train_df_nan['50k']),3), '***More than 50k', round(counts_train[1]/len(train_df_nan['50k']),3))
print(counts_test)
print('***less than 50k',round(counts_test[0]/len(test_df_nan['50k']),3), '***More than 50k', round(counts_test[1]/len(test_df_nan['50k']),3))


#print(train_df_nan.head(50))
#print(test_df_nan.head(50))

0    24720
1     7841
Name: 50k, dtype: int64
***less than 50k 0.759 ***More than 50k 0.241
0    12435
1     3846
Name: 50k, dtype: int64
***less than 50k 0.764 ***More than 50k 0.236


In [5]:
#count instances with nan values
train_df = train_df_nan.dropna()
print(len(train_df_nan) - len(train_df), 'train instances contain missing values')
print(train_df_nan.isnull().sum())

test_df = test_df_nan.dropna()
print(len(test_df_nan) - len(test_df), 'test instances contain missing values')
print(test_df_nan.isnull().sum())

2399 train instances contain missing values
age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
50k                  0
dtype: int64
1221 test instances contain missing values
age                 0
workclass         963
fnlwgt              0
education           0
education_num       0
marital_status      0
occupation        966
relationship        0
race                0
sex                 0
capital_gain        0
capital_loss        0
hours_per_week      0
native_country    274
50k                 0
dtype: int64


In [6]:
train_df_nan.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [7]:
train_df_nan['occupation'].value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

In [8]:
#use mode or KNN to fill missing values (pick two to three methods)

#fillna with mode 

#print(train_df_nan['workclass'].value_counts(dropna=False).head())
#print(train_df_nan['occupation'].value_counts(dropna=False).head())
#print(train_df_nan['native_country'].value_counts(dropna=False).head())

train_df_nan['workclass'].fillna(train_df_nan['workclass'].mode()[0],inplace=True)
train_df_nan['occupation'].fillna(value ='Other-service', inplace=True)      #replace NaN occupation with other services
train_df_nan['native_country'].fillna(train_df_nan['native_country'].mode()[0],inplace=True)

#print('*'*100, train_df_nan.isnull().sum())


#print(test_df_nan['workclass'].value_counts(dropna=False).head())
#print(test_df_nan['occupation'].value_counts(dropna=False).head(10))
#print(test_df_nan['native_country'].value_counts(dropna=False).head())


test_df_nan['workclass'].fillna(test_df_nan['workclass'].mode()[0],inplace=True)
test_df_nan['occupation'].fillna(value ='Other-service', inplace=True)      #replace NaN occupation with other services
test_df_nan['native_country'].fillna(test_df_nan['native_country'].mode()[0],inplace=True)


#print('*'*100, test_df_nan.isnull().sum())


In [9]:
#handle unbalanced data (oversampling)
#balancing the data

from sklearn.utils import resample

train_df_nan['50k'].value_counts()
rich=train_df_nan.loc[train_df_nan['50k']==1]
poor=train_df_nan.loc[train_df_nan['50k']==0]
train_df_nan['50k'].value_counts()
rich_upsample=resample(rich,replace=True,n_samples=len(poor),random_state=27)
balancetrain=pd.concat([rich_upsample,poor],ignore_index=True)
balancetrain['50k'].value_counts()


1    24720
0    24720
Name: 50k, dtype: int64

In [10]:
#transform categorical data into numeric data (encoder and one hot encoder)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
ohe = OneHotEncoder(sparse=False, categories='auto')


categoric_col_train = train_df_nan[['workclass','sex','marital_status','occupation','relationship','native_country','race']]

categoric_arr_train = ohe.fit_transform(categoric_col_train)
categoric_labels_train = ohe.categories_

categoric_col_train =[]
for i in categoric_labels_train:
    for label in list(i):
        categoric_col_train.append(label)
    
enc_df_train = pd.DataFrame(categoric_arr_train, columns=categoric_col_train)

enc_df_train

Unnamed: 0,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,Female,Male,...,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32557,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32558,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32559,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
from sklearn import preprocessing

continuous_df_train = train_df_nan[['age', 'fnlwgt', 'education_num', 'capital_gain','capital_loss','hours_per_week']]
continuous_norm_train = preprocessing.normalize(continuous_df_train, norm='l2', axis=1)
norm_df_train = pd.DataFrame(continuous_norm_train,columns=list(continuous_df_train))

norm_df_train

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.000503,0.999607,0.000168,0.028035,0.0,0.000516
1,0.000600,1.000000,0.000156,0.000000,0.0,0.000156
2,0.000176,1.000000,0.000042,0.000000,0.0,0.000185
3,0.000226,1.000000,0.000030,0.000000,0.0,0.000170
4,0.000083,1.000000,0.000038,0.000000,0.0,0.000118
...,...,...,...,...,...,...
32556,0.000105,1.000000,0.000047,0.000000,0.0,0.000148
32557,0.000259,1.000000,0.000058,0.000000,0.0,0.000259
32558,0.000382,1.000000,0.000059,0.000000,0.0,0.000263
32559,0.000109,1.000000,0.000045,0.000000,0.0,0.000099


In [12]:
X_df_train = pd.concat([enc_df_train, norm_df_train], axis=1)

X_df_train

Unnamed: 0,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,Female,Male,...,Asian-Pac-Islander,Black,Other,White,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000503,0.999607,0.000168,0.028035,0.0,0.000516
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000600,1.000000,0.000156,0.000000,0.0,0.000156
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000176,1.000000,0.000042,0.000000,0.0,0.000185
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.000226,1.000000,0.000030,0.000000,0.0,0.000170
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.000083,1.000000,0.000038,0.000000,0.0,0.000118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.000105,1.000000,0.000047,0.000000,0.0,0.000148
32557,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000259,1.000000,0.000058,0.000000,0.0,0.000259
32558,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.000382,1.000000,0.000059,0.000000,0.0,0.000263
32559,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000109,1.000000,0.000045,0.000000,0.0,0.000099


In [13]:
#transform categorical data into numeric data (encoder and one hot encoder)
#test data

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
ohe = OneHotEncoder(sparse=False, categories='auto')


categoric_col_test = test_df_nan[['workclass','sex','marital_status','occupation','relationship','native_country','race']]

categoric_arr_test = ohe.fit_transform(categoric_col_test)
categoric_labels_test = ohe.categories_

categoric_col_test =[]
for i in categoric_labels_test:
    for label in list(i):
        categoric_col_test.append(label)
    
    
enc_df_test = pd.DataFrame(categoric_arr_test, columns=categoric_col_test)

# test_data is missing a column ' Holand-Netherlands' compared to train data 
# train = list(enc_df_train)
# test = list(enc_df_test)
# set(train)-set(test)

loc = enc_df_train.columns.get_loc(' Holand-Netherlands')
enc_df_test.insert(loc, ' Holand-Netherlands', 0)
enc_df_test.shape

(16281, 84)

In [14]:
from sklearn import preprocessing

continuous_df_test = test_df_nan[['age', 'fnlwgt', 'education_num', 'capital_gain','capital_loss','hours_per_week']]
continuous_norm_test = preprocessing.normalize(continuous_df_test, norm='l2', axis=1)
norm_df_test = pd.DataFrame(continuous_norm_test,columns=list(continuous_df_test))


In [15]:
X_df_test = pd.concat([enc_df_test, norm_df_test], axis=1)

X_df_test

Unnamed: 0,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,Female,Male,...,Asian-Pac-Islander,Black,Other,White,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.000110,1.000000,0.000031,0.000000,0.0,0.000176
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000423,1.000000,0.000100,0.000000,0.0,0.000557
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000083,1.000000,0.000036,0.000000,0.0,0.000119
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.000274,0.998852,0.000062,0.047898,0.0,0.000249
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.000174,1.000000,0.000097,0.000000,0.0,0.000290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.000181,1.000000,0.000060,0.000000,0.0,0.000167
16277,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.000199,1.000000,0.000028,0.000000,0.0,0.000124
16278,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.000101,1.000000,0.000035,0.000000,0.0,0.000133
16279,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.000523,0.997892,0.000155,0.064888,0.0,0.000476


In [54]:
Y_df_train = train_df_nan['50k'].astype(float)
Y_df_test = test_df_nan['50k'].astype(float)

In [265]:
#algorithms: SVM/ Random Forest/ KNN

In [52]:
#SVM linear

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_df_train, Y_df_train)

y_pred = svclassifier.predict(X_df_test)

print(confusion_matrix(Y_df_test,y_pred))
print(classification_report(Y_df_test,y_pred))


[[11151  1284]
 [ 1561  2285]]
              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89     12435
         1.0       0.64      0.59      0.62      3846

    accuracy                           0.83     16281
   macro avg       0.76      0.75      0.75     16281
weighted avg       0.82      0.83      0.82     16281



In [289]:
# SVM rbf kernel 
svclassifier = SVC(kernel='rbf', gamma =0.1, C = 10)
svclassifier.fit(X_df_train, Y_df_train)

y_pred = svclassifier.predict(X_df_test)

print(confusion_matrix(Y_df_test,y_pred))
print(classification_report(Y_df_test,y_pred))


[[11588   847]
 [ 1862  1984]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.90     12435
           1       0.70      0.52      0.59      3846

    accuracy                           0.83     16281
   macro avg       0.78      0.72      0.74     16281
weighted avg       0.82      0.83      0.82     16281



In [18]:
#SVM polynomial kernel

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

svclassifier = SVC(kernel='poly', degree = 2)
svclassifier.fit(X_df_train, Y_df_train)

y_pred = svclassifier.predict(X_df_test)

print(confusion_matrix(Y_df_test,y_pred))
print(classification_report(Y_df_test,y_pred))

#accuracy goes down if increase power, relationship is linear




[[11815   620]
 [ 2428  1418]]
              precision    recall  f1-score   support

           0       0.83      0.95      0.89     12435
           1       0.70      0.37      0.48      3846

    accuracy                           0.81     16281
   macro avg       0.76      0.66      0.68     16281
weighted avg       0.80      0.81      0.79     16281



In [296]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

RFclassifier = RandomForestClassifier(n_estimators =50)
RFclassifier.fit(X_df_train, Y_df_train)

y_pred = RFclassifier.predict(X_df_test)

print(confusion_matrix(Y_df_test,y_pred))
print(classification_report(Y_df_test,y_pred))

[[11455   980]
 [ 1696  2150]]
              precision    recall  f1-score   support

           0       0.87      0.92      0.90     12435
           1       0.69      0.56      0.62      3846

    accuracy                           0.84     16281
   macro avg       0.78      0.74      0.76     16281
weighted avg       0.83      0.84      0.83     16281



In [301]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

KNNclassifier =KNeighborsClassifier(n_neighbors=5)
KNNclassifier.fit(X_df_train, Y_df_train)

y_pred = KNNclassifier.predict(X_df_test)

print(confusion_matrix(Y_df_test,y_pred))
print(classification_report(Y_df_test,y_pred))

[[11227  1208]
 [ 1738  2108]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88     12435
           1       0.64      0.55      0.59      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.73      0.74     16281
weighted avg       0.81      0.82      0.81     16281



In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
X_train = train_df_nan.drop(['50k'], axis =1)
Y_train = train_df_nan['50k']

column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False, drop='first'), ['workclass','marital_status','occupation','relationship','sex','native_country','race']), 
    remainder = 'passthrough')

new_x = column_trans.fit_transform(X_train)

new_x.shape