## PCA-logistic regression

In [6]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import warnings

warnings.filterwarnings("ignore")

## data preprocess

### read train.csv, preprocess

In [7]:
# load data & show head
df = pd.read_csv('../cs5228/train.csv')
print(df.shape)
df.head()

(24421, 14)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,exceeds50K
0,30,?,147215,Some-college,10,Never-married,?,Own-child,Female,0,0,30,United-States,0
1,60,Private,173960,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Female,0,0,42,United-States,0
2,52,?,105428,Some-college,10,Married-civ-spouse,?,Husband,Male,0,0,12,United-States,0
3,37,Private,112497,Bachelors,13,Married-civ-spouse,Sales,Husband,Male,0,0,60,United-States,0
4,63,Private,137843,Some-college,10,Married-civ-spouse,Sales,Husband,Male,7298,0,48,United-States,1


In [8]:
# split features and labels
X = df.drop(['exceeds50K'],axis=1)
Y = df['exceeds50K']
X.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,30,?,147215,Some-college,10,Never-married,?,Own-child,Female,0,0,30,United-States
1,60,Private,173960,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Female,0,0,42,United-States
2,52,?,105428,Some-college,10,Married-civ-spouse,?,Husband,Male,0,0,12,United-States


### read test.csv, preprocess

In [9]:
df_test = pd.read_csv('../cs5228/test.csv')
print(df_test.shape)
X_test = df_test
df_test.head(3)

(24421, 13)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,23,Private,32732,Some-college,10,Married-civ-spouse,Sales,Husband,Male,0,0,25,United-States
1,69,Private,165017,HS-grad,9,Widowed,Machine-op-inspct,Unmarried,Male,2538,0,40,United-States
2,27,Private,36440,Bachelors,13,Never-married,Sales,Not-in-family,Female,0,0,40,United-States


In [10]:
# replace missing value ' ?' by mode

feature_list = X.columns

# count missing value ' ?' 
print('missing value count in train.scv')
for feature in feature_list:
    print(feature, ' ', len(X[X[feature] == ' ?']))

print('\nmissing value count in test.csv')
for feature in feature_list:
    print(feature, ' ', len(X_test[X_test[feature]==' ?']))

# replace
X_all = pd.concat([X,X_test], axis = 0)
imputer = SimpleImputer(missing_values=' ?', strategy='most_frequent')
imputer.fit(X_all)
X = pd.DataFrame(imputer.transform(X), columns = feature_list)
X_test = pd.DataFrame(imputer.transform(X_test), columns = feature_list)
X.head(3)

missing value count in train.scv
age   0
workclass   1392
fnlwgt   0
education   0
education-num   0
marital-status   0
occupation   1399
relationship   0
sex   0
capital-gain   0
capital-loss   0
hours-per-week   0
native-country   410

missing value count in test.csv
age   0
workclass   1407
fnlwgt   0
education   0
education-num   0
marital-status   0
occupation   1410
relationship   0
sex   0
capital-gain   0
capital-loss   0
hours-per-week   0
native-country   447


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,30,Private,147215,Some-college,10,Never-married,Prof-specialty,Own-child,Female,0,0,30,United-States
1,60,Private,173960,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Female,0,0,42,United-States
2,52,Private,105428,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,Male,0,0,12,United-States


In [11]:
# marital-status into single, married
print(set(X['marital-status'].values))
single_list = [' Separated', ' Divorced', ' Widowed', ' Never-married']
married_list = [' Married-AF-spouse', ' Married-spouse-absent', ' Married-civ-spouse']

X['marital-status'][X['marital-status'].isin(married_list)] = 'married'
X['marital-status'][X['marital-status'].isin(single_list)] = 'single'
X_test['marital-status'][X_test['marital-status'].isin(married_list)] = 'married'
X_test['marital-status'][X_test['marital-status'].isin(single_list)] = 'single'
X.head()

{' Widowed', ' Never-married', ' Married-AF-spouse', ' Married-spouse-absent', ' Divorced', ' Married-civ-spouse', ' Separated'}


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,30,Private,147215,Some-college,10,single,Prof-specialty,Own-child,Female,0,0,30,United-States
1,60,Private,173960,Bachelors,13,single,Prof-specialty,Not-in-family,Female,0,0,42,United-States
2,52,Private,105428,Some-college,10,married,Prof-specialty,Husband,Male,0,0,12,United-States
3,37,Private,112497,Bachelors,13,married,Sales,Husband,Male,0,0,60,United-States
4,63,Private,137843,Some-college,10,married,Sales,Husband,Male,7298,0,48,United-States


### embedding, train&test data together

In [12]:
# transfer string feature into int

# features in string to be encoded
encoding_features = ['workclass','education','marital-status','occupation','relationship','sex','native-country']

# to include all possible values
X_all = pd.concat([X,X_test], axis = 0)
for feature in encoding_features:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(X_all[feature])
    X[feature] = encoder.transform(X[feature])
    X_test[feature] = encoder.transform(X_test[feature])

X.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,30,3,147215,15,10,1,9,3,0,0,0,30,38
1,60,3,173960,9,13,1,9,1,0,0,0,42,38
2,52,3,105428,15,10,0,9,0,1,0,0,12,38


In [13]:
# feature scaling

scaler = preprocessing.StandardScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns = feature_list)
X_test = pd.DataFrame(scaler.transform(X_test), columns = feature_list)
X.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,-0.627855,-0.086978,-0.397416,1.218346,-0.03749,0.946594,0.714447,0.965669,-1.421717,-0.145954,-0.216863,-0.849954,0.263597
1,1.565644,-0.086978,-0.141947,-0.335686,1.127814,0.946594,0.714447,-0.279387,-1.421717,-0.145954,-0.216863,0.120349,0.263597
2,0.980711,-0.086978,-0.796568,1.218346,-0.03749,-1.056419,0.714447,-0.901915,0.703375,-0.145954,-0.216863,-2.305408,0.263597


### PCA analysis

In [14]:
pca = PCA()
pca.fit(X)
pca_list = pca.explained_variance_ratio_
print('pca list\n', pca_list)
pca_df = pd.DataFrame([*zip(feature_list, pca_list)], columns = ['feature', 'variance'])
cums = np.cumsum(pca_list)
dim = np.argmax(cums >= 0.9) + 1
print('remain dim', dim, 'total dim', len(feature_list))
pca_df

pca list
 [0.17923473 0.11070692 0.08303999 0.08020077 0.07860167 0.07681319
 0.07478438 0.07077998 0.06849254 0.06057913 0.04633313 0.04092867
 0.0295049 ]
remain dim 11 total dim 13


Unnamed: 0,feature,variance
0,age,0.179235
1,workclass,0.110707
2,fnlwgt,0.08304
3,education,0.080201
4,education-num,0.078602
5,marital-status,0.076813
6,occupation,0.074784
7,relationship,0.07078
8,sex,0.068493
9,capital-gain,0.060579


In [15]:
# drop the last 2 feature
X = X.drop(['hours-per-week', 'native-country'],axis=1)
X_test = X_test.drop(['hours-per-week', 'native-country'],axis=1)
X_train ,X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.3, random_state = 0)
X.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss
0,-0.627855,-0.086978,-0.397416,1.218346,-0.03749,0.946594,0.714447,0.965669,-1.421717,-0.145954,-0.216863
1,1.565644,-0.086978,-0.141947,-0.335686,1.127814,0.946594,0.714447,-0.279387,-1.421717,-0.145954,-0.216863
2,0.980711,-0.086978,-0.796568,1.218346,-0.03749,-1.056419,0.714447,-0.901915,0.703375,-0.145954,-0.216863


## logistic regression training

In [16]:
clf = LogisticRegression()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_val)
accuracy_score(Y_val,Y_pred)

0.844001637778081

## predict test data

In [69]:
Y_test = clf.predict(X_test)
print(Y_test.shape)

(24421,)


In [70]:
# wirte into file
df_res = pd.read_csv('../cs5228/sample_submission.csv')
df_res['prediction'] = Y_test
print(df_res.head())
df_res.to_csv('../submissions/PCA_lr_submission.csv', index = False)

   id  prediction
0   1           0
1   2           0
2   3           0
3   4           0
4   5           0
