# Homework #3: PCA/Hyperparameter/CV
Data source: http://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data

In [17]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [18]:
# Parse the ARFF file; the first element of the returned pair holds the records.
data = arff.loadarff('4year.arff')
records = data[0]
df = pd.DataFrame(records)

In [19]:
# Turn the raw byte-string 'class' column into a boolean 'bankruptcy' label
# (True where class == b'1') and discard the original column.
df = (
    df.assign(bankruptcy=df['class'] == b'1')
      .drop(columns=['class'])
)

# Rename the 64 feature columns to X01..X64; the label stays last.
feature_names = ['X%02d' % k for k in range(1, 65)]
df.columns = feature_names + ['bankruptcy']

In [20]:
# Summary statistics per numeric column; a 'count' below the row total marks missing values.
df.describe()

Unnamed: 0,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,...,X55,X56,X57,X58,X59,X60,X61,X62,X63,X64
count,9791.0,9791.0,9791.0,9749.0,9771.0,9791.0,9791.0,9773.0,9792.0,9791.0,...,9792.0,9771.0,9791.0,9776.0,9791.0,9178.0,9760.0,9771.0,9749.0,9561.0
mean,0.043019,0.596404,0.130959,8.1366,64.65164,-0.059273,0.059446,19.884016,1.882296,0.38904,...,7686.33,-0.992263,0.035022,1.133287,0.856053,118.156064,25.19443,2015.157,8.660813,35.949619
std,0.359321,4.587122,4.559074,290.647281,14759.39,6.812754,0.533344,698.697015,17.67465,4.590299,...,76052.61,77.007971,8.945365,8.038201,26.393305,3230.316692,1099.260821,117146.1,60.838202,483.318623
min,-12.458,0.0,-445.91,-0.045319,-379460.0,-486.82,-12.458,-1.8482,-0.032371,-445.91,...,-713220.0,-7522.1,-597.42,-30.892,-284.38,0.0,-12.656,-14965.0,-0.02439,-1.5e-05
25%,0.001321,0.263145,0.020377,1.047,-51.217,-0.000578,0.003004,0.4283,1.006675,0.29444,...,21.84,0.003121,0.008768,0.885722,0.0,5.356325,4.2677,43.234,2.9388,2.0129
50%,0.041364,0.46774,0.19929,1.5918,-0.055576,0.0,0.04882,1.0887,1.1613,0.51045,...,950.33,0.043679,0.098026,0.958305,0.002129,9.482,6.28355,74.729,4.8489,4.0416
75%,0.11113,0.689255,0.41067,2.8804,55.732,0.065322,0.12694,2.691,1.970225,0.71429,...,4694.55,0.11717,0.24268,0.996163,0.21179,19.506,9.9382,123.345,8.3638,9.4135
max,20.482,446.91,22.769,27146.0,1034100.0,322.2,38.618,53209.0,1704.8,12.602,...,6123700.0,112.02,226.76,668.75,1661.0,251570.0,108000.0,10779000.0,5662.4,21153.0


In [21]:
# Number of rows flagged as bankrupt (the positive class).
df['bankruptcy'].sum()

515

In [22]:
# Replace every missing value with the mean of its column.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split that follows, so test-set information leaks into the
# imputation. Consider fitting the imputer on the training split only.
df = df.fillna(df.mean())

# Fail loudly if any NaN survived (an all-NaN column has no mean to fill with).
# Previously `df.isna().sum()` was evaluated mid-cell and its result discarded.
assert not df.isna().any().any(), 'missing values remain after mean imputation'

# Plain array view of the imputed frame; the last column is the label.
X_imp = df.values

In [23]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the label.
X = X_imp[:, :-1]
y = X_imp[:, -1]

# Hold out 30% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [24]:
import sklearn.preprocessing as skpre

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
stdsc = skpre.StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

# Sanity check: print the shape of each standardised split.
for split in (X_train_std, X_test_std):
    print(split.shape)

(6854, 64)
(2938, 64)


## Find the 2 most important features
I use Logistic Regression with L1 penalty. Since the data is unbalanced, we should use ```class_weight='balanced'```.

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [29]:
# L1-penalised logistic regression with a strong penalty (C=0.01);
# 'balanced' re-weights the classes because bankruptcies are rare.
lr = LogisticRegression(
    penalty='l1', C=0.01, solver='liblinear', class_weight='balanced'
)

# Cast the labels to int64 (y was sliced from df.values, which mixes float
# feature columns with the boolean label column).
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)

lr.fit(X_train_std, y_train)
train_acc = lr.score(X_train_std, y_train)
test_acc = lr.score(X_test_std, y_test)
print('Training accuracy:', train_acc)
print('Test accuracy:', test_acc)

Training accuracy: 0.6995914794280712
Test accuracy: 0.6936691626957113


In [27]:
# Shape of the non-zero coefficient vector = number of features the L1 penalty kept.
nonzero_mask = lr.coef_ != 0
lr.coef_[nonzero_mask].shape

(2,)

In [28]:
# Keep only the columns whose L1 coefficient is non-zero, for both splits.
selected = lr.coef_[0] != 0
X_train_l1 = X_train_std[:, selected]
X_test_l1 = X_test_std[:, selected]

## PCA using pipeline
I use PCA to reduce the dimensionality, and then use Logistic Regression to classify. Since the data is unbalanced, we should use ```class_weight='balanced'```.

#### PCA

In [54]:
from sklearn.decomposition import PCA

# Project the standardised features onto the first 3 principal components.
# random_state pins the randomized SVD solver that PCA may select
# automatically, so the components are reproducible across runs.
pca = PCA(n_components=3, random_state=0)
X_train_pca = pca.fit_transform(X_train_std)  # fit on the training split only
X_test_pca = pca.transform(X_test_std)        # reuse the training projection

#### Logistic Regression

In [55]:
# Class-weighted logistic regression on the 3 principal components.
lr1 = LogisticRegression(class_weight='balanced')
lr1.fit(X_train_pca, y_train)

train_acc = lr1.score(X_train_pca, y_train)
test_acc = lr1.score(X_test_pca, y_test)
print('Training accuracy:', train_acc)
print('Test accuracy:', test_acc)

Training accuracy: 0.7297928217099504
Test accuracy: 0.7270251872021783


#### SVM

In [59]:
from sklearn.svm import SVC

# Class-weighted RBF-kernel SVM on the 3 principal components.
svm = SVC(kernel='rbf', class_weight='balanced')
svm.fit(X_train_pca, y_train)

train_acc = svm.score(X_train_pca, y_train)
test_acc = svm.score(X_test_pca, y_test)
print('Training accuracy:', train_acc)
print('Test accuracy:', test_acc)

Training accuracy: 0.648964108549752
Test accuracy: 0.6330837304288631


#### Decision Tree

In [67]:
from sklearn.tree import DecisionTreeClassifier

# Class-weighted, unpruned Gini decision tree on the 3 principal components.
# random_state fixes the tree's internal randomness (e.g. tie-breaking between
# equally good splits) so the reported scores are reproducible across runs.
tree_model = DecisionTreeClassifier(criterion='gini',
                                    class_weight='balanced',
                                    random_state=0)
tree_model.fit(X_train_pca, y_train)
print('Training accuracy:', tree_model.score(X_train_pca, y_train))
print('Test accuracy:', tree_model.score(X_test_pca, y_test))

Training accuracy: 1.0
Test accuracy: 0.89857045609258


#### Implement the methods using pipeline

In [182]:
from sklearn.pipeline import make_pipeline

# PCA (3 components) followed by class-weighted logistic regression.
# random_state pins PCA's randomized SVD solver so that refitting this
# pipeline (e.g. inside cross-validation or grid search) is reproducible.
pipe_lr = make_pipeline(PCA(n_components=3, random_state=0),
                        LogisticRegression(class_weight='balanced'))
pipe_lr.fit(X_train_std, y_train)
print('Training accuracy:', pipe_lr.score(X_train_std, y_train))
print('Test accuracy:', pipe_lr.score(X_test_std, y_test))

Training accuracy: 0.7297928217099504
Test accuracy: 0.7270251872021783


In [69]:
from sklearn.pipeline import make_pipeline

# PCA (3 components) followed by a class-weighted RBF-kernel SVM.
# random_state pins PCA's randomized SVD solver so the projection, and hence
# the SVM scores, are reproducible across runs.
pipe_svm = make_pipeline(PCA(n_components=3, random_state=0),
                         SVC(kernel='rbf', class_weight='balanced'))
pipe_svm.fit(X_train_std, y_train)
print('Training accuracy:', pipe_svm.score(X_train_std, y_train))
print('Test accuracy:', pipe_svm.score(X_test_std, y_test))

Training accuracy: 0.6491100087540123
Test accuracy: 0.6330837304288631


In [68]:
from sklearn.pipeline import make_pipeline

# PCA (3 components) followed by a class-weighted Gini decision tree.
# Both steps are seeded: PCA may use a randomized SVD solver and the tree
# breaks ties between equally good splits at random, so without seeds the
# scores change from run to run.
pipe_tree = make_pipeline(PCA(n_components=3, random_state=0),
                          DecisionTreeClassifier(criterion='gini',
                                                 class_weight='balanced',
                                                 random_state=0))
pipe_tree.fit(X_train_std, y_train)
print('Training accuracy:', pipe_tree.score(X_train_std, y_train))
print('Test accuracy:', pipe_tree.score(X_test_std, y_test))

Training accuracy: 0.9983950977531368
Test accuracy: 0.899931926480599


## 5 fold Cross-Validation
I use 5-fold cross-validation to estimate the accuracy of the LR model and the decision tree model.

In [148]:
from sklearn.model_selection import cross_val_score

# Re-join the standardised train and test splits so CV runs over every sample.
X_combined_std = np.vstack([X_train_std, X_test_std])
y_combined = np.concatenate([y_train, y_test])

# 5-fold cross-validation of the PCA + logistic-regression pipeline.
scores = cross_val_score(pipe_lr, X_combined_std, y_combined, cv=5)
print('CV accuracy scores of LR: %s' % scores)

CV accuracy scores of LR: [0.73864216 0.72281776 0.72420838 0.7226762  0.73237998]


In [186]:
# 5-fold cross-validation of the PCA + decision-tree pipeline on the combined data.
scores = cross_val_score(pipe_tree, X_combined_std, y_combined, cv=5)
print('CV accuracy scores of Decision Tree: %s' % scores)

CV accuracy scores of Decision Tree: [0.90403267 0.90760592 0.90602656 0.89530133 0.90909091]


## Grid search
I combine grid search and 5-fold cross-validation in this part.

#### Only Grid Search for LR

In [160]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C: one per decade from 10^-4 to 10^4.
param_range = 10.0 ** np.arange(-4, 5)
param_grid = {'logisticregression__C': param_range}

# 2-fold grid search over C for the PCA + logistic-regression pipeline.
gs = GridSearchCV(pipe_lr, param_grid, scoring='accuracy', cv=2)
gs.fit(X_train_std, y_train)
print(gs.best_params_)

{'logisticregression__C': 0.0001}


In [161]:
# GridSearchCV was created without overriding `refit`, so by default it has
# already refit the best pipeline on the whole training set; best_estimator_
# can be scored directly without calling fit again.
blr = gs.best_estimator_
print('Training accuracy:', blr.score(X_train_std, y_train))
print('Test accuracy:', blr.score(X_test_std, y_test))

Training accuracy: 0.7674350744091042
Test accuracy: 0.7648059904697073


#### Only Grid Search for Decision Tree

In [158]:
# Candidate tree depths: 1 through 39.
param_range = np.arange(1, 40)
param_grid = {'decisiontreeclassifier__max_depth': param_range}

# 2-fold grid search over max_depth for the PCA + decision-tree pipeline.
gs_tree = GridSearchCV(pipe_tree, param_grid, scoring='accuracy', cv=2)
gs_tree.fit(X_train_std, y_train)
print(gs_tree.best_params_)

{'decisiontreeclassifier__max_depth': 30}


In [159]:
# GridSearchCV was created without overriding `refit`, so by default it has
# already refit the best pipeline on the whole training set; best_estimator_
# can be scored directly without calling fit again.
btree = gs_tree.best_estimator_
print('Training accuracy:', btree.score(X_train_std, y_train))
print('Test accuracy:', btree.score(X_test_std, y_test))

Training accuracy: 0.9982491975488765
Test accuracy: 0.8995915588835943


#### Grid Search plus 5 fold Cross-Validation

In [165]:
# Nested cross-validation for logistic regression: the inner 2-fold grid
# search picks C, the outer 5-fold loop scores the tuned pipeline.
#
# The C grid is defined explicitly here. When the notebook is run top to
# bottom, `param_range` was last assigned the tree-depth grid np.arange(1, 40),
# so reusing it would search C over 1..39 instead of the intended decades.
c_range = 10.0 ** np.arange(-4, 5)
param_grid = {'logisticregression__C': c_range}
gs = GridSearchCV(estimator=pipe_lr,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(estimator=gs, X=X_combined_std, y=y_combined,
                         scoring='accuracy', cv=5)
print('CV accuracy scores of Optimal LR: %s' % scores)

CV accuracy scores of Optimal LR: [0.77284329 0.77335375 0.75331971 0.77170582 0.75025536]


In [167]:
# Nested cross-validation for the decision tree: the inner 2-fold grid search
# picks max_depth, the outer 5-fold loop scores the tuned pipeline.
#
# The depth grid is defined explicitly here rather than read from the shared
# `param_range` variable, which other cells reassign to the logistic-regression
# C grid; depending on execution order it could hold non-integer values.
depth_range = np.arange(1, 40)
param_grid = {'decisiontreeclassifier__max_depth': depth_range}
gs_tree = GridSearchCV(estimator=pipe_tree,
                       param_grid=param_grid,
                       scoring='accuracy',
                       cv=2)

scores = cross_val_score(gs_tree, X_combined_std, y_combined,
                         scoring='accuracy', cv=5)
print('CV accuracy scores of Optimal Decision Tree: %s' % scores)

CV accuracy scores of Optimal Decision Tree: [0.90301174 0.90964778 0.90858018 0.90194076 0.91011236]
