# Feature Dimension Reduction Using LDA and PCA

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [4]:
# Read only the first 20,000 rows of the training file.
# NOTE(review): the preview of `data` shows an 'Unnamed: 0' column next to 'ID';
# the CSV appears to carry a saved row index — confirm it should be loaded as a column.
data = pd.read_csv('santander-train.csv', nrows = 20000)
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [5]:
# Separate the label column from the predictors.
# NOTE(review): every column other than TARGET stays in X — confirm that
# identifier-like columns are meant to be used as features.
y = data['TARGET']
X = data.drop(columns='TARGET')
X.shape, y.shape

((20000, 370), (20000,))

In [6]:
# Hold out 20% of the rows; stratifying on y keeps the TARGET class ratio
# the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Remove Constant, Quasi Constant and Duplicate Features

In [7]:
# Drop constant and quasi-constant features: VarianceThreshold discards every
# column whose variance on the training split is not above 0.01.
# The selector is fitted on the training split only and then applied to both splits.
constant_filter = VarianceThreshold(threshold=0.01).fit(X_train)
X_train_filter, X_test_filter = (
    constant_filter.transform(split) for split in (X_train, X_test)
)

X_train_filter.shape, X_test_filter.shape

((16000, 245), (4000, 245))

In [10]:
# Let’s remove duplicated features from the data.

# Duplicate features are detected as duplicate rows of the transposed
# matrices, so each feature becomes one DataFrame row.
X_train_T = pd.DataFrame(X_train_filter.T)
X_test_T = pd.DataFrame(X_test_filter.T)

# Number of features that repeat an earlier feature in the training split.
X_train_T.duplicated().sum()

18

In [11]:
# Keep only the first occurrence of each feature. The mask is derived from the
# training split and applied to both splits so their columns stay aligned.
duplicated_features = X_train_T.duplicated()
features_to_keep = (~duplicated_features).tolist()
X_train_unique = X_train_T.loc[features_to_keep].T
X_test_unique = X_test_T.loc[features_to_keep].T

In [13]:
# Standardize every feature using statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train_unique)
X_train_unique = pd.DataFrame(scaler.transform(X_train_unique))
X_test_unique = pd.DataFrame(scaler.transform(X_test_unique))

X_train_unique.shape, X_test_unique.shape

((16000, 227), (4000, 227))

# Removal of Correlated Features

In [14]:
# Pairwise correlation matrix of the standardized training features.
# NOTE(review): get_correlation() calls data.corr() on its own argument, so it
# does not read this variable — confirm whether this cell is still needed.
corrmat = X_train_unique.corr()

In [15]:
def get_correlation(data, threshold):
    """Return the columns of ``data`` that are highly correlated with an earlier column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``data.corr()``.
    threshold : float
        A column is flagged when its absolute correlation with any column
        positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Labels of the flagged columns. A column is only ever flagged because of
        a column that precedes it, so the first column is never returned.
    """
    corrmat = data.corr()
    strength = np.abs(corrmat.to_numpy())
    # Strict lower triangle: compare each column only with the columns before it.
    # NaN correlations compare as False and are therefore never flagged.
    flagged = np.tril(strength > threshold, k=-1).any(axis=1)
    return set(corrmat.columns[flagged])

# Flag every feature whose absolute correlation with an earlier feature exceeds 0.70.
corr_features = get_correlation(X_train_unique, 0.70)
print('correlated features: ', len(corr_features))

correlated features:  148


In [16]:
# Remove the flagged columns from both splits so they keep identical feature sets.
X_train_uncorr = X_train_unique.drop(columns=corr_features)
X_test_uncorr = X_test_unique.drop(columns=corr_features)

X_train_uncorr.shape, X_test_uncorr.shape

((16000, 79), (4000, 79))

# Feature Dimension Reduction by LDA

In [17]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [18]:
'''
The number of components we can pass here is 1, because the maximum number of components
LDA can produce is the total number of classes – 1. Here, the Santander problem is a binary problem: the target is either 0 or 1,
so the maximum number of components is 1.
Even if we select more than 1, it will be treated as 1.
'''

'\nThe number of components which we can pass here is 1 why beacuse if you remember the maximum number of components \nwe can select are total number of classes – 1. Here, santadard problem is a biclass problem those are either 0 or 1. \nso the maximum number of components is 1.\nEven if we select more than 1, it will treat as 1.\n'

In [19]:
# Fit LDA on the training split only, then project both splits onto the
# single discriminant axis.
lda = LDA(n_components=1)
lda.fit(X_train_uncorr, y_train)
X_train_lda = lda.transform(X_train_uncorr)
X_test_lda = lda.transform(X_test_uncorr)

In [20]:
# Check the shapes of both LDA-projected splits.
X_train_lda.shape, X_test_lda.shape

((16000, 1), (4000, 1))

In [22]:
# Inspect the LDA-projected training values.
X_train_lda

array([[-0.85882526],
       [ 0.65214263],
       [-1.01475523],
       ...,
       [ 1.72799436],
       [-0.96410962],
       [-0.76560697]])

In [23]:
def run_randomForest(X_train, X_test, y_train, y_test):
    """Train a random forest on the training split and print its test-set accuracy.

    The forest is built with 100 trees, ``random_state=0`` and ``n_jobs=-1``.
    Nothing is returned; the score is only printed.

    NOTE(review): ``roc_auc_score`` is imported at the top of the notebook but
    never used, and accuracy alone can hide class imbalance — confirm the
    metric choice.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, predictions))

In [30]:
%time
run_randomForest(X_train_lda, X_test_lda, y_train, y_test)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
Accuracy on test set: 
0.93025


In [31]:
%time
run_randomForest(X_train_uncorr, X_test_uncorr, y_train, y_test)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs
Accuracy on test set: 
0.9585


In [33]:
%time
run_randomForest(X_train, X_test, y_train, y_test)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
Accuracy on test set: 
0.9585


In [34]:
'''
So, as we can see here, the accuracy on the original dataset is higher than on the transformed dataset.
But the training time on the original dataset is about double that of the transformed version, and the dimension has also been reduced.

From this, we can observe that LDA gives no guarantee on accuracy, but it does guarantee a reduction in
dimension and CPU time.

Actually, we cannot say for certain that the time is reduced either: if you execute the cells multiple times, both the original and the transformed datasets give
different timings on each run.
'''

'\nSo, if see here accuracy on the original dataset is more compared to transformed dataset. \nBut, the training time original dataset is double than tranformed version and the dimension also has been reduced.\n\nFrom this, we can observe LDA won’t give guarantee on the accuracy but it will give guarantee on the reduction in \ndimension and cpu time.\n\nActually we cannot say time also reduced, if you execute multiple times, both trained and transformed gives diff\ndiff time.\n'

# Feature Reduction by PCA

In [35]:
from sklearn.decomposition import PCA

In [36]:
# Learn the two leading principal components from the training split only.
# Ending the cell with the fit call displays the fitted estimator; the pasted
# estimator repr that used to follow only constructed an unused second PCA object.
pca = PCA(n_components=2, random_state=42)
pca.fit(X_train_uncorr)

PCA(n_components=2, random_state=42)

In [37]:
# Let’s go ahead and get training and testing dataset by PCA transformation.

# Project both splits onto the components learned from the training split.
X_train_pca, X_test_pca = (
    pca.transform(X_train_uncorr),
    pca.transform(X_test_uncorr),
)

X_train_pca.shape, X_test_pca.shape

((16000, 2), (4000, 2))

In [39]:
# Inspect the PCA-projected training values.
X_train_pca

array([[-0.94370357,  0.03598099],
       [-2.0434708 ,  0.06223386],
       [ 0.03606085, -0.13012272],
       ...,
       [-1.57988208,  0.05142811],
       [-0.95575252,  0.03714793],
       [-1.3394553 ,  0.06149278]])

In [40]:
# Now, find out the accuracy and cpu time of the transformed dataset.

%time
run_randomForest(X_train_pca, X_test_pca, y_train, y_test)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs
Accuracy on test set: 
0.956


In [41]:
# Let’s get the accuracy and cpu time of the original dataset.

%time
run_randomForest(X_train, X_test, y_train, y_test)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
Accuracy on test set: 
0.9585


In [42]:
# Shape of the decorrelated training matrix that PCA was fitted on.
X_train_uncorr.shape

(16000, 79)

In [43]:
# Here, too, there is no benefit in time or accuracy, so feature selection/reduction may or may not help.

In [45]:
# Compare test accuracy across several component counts instead of fixing the
# number by hand. Only 1 through 9 components are tried here; widen the range
# to explore more of the available features.
# NOTE: the loop rebinds pca, X_train_pca and X_test_pca, so after it finishes
# they hold the results of the last (9-component) iteration.
for component in range(1, 10):
    pca = PCA(n_components=component, random_state=42).fit(X_train_uncorr)
    X_train_pca = pca.transform(X_train_uncorr)
    X_test_pca = pca.transform(X_test_uncorr)
    print('Selected Components: ', component)
    run_randomForest(X_train_pca, X_test_pca, y_train, y_test)
    print()  # blank line between runs

Selected Components:  1
Accuracy on test set: 
0.92375

Selected Components:  2
Accuracy on test set: 
0.956

Selected Components:  3
Accuracy on test set: 
0.95675

Selected Components:  4
Accuracy on test set: 
0.95825

Selected Components:  5
Accuracy on test set: 
0.9575

Selected Components:  6
Accuracy on test set: 
0.95725

Selected Components:  7
Accuracy on test set: 
0.9565

Selected Components:  8
Accuracy on test set: 
0.9565

Selected Components:  9
Accuracy on test set: 
0.9555



In [None]:
# So instead of choosing 2 components manually, we can see that 4 components give the
# highest accuracy within the tested range (1 to 9).