# 1. Basics

### Preparation

In [2]:
# Import modules
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets

In [3]:
# Load scikit-learn's bundled handwritten-digits dataset (8x8 images).
# The returned object exposes, among other keys, 'data' (pixel values),
# 'target' (labels) and 'feature_names' ('pixel_<row>_<col>').
digits = datasets.load_digits()

# Display the whole dataset object; the repr is long, so inspecting
# `digits.data.shape` is a more compact alternative.
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'frame': None,
 'feature_names': ['pixel_0_0',
  'pixel_0_1',
  'pixel_0_2',
  'pixel_0_3',
  'pixel_0_4',
  'pixel_0_5',
  'pixel_0_6',
  'pixel_0_7',
  'pixel_1_0',
  'pixel_1_1',
  'pixel_1_2',
  'pixel_1_3',
  'pixel_1_4',
  'pixel_1_5',
  'pixel_1_6',
  'pixel_1_7',
  'pixel_2_0',
  'pixel_2_1',
  'pixel_2_2',
  'pixel_2_3',
  'pixel_2_4',
  'pixel_2_5',
  'pixel_2_6',
  'pixel_2_7',
  'pixel_3_0',
  'pixel_3_1',
  'pixel_3_2',
  'pixel_3_3',
  'pixel_3_4',
  'pixel_3_5',
  'pixel_3_6',
  'pixel_3_7',
  'pixel_4_0',
  'pixel_4_1',
  'pixel_4_2',
  'pixel_4_3',
  'pixel_4_4',
  'pixel_4_5',
  'pixel_4_6',
  'pixel_4_7',
  'pixel_5_0',
  'pixel_5_1',
 

In [69]:
# Standardize the pixel features (they are the input to PCA below)

features = StandardScaler().fit_transform(digits.data)
            # StandardScaler(): rescales each feature (column) to mean 0 and standard deviation 1;
            #                   it shifts and scales only, it does not make the data normally distributed
            # fit_transform(): 'fit' (learn the per-feature mean/std) followed by 'transform' (apply them)
            # 'digits.data': array holding the pixel values of the handwritten-digit images

features

array([[ 0.        , -0.33501649, -0.04308102, ..., -1.14664746,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649, -1.09493684, ...,  0.54856067,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649, -1.09493684, ...,  1.56568555,
         1.6951369 , -0.19600752],
       ...,
       [ 0.        , -0.33501649, -0.88456568, ..., -0.12952258,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649, -0.67419451, ...,  0.8876023 ,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649,  1.00877481, ...,  0.8876023 ,
        -0.26113572, -0.19600752]])

### Create PCA Components Retaining 99% of the Variance

In [11]:
# Configure PCA (nothing is fitted in this cell)
pca = PCA(n_components=0.99,  # float in (0, 1): keep as many components as needed to explain 99% of the variance
          whiten=True)   # Enable 'whitening'
                         # 'Whitening'
                         #    - Each retained principal component is rescaled to a variance of 1
                         #    - The new feature space consists of uncorrelated components

pca

In [7]:
# Run PCA: fit on the standardized features and project them onto the
# retained principal components in a single step
features_pca = pca.fit_transform(features)

features_pca

array([[ 0.70631939, -0.39512814, -1.73816236, ...,  0.36526417,
        -0.31369006,  0.05355504],
       [ 0.21732591,  0.38276482,  1.72878893, ..., -0.17818068,
        -0.14031747,  1.18179755],
       [ 0.4804351 , -0.13130437,  1.33172761, ..., -0.01924571,
        -0.23580029,  0.92966158],
       ...,
       [ 0.37732433, -0.0612296 ,  1.0879821 , ..., -1.05526847,
         1.75559618, -0.87894699],
       [ 0.39705007, -0.15768102, -1.08160094, ...,  0.10442881,
         0.65907949,  1.1292155 ],
       [-0.46407544, -0.92213976,  0.12493334, ..., -1.10593026,
         0.54434185, -0.26573597]])

In [10]:
# Result: compare the column counts before and after PCA.
# shape[1] is the number of columns, so both values are feature counts
# (dimensionality), not numbers of samples.
print("Number of Original data: ", features.shape[1])
print("Number of Reduced data: ", features_pca.shape[1])


Number of Original data:  64
Number of Reduced data:  54


# 2. Exercise with Handwriting Dataset

## 2.1. Preparation

### Preparation

In [24]:
# Import modules
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

### Load Dataset

In [18]:
# Load the digits dataset and check that there is one label per sample
digits = load_digits()

print(digits.data.shape[0])
print(digits.target.shape[0])

1797
1797


In [19]:
# Split into train/test sets (default split ratio, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    random_state=777,
)

# Sample counts of the feature splits, then of the label splits
print(x_train.shape[0], x_test.shape[0])
print(y_train.shape[0], y_test.shape[0])

1347 450
1347 450


## 2.2. Before StandardScaler

### Modelling

In [37]:
# Create the baseline model that will be trained on the raw (unscaled) pixels.
# max_iter is raised to 10000 -- presumably so the solver has enough
# iterations to converge on unscaled features; confirm if tuning.
model = LogisticRegression(max_iter=10000)

model

In [38]:
# Train on the raw training split, then predict labels for the test split.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = model.fit(x_train, y_train).predict(x_test)

y_pred

array([5, 6, 6, 2, 4, 1, 7, 3, 4, 3, 4, 7, 4, 1, 2, 6, 3, 7, 1, 2, 3, 2,
       0, 2, 6, 1, 9, 7, 7, 8, 4, 5, 2, 3, 7, 5, 9, 8, 7, 4, 7, 1, 1, 4,
       7, 8, 2, 5, 8, 6, 8, 5, 0, 7, 4, 6, 3, 1, 7, 6, 6, 6, 4, 6, 7, 0,
       9, 2, 7, 5, 1, 5, 5, 9, 4, 2, 4, 4, 0, 7, 3, 8, 0, 8, 6, 6, 3, 9,
       9, 3, 6, 1, 0, 9, 8, 9, 4, 0, 3, 9, 7, 4, 9, 1, 1, 0, 7, 6, 0, 8,
       5, 1, 5, 5, 9, 7, 2, 6, 1, 0, 2, 1, 0, 5, 0, 7, 9, 3, 8, 1, 0, 1,
       4, 5, 9, 4, 7, 1, 6, 4, 7, 5, 9, 5, 3, 7, 8, 9, 3, 2, 7, 5, 8, 5,
       0, 3, 4, 6, 3, 1, 9, 4, 5, 8, 8, 6, 9, 3, 7, 7, 6, 1, 6, 9, 6, 5,
       7, 1, 1, 0, 2, 1, 3, 0, 1, 4, 1, 0, 6, 8, 9, 8, 8, 4, 7, 6, 0, 8,
       8, 6, 8, 5, 5, 5, 7, 0, 3, 1, 9, 0, 3, 5, 2, 8, 0, 9, 0, 1, 5, 6,
       1, 0, 0, 8, 0, 6, 7, 9, 5, 1, 1, 5, 7, 3, 7, 8, 0, 0, 6, 8, 2, 6,
       2, 2, 9, 9, 0, 8, 4, 7, 1, 3, 0, 1, 9, 2, 1, 1, 0, 8, 8, 7, 3, 1,
       5, 7, 0, 9, 5, 2, 2, 7, 7, 5, 9, 8, 9, 2, 5, 4, 4, 2, 3, 0, 8, 1,
       5, 1, 2, 0, 5, 1, 3, 8, 9, 2, 8, 4, 5, 4, 7,

In [39]:
# Accuracy of the baseline (no StandardScaler) model on the test split;
# kept in its own variable for the comparison at the end of this section
no_standardScaler_acc_score = accuracy_score(y_test, y_pred)

no_standardScaler_acc_score

0.9533333333333334

## 2.3. After StandardScaler

### Normalization

In [40]:
# Create the scaler (unfitted); it is fitted on the training split in the next cell
scaler = StandardScaler()

In [41]:
# Standardize both splits with statistics learned from the training split only,
# so that no information from the test split leaks into preprocessing
scaler.fit(x_train)
x_train_norm = scaler.transform(x_train)
x_test_norm = scaler.transform(x_test)

x_train_norm, x_test_norm

(array([[ 0.        , -0.33086354, -0.87769069, ...,  0.22634127,
         -0.49497922, -0.19529843],
        [ 0.        , -0.33086354,  0.18186242, ...,  1.59429051,
          3.43712254,  7.46153672],
        [ 0.        , -0.33086354, -1.08960132, ..., -1.14160798,
         -0.49497922, -0.19529843],
        ...,
        [ 0.        ,  1.93143694,  2.08905802, ..., -1.14160798,
         -0.49497922, -0.19529843],
        [ 0.        , -0.33086354, -0.45386945, ...,  0.56832858,
         -0.49497922, -0.19529843],
        [ 0.        ,  4.19373742,  1.66523678, ..., -1.14160798,
         -0.49497922, -0.19529843]]),
 array([[ 0.        ,  0.8002867 ,  0.81759429, ..., -0.97061432,
         -0.49497922, -0.19529843],
        [ 0.        , -0.33086354, -1.08960132, ...,  1.42329686,
          0.73380258, -0.19529843],
        [ 0.        , -0.33086354, -1.08960132, ...,  1.42329686,
         -0.24922286, -0.19529843],
        ...,
        [ 0.        ,  1.93143694,  2.08905802, ..., -

### Modelling

In [42]:
# Create the model that will be trained on the standardized features.
# Use the same max_iter as the unscaled baseline model so that feature
# scaling is the only difference between the two runs being compared.
# (max_iter is only an upper bound; the solver still stops once it converges.)
model_norm = LogisticRegression(max_iter=10000)

model_norm

In [43]:
# Train on the standardized training split and predict the standardized test split.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred_norm = model_norm.fit(x_train_norm, y_train).predict(x_test_norm)

y_pred_norm

array([5, 6, 6, 2, 4, 1, 7, 3, 4, 3, 4, 7, 4, 1, 2, 6, 3, 7, 1, 2, 3, 2,
       0, 2, 6, 1, 9, 7, 7, 8, 4, 5, 2, 3, 7, 5, 9, 8, 7, 4, 7, 1, 1, 4,
       7, 8, 2, 5, 8, 6, 8, 5, 0, 7, 4, 6, 3, 1, 7, 6, 6, 6, 4, 6, 7, 0,
       9, 2, 7, 5, 1, 5, 5, 9, 4, 2, 4, 4, 0, 7, 3, 8, 0, 8, 6, 6, 3, 9,
       9, 3, 6, 1, 0, 9, 8, 8, 4, 0, 3, 9, 7, 4, 9, 8, 1, 0, 7, 6, 0, 8,
       5, 1, 5, 5, 9, 7, 2, 6, 1, 0, 2, 1, 0, 5, 0, 7, 9, 3, 8, 1, 0, 1,
       4, 5, 3, 4, 7, 5, 6, 4, 7, 5, 9, 5, 3, 7, 8, 9, 3, 2, 7, 5, 8, 5,
       0, 3, 4, 6, 3, 1, 9, 4, 5, 8, 8, 6, 3, 3, 7, 7, 6, 1, 6, 9, 6, 5,
       7, 1, 1, 0, 2, 1, 3, 0, 1, 4, 1, 0, 6, 8, 9, 8, 8, 4, 7, 6, 0, 8,
       8, 6, 8, 5, 5, 5, 7, 0, 3, 1, 9, 0, 3, 5, 2, 8, 0, 9, 0, 1, 9, 6,
       1, 0, 0, 8, 0, 6, 7, 5, 5, 1, 1, 5, 7, 3, 7, 8, 0, 0, 6, 8, 2, 6,
       2, 2, 9, 9, 0, 8, 4, 7, 1, 3, 0, 1, 9, 2, 1, 1, 0, 8, 8, 7, 3, 1,
       5, 7, 0, 9, 5, 2, 2, 7, 7, 5, 9, 8, 9, 2, 5, 4, 4, 2, 3, 0, 8, 1,
       5, 1, 2, 0, 5, 1, 3, 8, 9, 2, 8, 4, 5, 4, 7,

In [44]:
# Accuracy of the model trained on standardized features, on the same test labels
standardScaler_accuracy_score = accuracy_score(y_test, y_pred_norm)

# NOTE(review): 'Accuray' in the message below is a typo for 'Accuracy'
print('Accuray with normalization: ', standardScaler_accuracy_score)

Accuray with normalization:  0.9555555555555556


### Result Comparison

In [49]:
# Print both accuracies (without scaling, then with scaling) on separate lines.
# print() inserts its default space separator around the '\n' argument, which is
# why the first line ends with a space and the second line starts with one.
print(no_standardScaler_acc_score, '\n', standardScaler_accuracy_score)

0.9533333333333334 
 0.9555555555555556


# 3. Reducing Linearly Inseparable Data

### Preparation

In [50]:
# Import modules
from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_circles

In [51]:
# Generate data that is not linearly separable: a small circle inside a larger one.
# The class labels returned as the second value are discarded (`_`).
# NOTE: this rebinds `features`, which earlier held the standardized digits data.
features, _ = make_circles(n_samples=1000,    # total number of points
                          random_state=85,   # fixed seed for reproducibility
                          noise=0.1,         # standard deviation of the Gaussian noise added to the points
                          factor=0.1)        # size of the inner circle relative to the outer one

features

array([[ 0.11512608, -0.1404095 ],
       [ 0.82975712, -0.27623227],
       [-0.15788743, -0.05662242],
       ...,
       [-0.03189136, -0.05701366],
       [-0.4337422 ,  0.97389709],
       [-0.14144344, -0.95212533]])

In [53]:
# Configure Kernel PCA (KPCA) with an RBF kernel, reducing to a single component
# NOTE(review): the variable name `kcpa` looks like a typo for `kpca`; it is kept
# as-is because the following cell refers to it by this name.
kcpa = KernelPCA(kernel='rbf',
                gamma=15,
                n_components=1)

kcpa

In [59]:
# Radial Basis Function (RBF) kernel PCA: fit on the circle data and project it
# onto the single retained component
features_rbf = kcpa.fit_transform(features)

features_rbf

array([[ 0.27758804],
       [-0.37793593],
       [ 0.37362358],
       [ 0.15792736],
       [-0.38759982],
       [-0.37282623],
       [ 0.26688617],
       [-0.38317404],
       [ 0.55018078],
       [-0.38018402],
       [ 0.55573779],
       [-0.3953011 ],
       [ 0.33494762],
       [-0.39079029],
       [-0.36786334],
       [-0.38738098],
       [ 0.33819745],
       [-0.38781449],
       [ 0.5090142 ],
       [ 0.23376007],
       [ 0.61729293],
       [-0.37985111],
       [ 0.28216951],
       [ 0.56568856],
       [ 0.31970831],
       [-0.38295851],
       [-0.39279007],
       [-0.38915437],
       [-0.39505028],
       [ 0.53595758],
       [-0.39154596],
       [ 0.25997297],
       [-0.384811  ],
       [ 0.60098102],
       [-0.37814629],
       [-0.36276908],
       [-0.37030551],
       [ 0.34314676],
       [ 0.36032582],
       [ 0.50118748],
       [-0.38207187],
       [ 0.53283222],
       [-0.36893602],
       [-0.38597722],
       [-0.37239508],
       [-0

In [60]:
# Result: shape[1] is the column count, i.e. the number of features
# (dimensions) before and after Kernel PCA, not the number of samples
print('Number of Original Data: ', features.shape[1])
print('Number of Reduced Data: ', features_rbf.shape[1])

Number of Original Data:  2
Number of Reduced Data:  1


# 4. Linear Discriminant Analysis

### Preparation

In [61]:
# Import modules
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [65]:
# Load scikit-learn's bundled iris dataset (four numeric features per sample)
iris = datasets.load_iris()

# Display the whole dataset object; the repr is long, so inspecting
# `iris.data.shape` is a more compact alternative.
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [67]:
# Separate the feature matrix from the class labels; LDA is supervised and
# needs both.
# NOTE: this rebinds `features`, which previously held the make_circles data.
features = iris.data
target = iris.target

features, target

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [68]:
# Linear Discriminant Analysis (LDA): supervised reduction to one component.
# fit_transform(X, y) fits on the labelled data and projects X in one call.
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit_transform(features, target)

print('Number of Original Data: ', features.shape[1])
print('Number of Reduced Data: ', features_lda.shape[1])

Number of Original Data:  4
Number of Reduced Data:  1
