### **Feature Extraction**

---

#### Curse of Dimensionality

<img src="../assets/curse_of_dimensionality.png"/>

#### **Principle Component Analysis**

Principal component analysis (PCA) is a statistical method that reduces the number of dimensions in a data table while retaining as much of the data's **variation** as possible.

<img src="../assets/pca.png"/>

In [1]:
import os
import pandas as pd

In [2]:
path = os.path.join("..","data","mnist_train.csv")
df = pd.read_csv(path, nrows=1000)
df.sample(5)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
476,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
242,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
740,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
284,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Step 1 - Mean centering
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaled_df = scaler.fit_transform(df.drop(columns=["label"]))
scaled_df.shape

(1000, 784)

In [7]:
# Step 2 - Find Covariance matrix
import numpy as np

cov_matrix = np.cov(scaled_df, rowvar=False)
cov_matrix.shape

(784, 784)

In [8]:
# Step 3 - Find eigenvectors and eigenvalues for the covariance matrix
# All the eigenvectors are the principle components for the given data 

eigen_values, eigen_vectors = np.linalg.eig(cov_matrix)

eigen_values.shape, eigen_vectors.shape

((784,), (784, 784))

In [11]:
# Step 4 - Transforn data into k principle components, by projection

k = 100
pca = eigen_vectors[0: k]

transformed_df = np.dot(scaled_df, pca.T)
transformed_df.shape

(1000, 100)

---

In [1]:
import os
import pandas as pd

In [2]:
path = os.path.join("..","data","winequality.csv")

df = pd.read_csv(path)
df.sample(5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3668,white,6.5,0.27,0.26,11.0,0.03,2.0,82.0,0.99402,3.07,0.36,11.2,5
4238,white,6.4,0.29,0.18,15.0,0.04,21.0,116.0,0.99736,3.14,0.5,9.2,5
6003,red,6.3,0.57,0.28,2.1,0.048,13.0,49.0,0.99374,3.41,0.6,12.8,5
6084,red,6.6,0.8,0.03,7.8,0.079,6.0,12.0,0.9963,3.52,0.5,12.2,5
4151,white,7.4,0.26,0.32,3.7,0.032,29.0,193.0,0.99134,3.1,0.67,12.5,6


In [5]:
# Data cleaning

df = df.drop_duplicates()
df = df.dropna()
df.shape

(5295, 13)

In [6]:
X = df.drop("type", axis=1)
y = df["type"]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
5399,10.4,0.44,0.73,6.55,0.074,38.0,76.0,0.999,3.17,0.85,12.0,7
4590,6.4,0.33,0.3,7.2,0.041,42.0,168.0,0.99331,3.22,0.49,11.1,6
2101,6.8,0.28,0.35,2.3,0.042,16.0,85.0,0.9906,3.19,0.56,12.4,6
3065,7.0,0.15,0.38,15.3,0.045,54.0,120.0,0.9975,3.18,0.42,9.8,6
3144,6.2,0.23,0.36,17.2,0.039,37.0,130.0,0.99946,3.23,0.43,8.8,6


In [9]:
# Step 1 - Mean centering

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Step 2 - Apply PCA

from sklearn.decomposition import PCA

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

X_train_scaled.shape, X_train_pca.shape

# Goal is to take as many features as to explain at least 90 percent of the variance
pca.explained_variance_ratio_

((3971, 12), (3971, 5))

---