# Data Preprocessing

- Import the data
- Get the description of the data
- Decrease the dimension of the data
    - PCA
    

In [79]:
# import the libraries

# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bays
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier

from sklearn.decomposition import PCA
import seaborn as sb

## Import the data

In [2]:
# import the data
train_df = pd.read_csv('./../data/TrainingSet/ACT1_competition_training.csv')
test_df = pd.read_csv('./../data/TestSet/ACT1_competition_test.csv')
combine = [train_df, test_df]
# combine is used to ensure whatever preprocessing is done on training data
# is also done on test data

In [3]:
train_df.head()

Unnamed: 0,MOLECULE,Act,D_3,D_4,D_5,D_6,D_7,D_8,D_9,D_11,...,D_11061,D_11064,D_11065,D_11066,D_11067,D_11068,D_11070,D_11074,D_11076,D_11078
0,ACT1_M_80,6.0179,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ACT1_M_189,4.3003,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ACT1_M_190,5.2697,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ACT1_M_402,6.1797,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ACT1_M_659,4.3003,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Get the description of the data

In [4]:
# Features/Variable names

train_df.columns

Index(['MOLECULE', 'Act', 'D_3', 'D_4', 'D_5', 'D_6', 'D_7', 'D_8', 'D_9',
       'D_11',
       ...
       'D_11061', 'D_11064', 'D_11065', 'D_11066', 'D_11067', 'D_11068',
       'D_11070', 'D_11074', 'D_11076', 'D_11078'],
      dtype='object', length=9493)

In [5]:
# preview the data

train_df.head(5)

Unnamed: 0,MOLECULE,Act,D_3,D_4,D_5,D_6,D_7,D_8,D_9,D_11,...,D_11061,D_11064,D_11065,D_11066,D_11067,D_11068,D_11070,D_11074,D_11076,D_11078
0,ACT1_M_80,6.0179,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ACT1_M_189,4.3003,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ACT1_M_190,5.2697,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ACT1_M_402,6.1797,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ACT1_M_659,4.3003,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# General data statistics

train_df.describe()

Unnamed: 0,Act,D_3,D_4,D_5,D_6,D_7,D_8,D_9,D_11,D_15,...,D_11061,D_11064,D_11065,D_11066,D_11067,D_11068,D_11070,D_11074,D_11076,D_11078
count,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,...,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0,37241.0
mean,4.690458,8.1e-05,5.4e-05,0.00043,0.000107,2.7e-05,8.1e-05,0.000242,5.4e-05,2.7e-05,...,2.7e-05,2.7e-05,8.1e-05,0.000215,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05,2.7e-05
std,0.648661,0.015546,0.010364,0.020723,0.010363,0.005182,0.008975,0.015544,0.007328,0.005182,...,0.005182,0.005182,0.008975,0.014655,0.005182,0.005182,0.005182,0.005182,0.005182,0.005182
min,4.3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.9033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8.134,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Data Frame information (null, data type etc)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37241 entries, 0 to 37240
Columns: 9493 entries, MOLECULE to D_11078
dtypes: float64(1), int64(9491), object(1)
memory usage: 2.6+ GB


## Decrease the dimension of the data
### PCA via SVD

**Description of SVD**

You can take any matrix X, it doesn’t matter if it is square, singular or diagonal, you can decompose it into a product of three matrices(as shown in the figure below); two orthogonal matrices U and V and diagonal matrix D. The orthogonal matrix has same dimensions as your data matrix and then your diagonal matrix is square and it has dimensions kxk (k is the number of variables you have), V is again a square matrix.

![SVD1](./../image/svd_1.png)

**What Does SVD Has To Do With Dimensionality Reduction? **

The image below shows how I reduce the number of dimensions from k to q(k<q).  If you reduce the number of column vectors to q , then you have obtained the q-dimensional hyper-plane in this example. The values of D gives you the amount of variance retained by this reduction.

![SVD2](./../image/svd_2.png)

**How to pick a q? **

Calculate the Variance of the eigenvectors. The best way to figure out how much variance does your dimensions capture is to plot a scree plot.

![SVD3](./../image/svd_3.png)

You can find this link for more information: https://bigdata-madesimple.com/decoding-dimensionality-reduction-pca-and-svd/

In [26]:
# drop the first 2 columns
# keep the features merely
train_arr = train_df.iloc[:, 2 : ].values
# use SVD to decompose the original matrix
u, s, vh = np.linalg.svd(train_arr, full_matrices=False)
print(u.shape, s.shape, vh.shape)

In [94]:
def Reconstruct(q=130):
    # reconstruct a matrix with q eigen vectors
    print("q = ", q)
    # slice the matricies
    u_r, s_r, vh_r = u[:, :q], s[0:q], vh[:q, :]
    print("U_r shape:", u_r.shape, "S_r shape:", s_r.shape, "Vh_r shape:", vh_r.shape)
    # reshape s_r which was a (q, ) array
    s_r = s_r.reshape((1, q))
    # reconstruct the data matrix with q components
    train_recstr = np.dot(u_r, np.dot(s_r * np.eye(q), vh_r))
    print("Matrix after reconstruction: ", train_recstr.shape)

In [95]:
Reconstruct(130)

q =  130
U_r shape: (37241, 130) S_r shape: (130,) Vh_r shape: (130, 9491)
Matrix after reconstruction:  (37241, 9491)


## Use PCA directly

more information: https://etav.github.io/python/scikit_pca.html


In [81]:
covar_matrix = PCA(n_components = 500) # pick 500 features

Y_sklearn = covar_matrix.fit(train_arr)
variance = covar_matrix.explained_variance_ratio_ #calculate variance ratios

var=np.cumsum(np.round(covar_matrix.explained_variance_ratio_, decimals=3)*100)
var #cumulative sum of variance explained with [n] features

array([32.9, 42.6, 48.7, 52.4, 55.1, 57.4, 59.4, 61.3, 62.9, 64.4, 65.8,
       67. , 68.2, 69.3, 70.2, 71.1, 71.8, 72.5, 73.2, 73.8, 74.4, 75. ,
       75.5, 76. , 76.5, 76.9, 77.3, 77.7, 78.1, 78.5, 78.8, 79.1, 79.4,
       79.7, 80. , 80.3, 80.6, 80.9, 81.1, 81.3, 81.5, 81.7, 81.9, 82.1,
       82.3, 82.5, 82.7, 82.9, 83.1, 83.3, 83.5, 83.7, 83.9, 84.1, 84.3,
       84.5, 84.7, 84.8, 84.9, 85. , 85.1, 85.2, 85.3, 85.4, 85.5, 85.6,
       85.7, 85.8, 85.9, 86. , 86.1, 86.2, 86.3, 86.4, 86.5, 86.6, 86.7,
       86.8, 86.9, 87. , 87.1, 87.2, 87.3, 87.4, 87.5, 87.6, 87.7, 87.8,
       87.9, 88. , 88.1, 88.2, 88.3, 88.4, 88.5, 88.6, 88.7, 88.8, 88.9,
       89. , 89.1, 89.2, 89.3, 89.4, 89.5, 89.6, 89.7, 89.8, 89.9, 90. ,
       90.1, 90.2, 90.3, 90.4, 90.5, 90.6, 90.7, 90.8, 90.9, 91. , 91.1,
       91.2, 91.3, 91.4, 91.5, 91.6, 91.7, 91.8, 91.9, 92. , 92.1, 92.2,
       92.2, 92.2, 92.2, 92.2, 92.2, 92.2, 92.2, 92.2, 92.2, 92.2, 92.2,
       92.2, 92.2, 92.2, 92.2, 92.2, 92.2, 92.2, 92

In [90]:
itemindex = np.where(var >= 92)
print(itemindex)

(array([130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
       182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
       195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
       208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
       221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
       234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
       247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
       260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
       273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
       286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
       299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 3

In the above 2 matrices we see that the first feature explains roughly 32.9% of the variance within our data set while the first two explain 42.6% and so on. If we employ 132 features we capture 92.2% of the variance within the dataset, thus we gain very little by implementing an additional feature (think of this as diminishing marginal return on total variance explained).

In [91]:
Y_sklearn = covar_matrix.fit(train_arr)

In [None]:
# haven't finished
data = []

for name, col in train_df.values():
    trace = dict(
        type='scatter',
        x=Y_sklearn[y==name,0],
        y=Y)

https://plot.ly/ipython-notebooks/principal-component-analysis/