In [1]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA, KernelPCA # PCA, Kernel PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # LDA

"""Matrix decomposition algorithms.
These include PCA, NMF, ICA, and more. 
Most of the algorithms of this module can be regarded as dimensionality reduction techniques"""

from sklearn.ensemble import RandomForestClassifier


[Documentação do PCA](https://scikit-learn.org/dev/modules/generated/sklearn.decomposition.PCA.html)

### EDA

In [2]:
# Read the census data from 'census.csv' (path relative to the working directory).
census = pd.read_csv('census.csv')
# Summarise the columns, non-null counts and dtypes of the loaded frame.
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   final-weight    32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loos    32561 non-null  int64 
 12  hour-per-week   32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [3]:
# Positional split of the frame: columns 0-13 become the feature matrix,
# column 14 (income) becomes the target vector.
X_census: np.ndarray = census.iloc[:, 0:14].values # predictors
y_census: np.ndarray = census.iloc[:, 14].values # income class (target)

In [4]:
### Label-encode the categorical columns
# One independent LabelEncoder per categorical column, so every fitted
# mapping stays reachable under its own name afterwards.
(
    encoder_workclass,
    encoder_education,
    encoder_marital,
    encoder_occupation,
    encoder_relationship,
    encoder_race,
    encoder_sex,
    encoder_country,
) = (LabelEncoder() for _ in range(8))

# (column position in X_census, encoder) pairs, in fitting order.
categorical_encoders = (
    (1, encoder_workclass),
    (3, encoder_education),
    (5, encoder_marital),
    (6, encoder_occupation),
    (7, encoder_relationship),
    (8, encoder_race),
    (9, encoder_sex),
    (13, encoder_country),
)

# Overwrite each categorical column, in place, with its integer codes.
for column_index, column_encoder in categorical_encoders:
    X_census[:, column_index] = column_encoder.fit_transform(X_census[:, column_index])


In [5]:
# Inspect the first row: after label encoding every attribute holds a number
X_census[0]

array([39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 39], dtype=object)

In [6]:
### Standardization
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split done in a later cell, so test-set statistics influence the scaling.
# Consider fitting on the training split only.
scaler_census: StandardScaler = StandardScaler()
# Rebinds X_census: the label-encoded matrix is replaced by its standardized version.
X_census: np.ndarray = scaler_census.fit_transform(X_census)
X_census


array([[ 0.03067056,  2.15057856, -1.06361075, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [ 0.83710898,  1.46373585, -1.008707  , ..., -0.21665953,
        -2.22215312,  0.29156857],
       [-0.04264203,  0.09005041,  0.2450785 , ..., -0.21665953,
        -0.03542945,  0.29156857],
       ...,
       [ 1.42360965,  0.09005041, -0.35877741, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [-1.21564337,  0.09005041,  0.11095988, ..., -0.21665953,
        -1.65522476,  0.29156857],
       [ 0.98373415,  0.77689313,  0.92989258, ..., -0.21665953,
        -0.03542945,  0.29156857]])

In [7]:
### Train/test split
# Hold out 15% of the rows for testing; random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_census, y_census, test_size=0.15, random_state=0)

In [8]:
# Show the held-out feature matrix.
X_test

array([[-0.84908045,  0.09005041, -0.11993902, ..., -0.21665953,
         0.28852962,  0.29156857],
       [-0.84908045,  0.09005041,  0.25298957, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [-0.99570562,  0.09005041,  0.62989738, ..., -0.21665953,
        -0.03542945,  0.29156857],
       ...,
       [-1.28895595,  0.09005041, -0.12885437, ..., -0.21665953,
        -1.97918382,  0.29156857],
       [-0.6291427 ,  0.09005041, -1.03415505, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [ 0.25060831,  0.09005041,  0.11100726, ..., -0.21665953,
        -0.03542945,  0.29156857]])

In [9]:
# Shapes of the training features and of the training labels.
X_train.shape, y_train.shape

((27676, 14), (27676,))

Utilizaremos os algoritmos de redução de dimensionalidade PCA, Kernel PCA e LDA para reduzir os 14 atributos originais: para 6 componentes no PCA, 8 no Kernel PCA e apenas 1 no LDA.

### PCA

In [10]:
# PCA configured to keep 6 components.
pca: PCA = PCA(n_components=6)

In [11]:
# Fit the PCA on the training split and project it in a single step.
X_train_pca = pca.fit_transform(X_train) # pca is already fitted here, so the test split below only needs transform()
X_test_pca = pca.transform(X_test)

In [12]:
# Shapes of the PCA-projected training and test sets.
X_train_pca.shape,  X_test_pca.shape

((27676, 6), (4885, 6))

Os 14 atributos foram combinados, gerando **6 novos atributos** (componentes) que têm como base os 14 anteriores. Nesse sentido, nossa base está completamente diferente da original.

In [13]:
# Fraction of the total variance explained by each of the kept components.
pca.explained_variance_ratio_

array([0.151561  , 0.10109701, 0.08980379, 0.08076277, 0.07627678,
       0.07357646])

> Interpretação  
> 1 atributo : 15% da variância explicada   
> 2 atributos: (15 + 10)% da variância explicada  
> 3 atributos: (15 + 10 + 9)% da variância explicada

In [14]:
### With 6 components about 57% of the variance is explained
pca.explained_variance_ratio_.sum()

0.5730778058904427

<h2> Random Forest para teste da nova base </h2>

In [15]:
# Show the training labels (income class strings).
y_train

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' >50K', ' <=50K'],
      dtype=object)

In [16]:
# Random Forest with 40 trees and a fixed seed, trained on the PCA-projected training set.
rf_pca = RandomForestClassifier(n_estimators=40, random_state=0)
rf_pca.fit(X_train_pca, y_train)

In [17]:
# Predict the income class for the PCA-projected test set.
y_pred = rf_pca.predict(X_test_pca)
y_pred

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [18]:
# True labels of the test set, for visual comparison with the predictions.
y_test

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype=object)

In [19]:
### Accuracy
# Fraction of test rows whose predicted label equals the true label:
# the mean of the element-wise boolean match array.
np.mean(y_test == y_pred)

0.8292732855680655

## Kernel PCA

```python
# Kernel PCA with an RBF kernel, keeping 8 components.
kernel_pca: KernelPCA = KernelPCA(n_components=8, kernel='rbf')
# Fit on the training split only and project it.
X_train_kpca = kernel_pca.fit_transform(X_train)
# Project the test split with the already-fitted model. Calling `fit` here
# would refit on the test data and return the estimator, not an array.
X_test_kpca = kernel_pca.transform(X_test)

```

```python
# Random Forest with 40 trees and a fixed seed, trained on the Kernel PCA
# projection of the training set.
rf_kpca = RandomForestClassifier(n_estimators=40, random_state=0)
rf_kpca.fit(X_train_kpca, y_train)

# The name must match the one read in the accuracy expression below.
y_pred_kpca = rf_kpca.predict(X_test_kpca)

# Accuracy: fraction of test rows whose predicted label equals the true label.
np.sum(y_test == y_pred_kpca) / y_pred_kpca.size

```

```python
# Compare the shapes of the Kernel PCA outputs for the training and test sets.
X_train_kpca.shape ,X_test_kpca.shape 
```
> Output: (27676, 8) , (4885, 8)


A acurácia (accuracy), dessa forma, foi de 83%, comparada a 82% sem reduzir a dimensionalidade.

## LDA  
[Documentação LDA](https://scikit-learn.org/dev/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html)

O LDA vai reduzir de 14 atributos para apenas 1 componente, pois temos `n_classes = 2`, que equivalem a $\le 50K$ e $> 50K$. Segue a docstring:  

n_components : int, default=None  

Number of components (<= min(n_classes - 1, n_features)) for dimensionality reduction.   
If None, will be set to min(n_classes - 1, n_features). This parameter only affects the `transform` method.

Essa restrição levou a 73% de acerto (ruim)