In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

In [2]:
dataset = load_iris()

In [3]:
type(dataset)

sklearn.utils._bunch.Bunch

In [4]:
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [5]:
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [6]:
X = dataset['data']
y = dataset['target']

In [7]:
len(X), len(y)

(150, 150)

In [8]:
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

### Using PCA, convert 3 features to 2 features

In [9]:
pca = PCA(n_components=3)

In [10]:
X_train = pca.fit_transform(X_train)
X_train

array([[-3.17191553e+00,  1.29457045e-01,  2.74955159e-01],
       [-2.34745821e+00,  1.33781523e+00,  2.71788068e-01],
       [ 9.71520952e-01,  4.86044204e-01, -3.20689648e-01],
       [-2.56753356e+00,  1.60522934e-02,  8.62344384e-02],
       [-2.95088792e+00, -3.43587883e-01,  1.72320523e-01],
       [ 1.57397400e+00, -3.56283392e-01, -1.34399639e-01],
       [ 9.76313730e-01,  3.35516537e-01, -2.45953936e-02],
       [-2.51851294e+00,  3.69738445e-01, -9.24482207e-02],
       [-2.68502537e+00,  3.26991300e-01,  7.57370407e-02],
       [-2.60734589e+00,  8.14321255e-01,  2.16884159e-01],
       [ 1.46242806e+00, -5.57915855e-01,  2.83802576e-01],
       [ 8.50516252e-01,  2.08313940e-01,  3.81872802e-01],
       [ 1.26454870e+00,  4.27647245e-01, -2.42585185e-01],
       [-2.58248688e+00,  8.09432716e-01,  1.26361388e-01],
       [-2.46440403e+00,  6.47300146e-01, -8.69320372e-02],
       [-1.40580297e-01, -6.65638257e-01, -3.23323589e-01],
       [ 1.49066461e+00, -1.21522133e-01

In [11]:
pca.components_

array([[ 0.35922175, -0.08867716,  0.85800365,  0.35626652],
       [ 0.66072216,  0.72708635, -0.16688652, -0.08331002],
       [-0.5781132 ,  0.60569412,  0.08006791,  0.54084091]])

In [12]:
X_test = pca.transform(X_test)

In [13]:
pca.explained_variance_ratio_

array([0.91959926, 0.05714377, 0.01838378])

In [14]:
!pip install plotly
import plotly.express as px



In [15]:
import pandas as pd
df = pd.DataFrame(X_train, columns = ['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,-3.171916,0.129457,0.274955
1,-2.347458,1.337815,0.271788
2,0.971521,0.486044,-0.32069
3,-2.567534,0.016052,0.086234
4,-2.950888,-0.343588,0.172321


In [16]:
fig = px.scatter_3d(df, x='A', y='B', z='C')
#fig.show()