In [30]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.metrics import accuracy_score


In [2]:
data = arff.loadarff('DryBeanDataset/Dry_Bean_Dataset.arff')
df = pd.DataFrame(data[0])

df['Class'] = df['Class'].str.decode('utf-8')
df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380.0,624.11,212.82613,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140.0,620.134,201.847882,190.279279,1.060798,0.33368,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [21]:
X = df.iloc[:, :-1]

# We need to encode the classes as number for scoring accuracy
Y_unencoded = df.iloc[:, -1]
le=preprocessing.LabelEncoder()
le.fit(Y_unencoded)
Y = le.transform(Y_unencoded)

0           SEKER
1           SEKER
2           SEKER
3           SEKER
4           SEKER
           ...   
13606    DERMASON
13607    DERMASON
13608    DERMASON
13609    DERMASON
13610    DERMASON
Name: Class, Length: 13611, dtype: object

In [68]:
fig = px.scatter(df, x='Area', y='Compactness', color='Class', title='Bean Area and Compactness')

fig.write_html("html_figures/bean_area_compactness.html")
fig.show()


In [5]:
pca = PCA(n_components=2)

X_transformed = pca.fit_transform(X)

In [6]:
df_pca = pd.DataFrame(X_transformed)
df_pca['Class'] = df['Class']

In [71]:
fig = px.scatter(df_pca, x=0, y=1, color='Class', title='First Two Principal Components')

fig.update_layout(
    xaxis_title='PC1',
    yaxis_title='PC2',
)

fig.write_html("html_figures/pca2d.html")
fig.show()

In [8]:
pca3 = PCA(n_components=3)
X_transformed_3 = pca3.fit_transform(X)

df_pca_3 = pd.DataFrame(X_transformed_3)
df_pca_3['Class'] = df['Class']

In [73]:
fig = px.scatter_3d(df_pca_3, x=0, y=1, z=2, color='Class')
fig.update_layout(
    title='First Three Principal Components',
    scene = dict(
        xaxis_title='PC1',
        yaxis_title='PC2',
        zaxis_title='PC3',
    )
)

fig.update_traces(
    marker_size=2,
)

fig.write_html("html_figures/pca3d.html")
fig.show()

In [10]:
len(df['Class'].unique())

7

In [13]:
kmeans = KMeans(n_clusters=7, random_state=0).fit(X)

In [16]:
print(kmeans.labels_)

[3 3 3 ... 0 0 0]


In [17]:
kmeans.cluster_centers_

array([[4.15757856e+04, 7.61477296e+02, 2.79940298e+02, 1.90485691e+02,
        1.48220985e+00, 7.11951483e-01, 4.20501369e+04, 2.29919463e+02,
        7.54570229e-01, 9.88757773e-01, 9.02293210e-01, 8.26307272e-01,
        6.74031344e-03, 1.96395462e-03, 6.86255759e-01, 9.96459398e-01],
       [1.95500465e+05, 1.69323707e+03, 6.39643548e+02, 3.92683714e+02,
        1.63246123e+00, 7.86177120e-01, 1.98292327e+05, 4.98532334e+02,
        7.79176028e-01, 9.85991178e-01, 8.56212166e-01, 7.80448310e-01,
        3.28119329e-03, 7.53097414e-04, 6.09876971e-01, 9.90699577e-01],
       [8.36510536e+04, 1.12817749e+03, 4.24567164e+02, 2.52963983e+02,
        1.68383077e+00, 7.98499064e-01, 8.49805833e+04, 3.26120718e+02,
        7.55555629e-01, 9.84367504e-01, 8.26368122e-01, 7.69610840e-01,
        5.08892355e-03, 1.10603525e-03, 5.93444972e-01, 9.92046747e-01],
       [3.11256777e+04, 6.53403520e+02, 2.38972007e+02, 1.66162680e+02,
        1.44687898e+00, 7.07904499e-01, 3.14938646e+04, 1.987

In [26]:
Y_pred = kmeans.predict(X)
accuracy_score(Y, Y_pred)


0.3698479171258541

In [49]:
pca_centers = pca.transform(kmeans.cluster_centers_)
pca_3_centers = pca3.transform(kmeans.cluster_centers_)



X does not have valid feature names, but PCA was fitted with feature names


X does not have valid feature names, but PCA was fitted with feature names



In [50]:
pca_3_centers

array([[-1.63993815e+04, -4.96930283e+01,  1.08509290e+01],
       [ 2.02930299e+05, -9.92442599e+01,  1.75258603e+02],
       [ 4.37129538e+04,  1.02684754e+02, -5.11034330e+01],
       [-3.12537266e+04, -1.49576262e+01,  5.10860339e+01],
       [ 3.36127462e+01,  1.98171458e+01, -5.08009727e+01],
       [ 2.24433460e+04,  1.07491340e+02, -5.58671458e+01],
       [ 1.49478970e+05, -2.40338456e+02,  7.69631191e+01]])

In [39]:
pca_centers[:, 0]

array([-1.63993815e+04,  2.02930299e+05,  4.37129538e+04, -3.12537266e+04,
        3.36127462e+01,  2.24433460e+04,  1.49478970e+05])

In [74]:
fig = go.Figure()


groups = df_pca['Class'].unique()

for group in groups:
    df_curr = df_pca[df_pca['Class'] == group]

    fig.add_trace(
        go.Scatter(
            x=df_curr[0],
            y=df_curr[1],
            mode='markers',
            name=group
        )
    )

fig.add_trace(
    go.Scatter(
        x=pca_centers[:, 0],
        y=pca_centers[:, 1],
        mode='markers',
        name='Cluster Centers',
        marker=dict(
            color='Black',
            size=10,
        ),
    )
)

fig.update_layout(
    title='Cluster Centers',
    xaxis_title='PC1',
    yaxis_title='PC2',
)

fig.write_html("html_figures/pca2d_with_centers.html")

fig.show()

In [75]:
fig = go.Figure()


groups = df_pca_3['Class'].unique()

for group in groups:
    df_curr = df_pca_3[df_pca_3['Class'] == group]

    fig.add_trace(
        go.Scatter3d(
            x=df_curr[0],
            y=df_curr[1],
            z=df_curr[2],
            mode='markers',
            name=group,
            marker=dict(
                opacity=0.5,
                size=2,
            ),
        )
    )


fig.add_trace(
    go.Scatter3d(
        x=pca_3_centers[:, 0],
        y=pca_3_centers[:, 1],
        z=pca_3_centers[:, 2],
        mode='markers',
        name='Cluster Centers',
        marker=dict(
            color='Black',
            size=7,
        ),
    )
)

fig.update_layout(
    title='First Three Principal Components and Cluster Centers',
    scene = dict(
        xaxis_title='PC1',
        yaxis_title='PC2',
        zaxis_title='PC3',
    )
)

fig.write_html("html_figures/pca3d_with_centers.html")


fig.show()