### New Dataset 
This updated dataset has been purged of the 1D versions in cases where we had both 1D and 3D reaction results. In other words, there are only 3D results.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 

In [None]:
# Load the purged chi dataset into a NumPy array; with no delimiter given,
# genfromtxt splits fields on whitespace.
data = np.genfromtxt("../../data/newdata/AllChiPurged.dat")

In [None]:
# Wrap the raw array in a labelled DataFrame (four feature columns followed
# by the target column `chi`) and preview the first rows.
df = pd.DataFrame(
    data,
    columns=['ustat', 'alph1', 'alph2', 'beta', 'chi'],
)
df.head()

In [None]:
# Convert the DataFrame back to a plain NumPy array for positional slicing
data = df.values

In [None]:
# Every column except the last is a feature; the last column is the target
X = data[:, :-1]
y = data[:, -1]

In [None]:
# Display the feature matrix
X

In [None]:
# Display the target vector
y

#### Transform Data

In [None]:
from sklearn.preprocessing import PowerTransformer
# Yeo-Johnson power transform (unlike Box-Cox, it accepts zero and negative values)
pt = PowerTransformer(method='yeo-johnson')    

In [None]:
# Fit the power transform to the features and replace X with the transformed values
X = pt.fit_transform(X)
# Keep a labelled copy of the transformed features for plotting
X_df = pd.DataFrame(
    X,
    columns=['ustat', 'alph1', 'alph2', 'beta'],
)

In [None]:
# Display the transformed feature table
X_df

In [None]:
# Pairwise scatter plots of the transformed features, with a KDE on the diagonal
pd.plotting.scatter_matrix(
    X_df,
    alpha=1,
    figsize=(12, 12),
    diagonal='kde',
)

#### Three Dimensional Scatter for Features

Three-dimensional scatterplots with one feature on each of the x, y, and z axes.

In [None]:
# Pick the three transformed feature columns to plot against each other.
# NOTE(review): this rebinds the notebook-level name `y` to a feature column.
x, y, z = X_df['ustat'], X_df['alph1'], X_df['alph2']

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')

ax.scatter3D(x, y, z)

# Title and axis labels name the plotted features
ax.set(
    title='Feature Scatter: ustat, alph1, alph2',
    xlabel='ustat',
    ylabel='alph1',
    zlabel='alph2',
)

plt.show()

In [None]:
# Pick the three transformed feature columns to plot against each other.
# NOTE(review): this rebinds the notebook-level name `y` to a feature column.
x, y, z = X_df['ustat'], X_df['alph1'], X_df['beta']

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')

ax.scatter3D(x, y, z)

# Title and axis labels name the plotted features
ax.set(
    title='Feature Scatter: ustat, alph1, beta',
    xlabel='ustat',
    ylabel='alph1',
    zlabel='beta',
)

plt.show()

In [None]:
# Pick the three transformed feature columns to plot against each other.
# NOTE(review): this rebinds the notebook-level name `y` to a feature column.
x, y, z = X_df['ustat'], X_df['alph2'], X_df['beta']

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')

ax.scatter3D(x, y, z)

# Title and axis labels name the plotted features
ax.set(
    title='Feature Scatter: ustat, alph2, beta',
    xlabel='ustat',
    ylabel='alph2',
    zlabel='beta',
)

plt.show()

In [None]:
# Pick the three transformed feature columns to plot against each other.
# NOTE(review): this rebinds the notebook-level name `y` to a feature column.
x, y, z = X_df['alph1'], X_df['alph2'], X_df['beta']

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')

ax.scatter3D(x, y, z)

# Title and axis labels name the plotted features
ax.set(
    title='Feature Scatter: alph1, alph2, beta',
    xlabel='alph1',
    ylabel='alph2',
    zlabel='beta',
)

plt.show()

In [None]:
# NOTE(review): `xxx` is not defined anywhere visible, so this cell raises
# NameError -- presumably a deliberate stop for "Run All"; confirm or remove.
xxx

In [None]:
from sklearn.cluster import KMeans
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/
# https://medium.com/@sk.shravan00/k-means-for-3-variables-260d20849730

In [None]:
#### Columns: ustat, alph1, alph2, beta

In [None]:
# elbow method: heuristic used in determining the number of clusters in a data set
# SSE/inertia: sum of the squared distances between each point and the centroid (mean point) of its cluster

def elbow_method(X, max_k=10):
    """Plot the k-means elbow curve (SSE versus number of clusters) for X.

    Fits one KMeans model for each k from 1 up to ``max_k`` and plots the
    resulting inertia values so the "elbow" can be read off the curve.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    max_k : int, default 10
        Largest number of clusters to try. It is capped at the number of
        rows in ``X`` because KMeans cannot fit more clusters than samples.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 1.
    """
    if max_k < 1:
        raise ValueError(f"max_k must be at least 1, got {max_k}")

    # KMeans raises when n_clusters > n_samples, so stop at the row count.
    # Keep at least k = 1 so that empty input still fails inside KMeans
    # instead of silently producing an empty plot.
    largest_k = max(1, min(max_k, len(X)))
    k_values = list(range(1, largest_k + 1))

    # inertia_ is the sum of squared distances of samples to their closest
    # cluster center, i.e. the SSE for that k.
    error = [
        KMeans(n_clusters=k, init='k-means++', random_state=42).fit(X).inertia_
        for k in k_values
    ]

    # plot elbow curve
    plt.plot(k_values, error)
    plt.xlabel('k')
    plt.ylabel('Sum of Square Error (SSE)')
    plt.show()

In [None]:
# Column names: the four features followed by the target
df.columns.tolist()

In [None]:
# Feature subset for clustering: ustat, alph1, alph2.
# Selecting by name instead of a positional boolean mask keeps this correct
# if the column order of df ever changes.
# NOTE(review): df holds the values as loaded, not the power-transformed
# X_df -- confirm that clustering on untransformed features is intended.
X = df[['ustat', 'alph1', 'alph2']].values

In [None]:
# Plot the SSE-vs-k elbow curve for the selected feature subset
elbow_method(X)

In [None]:
# select the value of k at the “elbow”, i.e. the point after which the distortion/inertia starts decreasing in a linear fashion

In [None]:
# Cluster the selected features into K = 4 groups
km = KMeans(n_clusters=4, init='k-means++', random_state=42)
km.fit(X)
# One integer cluster index (0..3) per row of X
y = km.labels_

In [None]:
# Split df into one sub-frame per cluster label (0..3)
data1, data2, data3, data4 = (df[y == label] for label in range(4))

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(projection="3d")

# One 3-D scatter per cluster, each with its own colour and legend entry
clusters = [
    (data1, 'tab:olive'),
    (data2, 'tab:cyan'),
    (data3, 'tab:purple'),
    (data4, 'tab:orange'),
]
for number, (frame, colour) in enumerate(clusters, start=1):
    ax.scatter3D(frame['ustat'], frame['alph1'], frame['alph2'],
                 c=colour, label=f'Cluster {number}')

# Centroids must be drawn with all three coordinates on the 3-D axes;
# a 2-D plt.scatter call would drop the third coordinate and place them at z = 0.
centers = km.cluster_centers_
ax.scatter3D(centers[:, 0], centers[:, 1], centers[:, 2],
             color='black', s=100, label='Centroids')

ax.set_title('k-Means Cluster, k = 4: ustat, alph1, alph2')
ax.set_xlabel('ustat')
ax.set_ylabel('alph1')
ax.set_zlabel('alph2')

ax.legend()
plt.show()

In [None]:
# Feature subset for clustering: ustat, alph1, beta.
# Selecting by name instead of a positional boolean mask keeps this correct
# if the column order of df ever changes.
# NOTE(review): df holds the values as loaded, not the power-transformed
# X_df -- confirm that clustering on untransformed features is intended.
X = df[['ustat', 'alph1', 'beta']].values

In [None]:
# Plot the SSE-vs-k elbow curve for the selected feature subset
elbow_method(X)

In [None]:
# Cluster the selected features into K = 3 groups
km = KMeans(n_clusters=3, init='k-means++', random_state=42)
km.fit(X)
# One integer cluster index (0..2) per row of X
y = km.labels_

In [None]:
# Split df into one sub-frame per cluster label (0..2)
data1, data2, data3 = (df[y == label] for label in range(3))

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(projection="3d")

# One 3-D scatter per cluster, each with its own colour and legend entry
clusters = [
    (data1, 'tab:olive'),
    (data2, 'tab:cyan'),
    (data3, 'tab:purple'),
]
for number, (frame, colour) in enumerate(clusters, start=1):
    ax.scatter3D(frame['ustat'], frame['alph1'], frame['beta'],
                 c=colour, label=f'Cluster {number}')

# Centroids must be drawn with all three coordinates on the 3-D axes;
# a 2-D plt.scatter call would drop the third coordinate and place them at z = 0.
centers = km.cluster_centers_
ax.scatter3D(centers[:, 0], centers[:, 1], centers[:, 2],
             color='black', s=100, label='Centroids')

ax.set_title('k-Means Cluster, k = 3: ustat, alph1, beta')
ax.set_xlabel('ustat')
ax.set_ylabel('alph1')
ax.set_zlabel('beta')

ax.legend()
plt.show()

In [None]:
# Feature subset for clustering: ustat, alph2, beta.
# Selecting by name instead of a positional boolean mask keeps this correct
# if the column order of df ever changes.
# NOTE(review): df holds the values as loaded, not the power-transformed
# X_df -- confirm that clustering on untransformed features is intended.
X = df[['ustat', 'alph2', 'beta']].values

In [None]:
# Plot the SSE-vs-k elbow curve for the selected feature subset
elbow_method(X)

In [None]:
# Cluster the selected features into K = 5 groups
km = KMeans(n_clusters=5, init='k-means++', random_state=42)
km.fit(X)
# One integer cluster index (0..4) per row of X
y = km.labels_

In [None]:
# Split df into one sub-frame per cluster label (0..4)
data1, data2, data3, data4, data5 = (df[y == label] for label in range(5))

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(projection="3d")

# One 3-D scatter per cluster, each with its own colour and legend entry
clusters = [
    (data1, 'tab:olive'),
    (data2, 'tab:cyan'),
    (data3, 'tab:purple'),
    (data4, 'tab:orange'),
    (data5, 'tab:pink'),
]
for number, (frame, colour) in enumerate(clusters, start=1):
    ax.scatter3D(frame['ustat'], frame['alph2'], frame['beta'],
                 c=colour, label=f'Cluster {number}')

# Centroids must be drawn with all three coordinates on the 3-D axes;
# a 2-D plt.scatter call would drop the third coordinate and place them at z = 0.
centers = km.cluster_centers_
ax.scatter3D(centers[:, 0], centers[:, 1], centers[:, 2],
             color='black', s=100, label='Centroids')

ax.set_title('k-Means Cluster, k = 5: ustat, alph2, beta')
ax.set_xlabel('ustat')
ax.set_ylabel('alph2')
ax.set_zlabel('beta')

ax.legend()
plt.show()

In [None]:
# Feature subset for clustering: alph1, alph2, beta.
# Selecting by name instead of a positional boolean mask keeps this correct
# if the column order of df ever changes.
# NOTE(review): df holds the values as loaded, not the power-transformed
# X_df -- confirm that clustering on untransformed features is intended.
X = df[['alph1', 'alph2', 'beta']].values

In [None]:
# Plot the SSE-vs-k elbow curve for the selected feature subset
elbow_method(X)

In [None]:
# Cluster the selected features into K = 3 groups
km = KMeans(n_clusters=3, init='k-means++', random_state=42)
km.fit(X)
# One integer cluster index (0..2) per row of X
y = km.labels_

In [None]:
# Split df into one sub-frame per cluster label (0..2)
data1, data2, data3 = (df[y == label] for label in range(3))

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(projection="3d")

# One 3-D scatter per cluster, each with its own colour and legend entry
clusters = [
    (data1, 'tab:olive'),
    (data2, 'tab:cyan'),
    (data3, 'tab:purple'),
]
for number, (frame, colour) in enumerate(clusters, start=1):
    ax.scatter3D(frame['alph1'], frame['alph2'], frame['beta'],
                 c=colour, label=f'Cluster {number}')

# Centroids must be drawn with all three coordinates on the 3-D axes;
# a 2-D plt.scatter call would drop the third coordinate and place them at z = 0.
centers = km.cluster_centers_
ax.scatter3D(centers[:, 0], centers[:, 1], centers[:, 2],
             color='black', s=100, label='Centroids')

ax.set_title('k-Means Cluster, k = 3: alph1, alph2, beta')
ax.set_xlabel('alph1')
ax.set_ylabel('alph2')
ax.set_zlabel('beta')

ax.legend()
plt.show()

In [None]:
# 1) separate data clusters into separate plots
# 2) display chi values
# 3) see relationship between clusters and chi
# report