In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Preparing Google Drive
Run the code below to give the Colab notebook access to your personal Google Drive. Throughout the workshop, we will use data stored in a separate folder. To make sure the path to the data is correct, you need to change your working folder to the "SACAC-EDA-2024 - Setup and Test" folder that you copied to your Drive. An example is given below; make sure to change the path to match the location of the folder on your own Drive. The path should start with "/content/drive/".

In [None]:
# Colab-specific module used to attach Google Drive to this notebook.
from google.colab import drive
# Mount Drive under /content/drive so the notebook can read files stored there.
drive.mount('/content/drive')

# Change the working directory to the workshop folder so that the relative
# data path used later ('./data/test-data.csv') resolves.
# Example path only: edit it to match the folder location on your own Drive.
%cd /content/drive/MyDrive/ColabNotebooks/SACAC-EDA-2024

## Test data handling and plotting
Load a data file using `pandas` and plot it using `seaborn`.

In [None]:
# Load the workshop test data. The path is relative to the current working
# directory, so the working folder must already point at the workshop folder.
df = pd.read_csv('./data/test-data.csv')

# Summary statistics as a quick sanity check of what was loaded.
display(df.describe())

# Scatter plot of columns 'x1' and 'x2'; a low alpha keeps overlapping
# points distinguishable.
ax = sns.scatterplot(data=df, x='x1', y='x2', alpha=0.2)
ax.set_title('Test data: x1 vs x2')
plt.show()  # render the figure without echoing the Axes repr as cell output

## Test popular machine learning library
Cluster the data using `KMeans` from [scikit-learn](https://scikit-learn.org/stable/index.html)

In [None]:
from sklearn.cluster import KMeans

# KMeans initialises its centroids randomly; fixing the seed makes the cluster
# labels (and therefore the colours in the plot) identical on every run.
KMEANS_SEED = 42

# n_init is stated explicitly because its default changed across scikit-learn
# releases (10 -> 'auto'); pinning it keeps results consistent between
# versions and avoids the related FutureWarning.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=KMEANS_SEED)
kmeans.fit(df)  # clusters on every column of df, not only the two plotted ones

# Colour each point by its assigned cluster label.
ax = sns.scatterplot(data=df, x='x1', y='x2', hue=kmeans.labels_, palette='Set1', alpha=0.5)
ax.set_title('KMeans clusters (k = 3)')
plt.show()  # render the figure without echoing the Axes repr as cell output

## Test popular deep learning library
Train an autoencoder using [keras](https://keras.io/)

In [None]:
from sklearn.preprocessing import StandardScaler
import keras

# Weight initialisation, dropout masks and batch shuffling are all random.
# Seeding Python, NumPy and the Keras backend in one call makes the loss curve
# and the reconstruction below reproducible across runs.
keras.utils.set_random_seed(42)

# Standardise every column of df before training.
Z = StandardScaler().fit_transform(df)

# Encoder: dropout on the inputs, one hidden layer, then a 2-unit bottleneck
# squashed by tanh.
aeEncoder = keras.Sequential([
    keras.layers.Dropout(0.5),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(2, activation='tanh'),
])

# Decoder: mirrors the encoder and maps the 2-unit code back to the input
# width with a linear output layer.
aeDecoder = keras.Sequential([
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(Z.shape[1])
])

# Autoencoder (combined model)
autoencoder = keras.Sequential([
    keras.layers.Input(shape=(Z.shape[1],)),
    aeEncoder,
    aeDecoder
])

# Compile the autoencoder
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder to reproduce its own (standardised) input
history = autoencoder.fit(Z, Z, epochs=50, batch_size=16)

# Plot the training loss
plt.plot(history.history['loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.show()

# Reconstruct the standardised data and plot the first two output columns.
Z_reconstructed = autoencoder.predict(Z)
plt.scatter(Z_reconstructed[:, 0], Z_reconstructed[:, 1], alpha=0.5)
plt.title('Autoencoder reconstruction')
plt.xlabel('Reconstructed column 0 (standardised)')
plt.ylabel('Reconstructed column 1 (standardised)')
plt.show()