In [None]:
import tensorflow as tf 
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.optimizers as optim

import pandas as pd 
import numpy as np 
import os

In [None]:
PATH = r'.../tox21_dense_train.csv'
# Parse the CSV a single time; the value matrix and the column header are both
# taken from the same frame.
tox21_frame = pd.read_csv(PATH)
data = tox21_frame.values
labels = tox21_frame.columns
labels

In [None]:
# Rescale every feature column (column 3 onward) of `data` in place.
# `data[:, 3:]` is a basic slice, i.e. a view, so assignments made through
# `features` are written into `data` itself.
features = data[:, 3:]
# The column count is derived from the data instead of being hard-coded.
for col_idx in range(features.shape[1]):
    column = features[:, col_idx]
    # Extremes are computed once per column and reused below.
    col_max, col_min = max(column), min(column)
    if col_max != col_min:
        # Mean-centre, then divide by the column's range.
        features[:, col_idx] = (column - np.mean(column)) / (col_max - col_min)
    else:
        # Constant column: the range is zero, so divide by
        # (max + min + 1e-10) instead and shift the result by 0.5.
        features[:, col_idx] = column / (col_max + col_min + 1e-10) + 0.5

In [None]:
# Keep only columns 3 onward; the first three columns are discarded.
# NOTE: this rebinds `data` to a slice of itself, so running the cell a second
# time strips three more columns -- run it exactly once after loading.
data = data[:, 3:]

In [None]:
# Materialise float32 copies of two consecutive row ranges of `data`:
# rows [0, 10000) become the training set, rows [10000, 12000) the validation set.
xtrain, xvalid = (
    np.array(data[first:last], dtype=np.float32)
    for first, last in ((0, 10000), (10000, 12000))
)

In [None]:
def DenseBlock(x, ndim):
    """Apply one Dense -> Dropout -> BatchNormalization -> LeakyReLU stage.

    Args:
        x: Tensor fed into the Dense layer.
        ndim: Number of units of the Dense layer.

    Returns:
        The tensor produced by the final LeakyReLU(0.2) layer.
    """
    projected = layers.Dense(ndim)(x)
    dropped = layers.Dropout(0.1)(projected)
    normalized = layers.BatchNormalization(momentum=0.9)(dropped)
    return layers.LeakyReLU(0.2)(normalized)

def autoencoder(inp):
    """Build a dense autoencoder together with its encoder sub-model.

    The network is DenseBlock(100) -> DenseBlock(12) -> DenseBlock(100) ->
    DenseBlock(n_features), where n_features is the last entry of the input
    shape. The output of the 12-unit block is the encoder output.

    Args:
        inp: Shape of one input sample, given either as a bare integer feature
            count (e.g. ``799``) or as a tuple/list such as ``(799,)``.

    Returns:
        A pair ``(encoder, model)``. ``encoder`` maps the input to the 12-unit
        code and ``model`` maps the input to the decoder output; both are
        built on the same input tensor and share their layers.
    """
    # `(799)` is a plain int, not a 1-tuple. Keras documents `shape` as a tuple,
    # so a bare integer is wrapped before it reaches `layers.Input`.
    input_shape = (inp,) if isinstance(inp, (int, np.integer)) else tuple(inp)
    n_features = input_shape[-1]

    inputs = layers.Input(shape=input_shape)
    hidden = DenseBlock(inputs, 100)
    enc_out = DenseBlock(hidden, 12)
    hidden = DenseBlock(enc_out, 100)
    # The reconstruction must be as wide as the input, so the last block
    # follows the input width instead of a fixed constant.
    dec_out = DenseBlock(hidden, n_features)

    encoder = models.Model(inputs=inputs, outputs=enc_out, name="Encoder")
    model = models.Model(inputs=inputs, outputs=dec_out, name="Autoencoder")

    return encoder, model

# Build both models for 799-feature inputs. The shape is written as a 1-tuple:
# `(799)` without the trailing comma is just the integer 799.
encoder, model = autoencoder((799,))
# `summary()` prints the table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
encoder.summary()
model.summary()

In [None]:
# Reconstruction is a regression problem, so mean absolute error is reported
# next to the MSE loss (accuracy is a classification metric). `metrics` is
# passed as a list, which is the documented form.
model.compile(loss='mse', optimizer=optim.Adam(learning_rate=1e-4), metrics=["mae"])
# Train the network to reproduce its own input. `validation_data` is given as
# the documented (x, y) tuple; the returned History is kept for later inspection.
history = model.fit(
    xtrain,
    xtrain,
    batch_size=100,
    verbose=1,
    epochs=200,
    validation_data=(xvalid, xvalid),
    validation_batch_size=100,
)

In [None]:
# model.save("./model50.h5")
# encoder.save("./encoder50.h5")

In [None]:
from minisom import MiniSom

In [None]:
# Settings consumed by the MiniSom cells.
# r, c: grid dimensions, passed as x=r / y=c and reused as the [r, c] dims
# when grid coordinates are flattened into a single index.
r, c = 4, 3
# Iteration count handed to som.train_random (the name shadows the builtin iter).
iter = 50000
# Passed as MiniSom's `sigma` and `learning_rate` arguments respectively.
sigma, lr = 1, 0.5

In [None]:
# Encode the training set once and convert the result to a plain NumPy array:
# calling the model returns a framework tensor, whereas the SOM code below
# indexes and iterates over its input sample by sample.
xsom = np.asarray(encoder(xtrain))
# `input_len` is read from the encoder output so it always matches the code width.
som = MiniSom(x=r, y=c, input_len=xsom.shape[1], sigma=sigma, learning_rate=lr)
som.random_weights_init(xsom)

In [None]:
# Fit the SOM on the encoded samples, passing `iter` as the iteration count.
# verbose=1 presumably turns on progress reporting -- confirm against the MiniSom docs.
som.train_random(xsom, iter, verbose=1)

In [None]:
# Treat each SOM node as one cluster: look up the winning node of every
# encoded sample and arrange the grid coordinates as a 2 x n_samples array.
winner_coordinates = np.transpose(
    np.array([som.winner(sample) for sample in xsom])
)
# Collapse each two-dimensional grid coordinate into a single flat index
# on the r x c grid.
cluster_index = np.ravel_multi_index(winner_coordinates, [r, c])

In [None]:
# Display the flat cluster index computed for each encoded sample.
cluster_index

In [None]:
# import matplotlib.pyplot as plt
# %matplotlib inline

# # plotting the clusters using the first 2 dimensions of the data
# for c in np.unique(cluster_index):
#     plt.scatter(xsom[cluster_index == c, 0],
#                 xsom[cluster_index == c, 1], label='cluster='+str(c), alpha=.7)

# # plotting centroids
# for centroid in som.get_weights():
#     plt.scatter(centroid[:, 0], centroid[:, 1], marker='x', 
#                 s=80, linewidths=5, color='k', label='centroid')
# plt.legend();