In [18]:
from pathlib import Path

import numpy as np
import pandas as pd
from minisom import MiniSom

from src import workflow as wf

## Importar dataset

In [21]:
# Location of the CSV and the tokens pandas should parse as missing values.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/oil-spill.csv'
missing_values = [' ', 'NaN', 'na', 'Na', '-', '--', 'n/a', '?']
columns = ['class']  # NOTE(review): not used by any cell shown in this notebook

# header=None: the first line is read as data and columns get integer labels.
data = pd.read_csv(
    url,
    header=None,
    na_values=missing_values,
)

In [23]:
# Display the raw frame; the rendered output is truncated to the first and last rows.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,2558,1506.09,456.63,90,6395000.0,40.88,7.89,29780.0,0.19,...,2850.00,1000.00,763.16,135.46,3.73,0,33243.19,65.74,7.95,1
1,2,22325,79.11,841.03,180,55812500.0,51.11,1.21,61900.0,0.02,...,5750.00,11500.00,9593.48,1648.80,0.60,0,51572.04,65.73,6.26,0
2,3,115,1449.85,608.43,88,287500.0,40.42,7.34,3340.0,0.18,...,1400.00,250.00,150.00,45.13,9.33,1,31692.84,65.81,7.84,1
3,4,1201,1562.53,295.65,66,3002500.0,42.40,7.97,18030.0,0.19,...,6041.52,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1
4,5,312,950.27,440.86,37,780000.0,41.43,7.03,3350.0,0.17,...,1320.04,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932,200,12,92.42,364.42,135,97200.0,59.42,10.34,884.0,0.17,...,381.84,254.56,84.85,146.97,4.50,0,2593.50,65.85,6.39,0
933,201,11,98.82,248.64,159,89100.0,59.64,10.18,831.0,0.17,...,284.60,180.00,150.00,51.96,1.90,0,4361.25,65.70,6.53,0
934,202,14,25.14,428.86,24,113400.0,60.14,17.94,847.0,0.30,...,402.49,180.00,180.00,0.00,2.24,0,2153.05,65.91,6.12,0
935,203,10,96.00,451.30,68,81000.0,59.90,15.01,831.0,0.25,...,402.49,180.00,90.00,73.48,4.47,0,2421.43,65.97,6.32,0


In [24]:
# Arguments handed to the project helpers, named here instead of inlined.
# NOTE(review): the helper's log calls the first value a percentage ("porcentaje");
# its exact meaning lives in src.workflow — confirm there.
missing_pct_threshold = 5
# Column 49 is the 0/1 column whose value counts are shown below.
class_col = 49

data_dropped = wf.drop_missing_values_columns(data, missing_pct_threshold)
datos_mode = wf.mode_imputation(data_dropped, class_col)

# Distribution of the values in column 49 after the imputation step.
datos_mode[class_col].value_counts()

Columnas dropeadas:  []
Numero de columnas dropeadas:  0
Numero de columnas tras aplicar el porcentaje:  50


0.0    896
1.0     41
Name: 49, dtype: int64

## Normalización min-max de los datos

In [25]:
# Rescale the imputed frame with the project's min-max helper.
# NOTE(review): target range and handled columns are defined in src.workflow, not visible here.
data_std = wf.min_max_normalization(datos_mode)

In [26]:
# Copy of the normalized frame without column 49; drop() is called without
# inplace, so data_std keeps that column for the next cell.
data_std_dropped = data_std.drop(columns = [49])

In [27]:
# Build the train/test pieces with the project helper, passing the column label 49,
# the full normalized frame and the frame without column 49.
# NOTE(review): the split strategy and the meaning of `network_output` are defined
# in src.workflow and cannot be confirmed from this notebook.
x_train, x_test, y_train, y_test, network_output = wf.prep_datos_red(49 , data_std, data_std_dropped)

(749, 49) (188, 49)
(749, 1) (188, 1)


## Cálculo de los lados del mapa de Kohonen

In [None]:
from math import sqrt, ceil

# NOTE(review): `som_data` is not assigned in any cell shown in this notebook.
# It is used here as a 2-D array with one sample per row and one feature per
# column (shape[0] drives the neuron count, shape[1] is the SOM input length).
# Confirm which frame it is meant to be before running on a fresh kernel.

# Heuristic total number of neurons: 5 * sqrt(number of samples).
total_neurons = 5*sqrt(som_data.shape[0])

# calculate eigen_values
# rowvar=False makes np.cov treat columns as variables (features). The default
# (rowvar=True) treats every ROW as a variable, which for sample-per-row data
# yields an n_samples x n_samples matrix instead of the feature covariance.
normal_cov = np.atleast_2d(np.cov(som_data, rowvar=False))
# A covariance matrix is symmetric, so eigvalsh applies: it returns real
# eigenvalues in ascending order (no complex parts to strip, no extra sort).
eigen_values = np.linalg.eigvalsh(normal_cov)
# 2 largest eigenvalues: result[0] is the second largest, result[1] the largest
result = [float(value) for value in eigen_values[-2:]]
if len(result) < 2 or result[0] <= 0:
    raise ValueError('the two largest covariance eigenvalues must exist and be '
                     'positive to derive the map sides')
ratio_2_largest_eigen = result[1]/result[0]
# By construction side * ratio_2_largest_eigen == total_neurons.
side = total_neurons/ratio_2_largest_eigen

print(total_neurons)
print(side)
print(ratio_2_largest_eigen)
print('1st side', ceil(side))
print('2nd side', ceil(ratio_2_largest_eigen))

## Entrenamiento de SOM

In [None]:
# Grid dimensions come from the side heuristic of the previous cell.
# (Assign 15 to both names instead to force a fixed 15x15 map.)
n_neurons, m_neurons = ceil(side), ceil(ratio_2_largest_eigen)

som = MiniSom(
    n_neurons,
    m_neurons,
    som_data.shape[1],  # input length = number of columns of som_data
    sigma=0.8,
    learning_rate=0.05,
    neighborhood_function='gaussian',
    random_seed=42,
)
# Initialise the weights from the data before training.
som.pca_weights_init(som_data)
# NOTE(review): this line used to be labelled "random training", but no
# random_order argument is passed to train(); confirm which sampling order is intended.
som.train(som_data, num_iteration=100000, verbose=True)

## Ver en el mapa las neuronas ganadoras

In [None]:
# NOTE(review): `plt`, `target`, `colors` and `label_names` are not defined in the
# cells shown in this notebook. `plt` is presumably matplotlib.pyplot, and `target`
# must hold one class label per row of `som_data` — confirm before running.

# Winning neuron (grid coordinates) for every sample.
# Iterate over the rows of the array the SOM was trained on: looping over the
# raw DataFrame `data` would yield its column labels, not its samples.
samples = np.asarray(som_data)
w_x, w_y = zip(*[som.winner(d) for d in samples])
w_x = np.array(w_x)
w_y = np.array(w_y)

# Seeded generator so the jitter (and therefore the saved figure) is reproducible.
jitter_rng = np.random.default_rng(42)

fig, ax = plt.subplots(figsize=(10, 9))
# Distance map as a faint background.
distance_mesh = ax.pcolor(som.distance_map().T, cmap='bone_r', alpha=.2)
fig.colorbar(distance_mesh, ax=ax)

for c in np.unique(target):
    idx_target = target==c
    n_points = np.sum(idx_target)
    # Centre each point in its cell (+.5) and jitter it within +/-0.4 so that
    # samples mapped to the same neuron do not overlap completely.
    ax.scatter(w_x[idx_target]+.5+(jitter_rng.random(n_points)-.5)*.8,
               w_y[idx_target]+.5+(jitter_rng.random(n_points)-.5)*.8,
               s=50, c=colors[c-1], label=label_names[c])
ax.legend(loc='upper right')
ax.grid()

# Create the output folder first; savefig does not create missing directories.
output_path = Path('resulting_images/som_seed.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path)
plt.show()