In [1]:
import numpy as np
from tslearn.metrics import dtw
import scipy as sp
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from minisom import MiniSom

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
path = '../../data/'

In [3]:
df = pd.read_csv(path + 'data_smooth.csv')
df.head()

Unnamed: 0,x,y,z
0,0.0,0.0,0.0
1,0.077816,0.0425,-0.120315
2,0.229005,0.1275,-0.354938
3,0.455788,0.251128,-0.708374
4,0.764831,0.415965,-1.186632


In [4]:
def my_metric(x: np.ndarray, w: np.ndarray):
    return np.apply_along_axis(lambda w: dtw(x.reshape(-1, 3), w.reshape(-1, 3)), -1, w)

In [5]:
df.shape

(21619, 3)

In [6]:
signals = []
l = 300
s = 20
for i in range(0, df.shape[0]-l, s):
    sig = df.iloc[i:i+l]
    signals.append(sig.values.ravel())

In [7]:
df_signals = np.array(signals)

In [8]:
df_signals.shape

(1066, 900)

In [9]:
df_signals = (df_signals - df_signals.mean(0)) / df_signals.std(0)

In [6]:
n = 6
m = 6

In [None]:
som = MiniSom(n, m, 3*l, activation_distance=my_metric, learning_rate=0.7)
som.train(df_signals, 150, use_epochs=True, verbose=True)

In [11]:
clusters = np.zeros(df_signals.shape[0])
for i in range(df_signals.shape[0]):
    winner = som.winner(df_signals[i])
    clusters[i] = winner[0] * m + winner[1]

In [12]:
clusters = clusters.astype('int')

In [13]:
df_clustered = df.iloc[:df.shape[0] - l]

In [14]:
point_clusters = []

In [15]:
for i in range(df.shape[0] - l):
    point_clusters.append(np.argmax(np.bincount(clusters[max(0, (i - l + 1) // s) : i // s + 1])))

In [16]:
df_clustered['cluster'] = point_clusters

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clustered['cluster'] = point_clusters


In [17]:
df_clustered.cluster = df_clustered.cluster.astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clustered.cluster = df_clustered.cluster.astype('category')


In [21]:
df_clustered.to_csv(path + 'som_results.csv', index=0)

In [7]:
df_clustered = pd.read_csv(path + 'som_results.csv')

In [8]:
df_clustered['n'] = df_clustered.cluster // n
df_clustered['m'] = df_clustered.cluster % n

In [9]:
g = df_clustered.groupby(['n', 'm'])['n'].count()
g

n  m
0  0    1100
   2     640
   4     160
1  1     260
   2     920
   4    2300
   5    2040
2  2     640
   4     540
   5    2540
3  0    2220
   3    3220
4  0     460
   5    1979
5  0     260
   1     900
   2     180
   3     760
   4     200
Name: n, dtype: int64

In [10]:
%matplotlib qt
x = []
y = []
for i in g.index:
    x.append(i[0]-.5)
    y.append(i[1]-.5)
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.bar3d(x, y, np.zeros_like(x), 1, 1, g.values)
plt.show()

In [19]:
fig = px.scatter(df_clustered, y=['x', 'y', 'z'], color='cluster', height=450, width=1500)
fig.update_traces(marker=dict(size=3))