In [1]:
import glob
from pathlib import Path, PurePath

import numpy as np
import pandas as pd
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_file
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from smartula_sound import SmartulaSound

## MFCC 

In [3]:
# Feature-set selector: directory (relative to the working directory) whose
# *.csv files are read by the loading cell.
# NOTE(review): the LPC cell further down assigns `folder_name` again, so on a
# top-to-bottom run that later value is the one actually used. Run only the
# selector cell for the feature set you want.
folder_name = "csv/mfcc-electromagnetic-field"

## LPC

In [2]:
# Feature-set selector: directory (relative to the working directory) whose
# *.csv files are read by the loading cell.
# NOTE(review): this reassigns the `folder_name` set by the MFCC cell above, so
# on a top-to-bottom run the LPC directory wins. Run only the selector cell for
# the feature set you want.
folder_name = "csv/lpc-electromagnetic-field"

In [4]:
data_folder = Path(folder_name)
files_to_open = data_folder / "*.csv"

# glob returns matches in arbitrary, filesystem-dependent order. Sorting pins
# the sample order so the embedding rows (and every column later built from
# list_of_features) line up identically on every run and machine.
all_filenames = sorted(glob.glob(str(files_to_open)))
if not all_filenames:
    # Fail here with a clear message instead of letting the dimensionality
    # reduction step choke on an empty feature list.
    raise FileNotFoundError("no CSV feature files match {}".format(files_to_open))


def _load_sound(csv_path):
    """Build one SmartulaSound from a feature CSV.

    The file name is split on single spaces: the second token (with ".csv"
    stripped) is passed as the first positional argument and the first token
    as the second positional argument.
    NOTE(review): presumably these are the timestamp and the
    electromagnetic-field flag - confirm against SmartulaSound's signature.

    The CSV is read without a header row and flattened into a 1-D feature
    vector.
    """
    name_parts = PurePath(csv_path).name.split(" ")
    return SmartulaSound(name_parts[1].replace(".csv", ""),
                         name_parts[0],
                         samples=None,
                         features=np.ravel(pd.read_csv(csv_path, header=None)))


list_of_features = [_load_sound(f) for f in all_filenames]

## Dimension reduction

In [9]:
# Project the per-recording feature vectors onto 2 dimensions with t-SNE.
# t-SNE is stochastic: without a fixed random_state each run produces a
# different layout, so the seed is pinned to make the embedding reproducible.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword if the installed version rejects `n_iter`.
df_embedded = TSNE(n_components=2, perplexity=10, learning_rate=300, n_iter=3000, verbose=2,
                   random_state=42) \
    .fit_transform([ss.features for ss in list_of_features])


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1071 samples in 0.001s...
[t-SNE] Computed neighbors for 1071 samples in 0.051s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1071
[t-SNE] Computed conditional probabilities for sample 1071 / 1071
[t-SNE] Mean sigma: 3.568519
[t-SNE] Computed conditional probabilities in 0.034s
[t-SNE] Iteration 50: error = 84.5236893, gradient norm = 0.3925680 (50 iterations in 0.374s)
[t-SNE] Iteration 100: error = 83.7931671, gradient norm = 0.3748055 (50 iterations in 0.373s)
[t-SNE] Iteration 150: error = 83.5438080, gradient norm = 0.3837304 (50 iterations in 0.333s)
[t-SNE] Iteration 200: error = 82.9252167, gradient norm = 0.3954714 (50 iterations in 0.349s)
[t-SNE] Iteration 250: error = 83.2542648, gradient norm = 0.3870297 (50 iterations in 0.308s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.254265
[t-SNE] Iteration 300: error = 1.7454474, gradient norm = 0.0021239 (50 iterations in 0.243s)

In [6]:
# Linear alternative to the t-SNE projection: keep the first two principal
# components of the per-recording feature vectors.
# PCA is imported with the other dependencies in the notebook's first cell.
# NOTE(review): this writes to the same `df_embedded` name as the t-SNE cell,
# so whichever of the two cells ran last decides what gets plotted.
pca = PCA(n_components=2)
df_embedded = pca.fit_transform([ss.features for ss in list_of_features])

In [10]:
# Assemble the plotting table in a single constructor call: the two embedding
# coordinates plus the per-recording attributes, all in list_of_features order.
data_frame = pd.DataFrame({
    'x': df_embedded[:, 0],
    'y': df_embedded[:, 1],
    'elfield': [sound.electromagnetic_field_on for sound in list_of_features],
    'timestamp': [sound.timestamp for sound in list_of_features],
})

## Data Visualization

In [11]:
# Interactive toolbar and hover contents for the scatter plot.
tools = (
    "hover,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,"
    "poly_select,lasso_select, "
)
tooltips = [("timestamp", "@timestamp"), ("class", "@elfield")]

# Colour per point: blue when the flag is exactly the string "True",
# red for every other value.
data_frame['colors'] = [
    "#ff0000" if flag != "True" else "#003399"
    for flag in data_frame['elfield']
]
source = ColumnDataSource(data=data_frame)

# One borderless, semi-transparent marker per recording.
p = figure(tools=tools, tooltips=tooltips)
p.scatter(
    x='x',
    y='y',
    source=source,
    size=10,
    fill_color='colors',
    fill_alpha=0.4,
    line_color=None,
)

# Write the figure to a standalone HTML file and open it.
output_file("color_scatter.html", title="color_scatter.py example")
show(p)