In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Panel, Tabs


# For DB scan
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from data_access.loader import load_processed_data

In [2]:
# Configure Bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [3]:
# Raise pandas' display limits so wide/long frames are shown in full in cell output.
for option, value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option, value)

In [4]:
# Location of the tracking file: <root>/<FOLDER_NAME>/<file_name>.
# NOTE(review): `root` is an absolute path on one specific machine; adjust it
# before running the notebook anywhere else.
FOLDER_NAME = '2'
file_name = r'tracking.prqt'
root = r'D:\Users\avitu\Downloads\parquets'

_path = Path(root).joinpath(FOLDER_NAME, file_name)  # Path form; _path.parent is used further down
path = str(_path)  # string form handed to the loader
path

'D:\\Users\\avitu\\Downloads\\parquets\\2\\tracking.prqt'

In [5]:
# Read the tracking data at `path` through the project loader and print its length.
df_raw = load_processed_data(path)
print(len(df_raw))

19071


In [6]:
# Work on a copy; df_raw itself is kept as loaded.
df = df_raw.copy()
# Uncomment for summary statistics of the columns:
# df.describe()

In [7]:
plot_width, plot_height = 1200, 650

# Axis extents taken from the data. The y range is passed max-first below,
# so y = 0 ends up at the top of the plot.
left, right = 0, df.x.max()
bottom, top = 0, df.y.max()

p = figure(
    plot_width=plot_width,
    plot_height=plot_height,
    x_axis_location="above",
    x_range=(left, right),
    y_range=(top, bottom),
)

# One colour per distinct id, taken from seaborn's "hls" palette and converted to hex.
ids = df.id.drop_duplicates()
palette = sns.color_palette("hls", len(ids))
colors = {track_id: matplotlib.colors.to_hex(rgb) for track_id, rgb in zip(ids, palette)}

# Fields shown when hovering over a point.
# (To show every column instead: [(clm, f'@{clm}') for clm in df.columns])
tooltips = [
    ('id', '@id'),
    ('Frame', '@frame'),
    ('(X, Y)', '(@x, @y)'),
    ('(W, H)', '(@w, @h)'),
    ('angle', '@angle'),
    ('distance', '@distance'),
    ('Label', '@label'),
]
hover_tool = HoverTool(tooltips=tooltips)

# Each id gets its own glyph so it can be muted/unmuted from the legend.
for curr_id, curr_df in df.groupby('id'):
    color = colors[curr_id]
    legend = curr_df.label.values[0]  # first label of the group names the legend entry
    source = ColumnDataSource(curr_df)
    p.circle(
        x='x',
        y='y',
        source=source,
        legend=legend,
        color=color,
        muted_color=color,
        muted_alpha=0.1,
        alpha=0.5,
        muted=True,  # glyphs start out muted
    )

p.add_tools(hover_tool)
p.legend.click_policy = "mute"
show(p)

# Display median.jpg from the tracking file's folder, if that file is present.
median_path = _path.with_name('median.jpg')
if median_path.exists():
    fig, ax = plt.subplots(figsize=(20, 10))
    img = mpimg.imread(str(median_path))
    imgplot = ax.imshow(img, cmap='gray')
    plt.show()

___

### Preprocessing before DBSCAN

In [8]:
def _data_to_input(df):
    data = df.reset_index()[['x','y', 'angle', 'id']].dropna() #'diff_x', 'diff_y',
    data.loc[:, 'angle_2'] = data.angle **2
    # data = df[['x','y']].values 
    X = StandardScaler().fit_transform(data.values)
    # X = data.values
    return X, data

In [9]:
# Build the clustering input once and look at the last rows of the unscaled frame.
X, data = _data_to_input(df)
data.tail()

Unnamed: 0,x,y,angle,id,angle_2
19066,347,235,180.0,357,32400.0
19067,340,228,225.0,357,50625.0
19068,304,242,170.537678,357,29083.099547
19069,302,243,153.434949,357,23542.28352
19070,300,242,206.565051,357,42669.120368


### Compute DBSCAN

In [10]:
def compute_db_scan(df, eps, min_samples):
    """Cluster the tracking points with DBSCAN and print summary statistics.

    Args:
        df: Frame accepted by ``_data_to_input``.
        eps: DBSCAN ``eps``. The input is scaled by ``_data_to_input``, so the
            radius is expressed in scaled feature units.
        min_samples: DBSCAN ``min_samples``.

    Returns:
        tuple: ``(db, data)``, where ``db`` is the fitted ``DBSCAN`` estimator
        and ``data`` is the feature frame with two added columns: ``cluster``
        (the label, ``-1`` for noise) and ``is_core`` (bool, core sample or not).
    """
    X, data = _data_to_input(df)
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    labels = db.labels_

    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    data.loc[:, 'cluster'] = labels
    data.loc[:, 'is_core'] = core_samples_mask

    # Number of clusters in labels, ignoring noise (-1) if present.
    distinct_labels = set(labels)
    n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
    n_noise_ = int(np.count_nonzero(labels == -1))

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1,
    # which happens e.g. when every point is labelled as noise. Report that
    # instead of failing after the clustering has already been computed.
    # NOTE(review): the noise label (-1) is passed along and therefore scored
    # like a regular cluster.
    if 2 <= len(distinct_labels) <= len(labels) - 1:
        print("Silhouette Coefficient: %0.3f"% metrics.silhouette_score(X, labels))
    else:
        print("Silhouette Coefficient: undefined (needs between 2 and n_samples - 1 distinct labels)")

    return db, data

### Plot results

In [11]:
def convert_to_hex(rgba_color):
    """Return the ``#rrggbb`` string for a colour sequence.

    Only the first three components (red, green, blue) are read; a fourth
    (alpha) component is ignored. Each component is multiplied by 255 and
    truncated to an int before being written as two hex digits.
    """
    channels = (rgba_color[0], rgba_color[1], rgba_color[2])
    return '#' + ''.join(f'{int(channel * 255):02x}' for channel in channels)

def plot_db_scan(figure, df_xy, color, markersize, label):
    """Draw one group of points on a Bokeh figure as initially muted circles.

    Args:
        figure: Bokeh figure to draw on. Inside this function the parameter
            shadows the imported ``figure`` factory.
        df_xy: Frame providing the ``x`` and ``y`` columns; nothing is drawn
            when it is empty.
        color: Colour sequence accepted by ``convert_to_hex``.
        markersize: Circle size.
        label: Legend entry; converted with ``str``.

    Returns:
        The glyph renderer returned by ``figure.circle``, or ``None`` when
        ``df_xy`` is empty. (Callers assign the result, so it is returned
        rather than discarded.)
    """
    if len(df_xy) == 0:
        return None

    source = ColumnDataSource(df_xy)
    hex_color = convert_to_hex(color)
    glyph = figure.circle(
        x='x',
        y='y',
        source=source,
        legend=str(label),
        color=hex_color,
        muted_color=hex_color,
        muted_alpha=0.1,
        alpha=0.7,
        size=markersize,
        line_color='black',
    )
    glyph.muted = True  # start muted; the legend click policy un-mutes it
    return glyph

In [12]:
# DBSCAN parameters.
EPS = 0.5
MIN_SAMPLES = 40  # higher value -> fewer clusters (alternative tried: len(data) * 0.015)
print(f'MIN_SAMPLES: {MIN_SAMPLES}')
# Which point categories to draw: noise (cluster == -1) and non-core cluster members.
draw_noise, draw_none_core = False, True

db, data = compute_db_scan(df, eps=EPS, min_samples=MIN_SAMPLES)

labels = data.cluster.unique()

# One Spectral colour per label, shuffled with a fixed seed (42) so the
# label -> colour assignment is the same on every run. Noise is drawn in
# black instead of its palette colour (see the loop below).
colors = [tuple(plt.cm.Spectral(each)) for each in np.linspace(0, 1, len(labels))]
random.Random(42).shuffle(colors)
colors = dict(zip(labels, colors))

p = figure(plot_width=plot_width, plot_height=plot_height, x_axis_location="above")
p.y_range.flipped = True

for cluster, df_cluster in data.groupby('cluster'):
    is_noise = cluster == -1
    if is_noise and not draw_noise:
        continue
    # Black is used for noise; clusters use their palette colour.
    color = [0, 0, 0, 1] if is_noise else colors[cluster]
    title = f'Cluster {cluster}'

    # Core samples get large markers, the remaining members small ones.
    # Both are drawn under the same legend label.
    df_core = df_cluster[df_cluster.is_core]
    plot_db_scan(p, df_core, color, 12, title)

    if draw_none_core:
        df_none_core = df_cluster[~df_cluster.is_core]
        plot_db_scan(p, df_none_core, color, 6, title)

# Show every column of the clustered frame on hover.
hover_tool = HoverTool(tooltips=[(c, f'@{c}') for c in data.columns])
p.add_tools(hover_tool)
p.legend.click_policy = "mute"
p.legend.label_text_font_size = "8px"
show(p)

# Display the image at median_path below the cluster plot, if that file is present.
if median_path.exists():
    fig, ax = plt.subplots(figsize=(20, 10))
    img = mpimg.imread(str(median_path))
    imgplot = ax.imshow(img, cmap='gray')
    plt.show()

MIN_SAMPLES: 40
Estimated number of clusters: 11
Estimated number of noise points: 1564
Silhouette Coefficient: -0.036


In [13]:
# df_debug = data.sort_values('id')
# df_debug[df_debug.cluster != -1].iloc[150:200,:]