In [None]:
import numpy as np
import torch
from torch import utils
import pandas as pd
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torch import nn
from torch.nn import functional as F
import pytorch_lightning as pl
from matplotlib import pyplot as plt
from scipy import signal as sig
import os
from pathlib import Path
import re
from torch.utils import data
import pandas as pd
import numpy as np
from smooth import preproc
from pathlib import Path
from dataloader import LandmarkDataset, SequenceDataset

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Collect every landmark .h5 file under <data_root>/<subdir>/Down whose name matches the
# pattern below.
data_root = Path("/media/newdrive/leto_backup/K6/")
landmark_files = []
# sorted(): os.listdir returns entries in arbitrary order, and later cells pick files by
# position (landmark_files[2]), so the order has to be deterministic.
for subdir in sorted(os.listdir(data_root)):
    down_dir = data_root/subdir/'Down'
    if not down_dir.is_dir():
        # Skip entries that have no 'Down' folder instead of crashing on them.
        continue
    for file in sorted(os.listdir(down_dir)):
        if re.match(r"00\d*DeepCut_resnet50_Down2May25shuffle1_1030000\.h5", file):
            lfile = down_dir/file
            landmark_files.append(lfile)
            


In [None]:

# Load one hard-coded landmark file and wrap its coordinates as sequences of length 60.
landmarks_file = Path('/media/newdrive/leto_backup/K6/2020-03-31/Down/0015DeepCut_resnet50_DownMay7shuffle1_1030000.h5')
landmarks_data = LandmarkDataset(landmarks_file)
# coords is flattened to one row per entry of landmarks_data before being sequenced.
# NOTE: `data` rebinds the name imported by `from torch.utils import data` in the first cell.
data = SequenceDataset(landmarks_data.coords.reshape(len(landmarks_data), -1), seqlen=60, step=1, )

In [None]:
# Use the third discovered landmark file; which file that is depends on the order of
# `landmark_files`.
landmarks_file = landmark_files[2]
class SimpleAutoencoder(pl.LightningModule):
    """Fully connected autoencoder trained to reconstruct its input with an MSE loss.

    The encoder is a stack of ``nn.Linear`` layers sized by consecutive entries of
    ``n_neurons`` with ``nn.ELU`` between them (none after the last layer); the decoder
    mirrors it with the sizes reversed.

    Args:
        n_neurons: Layer widths from input to bottleneck.
        lr: Learning rate handed to Adam.
        seqlen: Sequence length passed to ``SequenceDataset`` in ``prepare_data``.
        landmark_files: Paths that ``prepare_data`` loads with ``LandmarkDataset``.
            The default is bound to the notebook-level ``landmark_files`` at the moment
            the class statement runs.
    """

    def __init__(self, n_neurons=(203, 128, 128, 7), lr=1e-3, seqlen=30, landmark_files=landmark_files):
        super().__init__()
        self.landmark_files = landmark_files
        self.seqlen = seqlen
        # NOTE(review): assigning hparams directly follows the same dict-based Lightning API
        # as the 'log' return values below — confirm against the installed version.
        self.hparams = {'lr': lr}
        n_neurons = list(n_neurons)
        # Encoder is built first so parameter creation order matches the original layout.
        self.encoder = self._build_mlp(n_neurons)
        self.decoder = self._build_mlp(n_neurons[::-1])

    @staticmethod
    def _build_mlp(sizes):
        """Return Linear layers for consecutive ``sizes`` with ELU between (not after) them."""
        layers = []
        last = len(sizes) - 2
        for i, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if i < last:
                layers.append(nn.ELU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` and decode it back to the input width."""
        return self.decoder(self.encoder(x))

    def prepare_data(self):
        """Load the landmark files and build ``self.train_ds`` / ``self.valid_ds``.

        Each recording is decimated by 4 along axis 0; its first 80% of rows go to
        training (sequence step 1) and the remaining rows to validation (step 10).

        Raises:
            RuntimeError: If none of ``self.landmark_files`` could be loaded.
        """
        import warnings

        landmark_datasets = []
        for file in self.landmark_files:
            try:
                landmark_datasets.append(LandmarkDataset(file))
            except OSError as err:
                # Still best-effort, but no longer silent about which files were dropped.
                warnings.warn(f"Skipping unreadable landmark file {file}: {err}")
        if not landmark_datasets:
            raise RuntimeError("No landmark file could be loaded; cannot build datasets")

        coords = [sig.decimate(ds.coords, q=4, axis=0).astype(np.float32) for ds in landmark_datasets]
        # The landmark count of the first recording fixes the flattened row width for all.
        _, n_coords, _ = coords[0].shape
        train_data, valid_data = [], []
        for crds in coords:
            cut = int(0.8*crds.shape[0])
            train_data.append(crds[:cut].reshape(-1, n_coords*2))
            valid_data.append(crds[cut:].reshape(-1, n_coords*2))
        train_dsets = [SequenceDataset(data, seqlen=self.seqlen, step=1, diff=False) for data in train_data]
        valid_dsets = [SequenceDataset(data, seqlen=self.seqlen, step=10, diff=False) for data in valid_data]
        self.train_ds = ConcatDataset(train_dsets)
        self.valid_ds = ConcatDataset(valid_dsets)

    def train_dataloader(self):
        """Shuffled training loader over ``self.train_ds``."""
        return DataLoader(self.train_ds, batch_size=256, shuffle=True, num_workers=4)

    def val_dataloader(self):
        """Validation loader over ``self.valid_ds`` (shuffled, as a later cell samples from it)."""
        return DataLoader(self.valid_ds, batch_size=256, shuffle=True, num_workers=4)

    def configure_optimizers(self):
        """Adam with a ReduceLROnPlateau schedule (factor 0.2, patience 20, floor 1e-6)."""
        opt = torch.optim.Adam(self.parameters(), self.hparams['lr'])
        sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, factor=0.2 ,patience=20, verbose=True, min_lr=1e-6)
        return [opt], [sched]

    def _reconstruction_loss(self, batch):
        """MSE between ``batch`` and its reconstruction."""
        return nn.functional.mse_loss(self(batch), batch)

    def training_step(self, batch, batch_idx):
        """Return the reconstruction loss for one training batch."""
        loss = self._reconstruction_loss(batch)
        logs = {'loss': loss}
        return {'loss': loss, 'log': logs}

    def validation_step(self, batch, batch_idx):
        """Return the reconstruction loss for one validation batch."""
        loss = self._reconstruction_loss(batch)
        # Logged as 'val_loss' so it cannot overwrite the training 'loss' entry.
        logs = {'val_loss': loss}
        return {'val_loss': loss, 'log': logs}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses."""
        mean_loss = torch.stack([out['val_loss'] for out in outputs]).mean()
        # 'log' carries the epoch value to the logger in the dict-return Lightning API —
        # NOTE(review): confirm against the installed Lightning version.
        return {"val_loss": mean_loss, "log": {"val_loss": mean_loss}}
         


In [None]:
# Train the autoencoder: 720-wide input (2*12*30) down to a 32-dimensional code.
# The input width presumably is 2 coords x 12 landmarks x seqlen 30 and has to match the
# item size produced by SequenceDataset — TODO confirm.
model = SimpleAutoencoder(n_neurons=[2*12*30, 2048, 1024, 512, 32], lr=1e-4)
trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=12, max_epochs=50, )
trainer.fit(model)


In [None]:
# Visual sanity check: overlay one validation sample and its reconstruction.
dl = model.val_dataloader()
bx = next(iter(dl))
# NOTE(review): bx is fed to the model without an explicit device move — confirm the model
# and the batch are on the same device after trainer.fit.
with torch.no_grad():
    out = model(bx)
# reshape(30, 24)[:, 0] selects the first of 24 features over 30 steps
# (assumes each item is a flattened 30x24 window — TODO confirm against SequenceDataset).
# Index 150 requires the first batch to contain at least 151 items.
plt.plot(bx[150].cpu().numpy().reshape(30, 24)[:,0], label='orig')
plt.plot(out[150].cpu().numpy().reshape(30, 24)[:,0], label='recon')
plt.legend()

In [None]:
def create_encoded_data(data, model, batch_size=256, device="cuda"):
    """Run every item of ``data`` through ``model.encoder`` and stack the results.

    Args:
        data: Dataset (anything ``DataLoader`` accepts) yielding tensors.
        model: Module exposing an ``encoder`` sub-module; it is moved to ``device``.
        batch_size: Number of items encoded per forward pass.
        device: Device used for encoding. Defaults to ``"cuda"``, which is what this
            function always used before the parameter existed.

    Returns:
        ``np.ndarray`` with the encoder outputs concatenated along axis 0, in dataset order.

    Raises:
        ValueError: If ``data`` yields no batches.
    """
    dl = DataLoader(data, batch_size=batch_size, shuffle=False)
    model.to(device)
    # Encode in eval mode, then put the model back into whatever mode it was in.
    was_training = model.training
    model.eval()
    encoded = []
    try:
        with torch.no_grad():
            for bx in dl:
                x_encoded = model.encoder(bx.to(device))
                encoded.append(x_encoded.cpu().numpy())
    finally:
        model.train(was_training)
    if not encoded:
        raise ValueError("data is empty; nothing to encode")
    return np.concatenate(encoded)

# Encode one full recording (the third discovered file) with the trained encoder.
landmarks_file = landmark_files[2]
landmarks_data = LandmarkDataset(landmarks_file)
# Decimation factor q=4 matches the one used in SimpleAutoencoder.prepare_data.
coords = sig.decimate(landmarks_data.coords, axis=0, q=4).astype(np.float32)
# NOTE: `data` rebinds the name imported by `from torch.utils import data` in the first cell.
data = SequenceDataset(coords.reshape(len(coords), -1), seqlen=30, step=1, diff=False)

X_encoded = create_encoded_data(data, model)
X_encoded.shape

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, confusion_matrix, accuracy_score

# Cluster the encoded windows into 30 groups.
# random_state is fixed because cells further down refer to specific cluster ids (via the
# letters in their regexes); without a seed those ids change on every run.
kmeans = KMeans(n_clusters=30, random_state=0)
labels = kmeans.fit_predict(X_encoded)

In [None]:
from collections import defaultdict
import random
# Run-length encode the label sequence: every entry of `sequence` is
# [label, start_index, run_length] for one run of identical consecutive labels.
split_at = np.where(np.diff(labels) != 0)[0] + 1
# Prepending 0 gives the start of every run and also works when the label never changes
# (split_at empty), which previously raised an IndexError.
run_starts = np.concatenate(([0], split_at))
sequence = [[run[0], int(start), len(run)]
            for start, run in zip(run_starts, np.split(labels, indices_or_sections=split_at))]

# Run lengths per label, collected before short runs are filtered out.
seg_lengths = defaultdict(list)
for seg in sequence:
    seg_lengths[seg[0]].append(seg[2])

# Keep only runs longer than 5 labels.
sequence = [seq for seq in sequence if seq[2] > 5]

# Per label: (start*4 + length*2, length*2) for every remaining run.
# Presumably the factor 4 undoes the q=4 decimation, giving (middle frame, half length)
# in original video frames — TODO confirm.
cluster_frames = defaultdict(list)
for seq in sequence:
    cluster_frames[seq[0]].append((seq[1]*4 + seq[2]*2, seq[2]*2))

# Only clusters with more than 25 runs are sampled.
cluster_frames = {c:cl for c, cl in cluster_frames.items() if len(cl)>25}
# A local seeded generator makes the 15 picks per cluster repeatable.
# Note: choices() draws with replacement, so a run can be picked more than once.
sample_rng = random.Random(0)
cluster_samples = {c: sample_rng.choices(cl, k=15) for c, cl in cluster_frames.items()}

In [None]:
# Scratch check: softplus with beta=2 evaluated at -3.
F.softplus(torch.Tensor(np.array([-3])), beta=2)

In [None]:
import cv2 as cv
import importlib
import clip_videos
# Reload so that edits to the local clip_videos module take effect without a kernel restart.
importlib.reload(clip_videos)

# NOTE(review): the video path is hard-coded while `landmarks_file` is picked by position
# from `landmark_files` — confirm both refer to the same recording.
video_file = data_root/'2020-03-23'/'Down'/'0008DeepCut_resnet50_Down2May25shuffle1_1030000_labeled.mp4'

In [None]:
# Per cluster: mean and standard deviation of the second tuple element (seq[2]*2) over
# the sampled runs.
for cl_id,  cluster in cluster_samples.items():
    n_frames = [c[1] for c in cluster]
    print(cl_id, np.mean(n_frames), np.std(n_frames))

In [None]:
# Write one collage video per cluster from its sampled runs.
df = LandmarkDataset(landmarks_file).df
# Make sure the output folder exists before anything is written into it.
Path('clusters/example_1').mkdir(parents=True, exist_ok=True)
for cl_id,  cluster in cluster_samples.items():
    mid_frames = [c[0] for c in cluster]
    clip_lengths = [c[1] for c in cluster]
    # Frames shown around each mid frame: mean + one std of the sampled lengths, capped at 60.
    n_frames = int(min(60, np.mean(clip_lengths) + np.std(clip_lengths)))
    print(mid_frames)
    clip_videos.save_collage_with_labels_short(str(video_file), df, mid_frames, n_frames_around=n_frames,
                                         save_file=f'clusters/example_1/cluster_{cl_id}.mp4')

In [None]:
# NOTE(review): scratch line disabled. `writer` is not defined in any cell visible in this
# notebook and `np.zeros()` is missing its required shape argument, so the cell always
# raised and stopped "Run All" before the cells below it.
# writer.write(np.zeros())

In [None]:
# List the contents of the cluster clip output folder.
!ls clusters/example_1

In [None]:
# sequence = [seq[0] for seq in sequence]
from collections import defaultdict, Counter

def count_ngrams(sequence, max_n=10):
    """Count every contiguous n-gram of ``sequence`` for n = 1 .. max_n - 1.

    Args:
        sequence: Sliceable sequence of hashable items.
        max_n: Exclusive upper bound on the n-gram length.

    Returns:
        ``defaultdict(int)`` mapping each n-gram (as a tuple) to its number of occurrences.
    """
    N = len(sequence)
    counter = defaultdict(int)
    for k in range(1, max_n):
        # A sequence of length N holds N - k + 1 windows of length k; the previous bound
        # (N - k) never counted the last window of each length.
        for i in range(N - k + 1):
            counter[tuple(sequence[i:i+k])] += 1
    return counter

def segment_sequence(sequence, max_n=5):
    """Split ``sequence`` into segments at local maxima of an n-gram based boundary score.

    For every position ``k`` the score compares the count of the n-gram ending at ``k``
    and the count of the n-gram starting at ``k + 1`` with the counts of the n-grams that
    straddle the gap between ``k`` and ``k + 1``. A cut is placed after every position
    whose score is strictly larger than both neighbouring scores.

    Args:
        sequence: Sequence of hashable items.
        max_n: Passed to ``count_ngrams`` and used as exclusive bound on the n-gram
            length considered for the score.

    Returns:
        List of lists that together contain the items of ``sequence`` in order.
        An empty ``sequence`` yields an empty list.
    """
    N = len(sequence)
    if N == 0:
        # np.zeros(N - 1) below cannot be built for an empty input.
        return []
    sequence = tuple(sequence)
    ngram_count = count_ngrams(sequence, max_n=max_n)
    u_arr = np.zeros(N - 1)
    for k in range(0, N - 1):
        u_k = 0
        for n in range(2, min(max_n, k, N-k)):
            # s_1: count of the n-gram ending at k; s_2: count of the n-gram starting at k+1.
            s_1, s_2 = ngram_count[sequence[k-n+1:k+1]], ngram_count[sequence[k+1:k+n+1]]
            # Fraction of straddling n-grams that are no more frequent than s_1 / s_2.
            u_k += np.mean([1 if s_1 >= ngram_count[sequence[k-n+i+1:k+i+1]] else 0 for i in range(1, n)])
            u_k += np.mean([1 if s_2 >= ngram_count[sequence[k-n+i+1:k+i+1]] else 0 for i in range(1, n)])
        u_arr[k] = u_k / max_n / 2

    sequence = list(sequence)
    segments = []
    prev_idx = 0
    # u_arr has N - 1 entries, so idx + 1 must stay <= N - 2. The former bound (N - 1)
    # reached idx = N - 2 and stayed in range only because u_arr[N - 2] is always 0,
    # which made the first comparison fail and short-circuit the second.
    for idx in range(1, N - 2):
        if u_arr[idx-1] < u_arr[idx] and u_arr[idx+1] < u_arr[idx]:
            segments.append(sequence[prev_idx:idx+1])
            prev_idx = idx+1

    segments.append(sequence[prev_idx:])
    return segments

# `sequence` holds [label, start, length] lists, which cannot be hashed as n-gram items;
# segment on the run labels only.
segments = segment_sequence([seq[0] for seq in sequence], max_n=10)

In [None]:
# The 30 most frequent segments (converted to tuples so they can be counted).
Counter(map(tuple, segments)).most_common(30)

In [None]:
# Cluster label over the first 2000 windows.
plt.figure(figsize=(20, 4))
plt.plot(labels[:2000])

In [None]:
# NOTE(review): the second line is a bare tuple expression (it looks like a pasted note of
# a recurring label pattern). Being the last expression, it is what the cell displays
# instead of labels[:100].
labels[:100]
27, 8, 26, 4, 10, 27, 8, 26, 4, 10, 27, 8, 26

In [None]:
# Cluster label for windows 31500 .. 32999.
plt.figure(figsize=(20, 4))
plt.plot(labels[3*10**4+1500:3*10**4+3000])

In [None]:
# Cluster label for windows 101500 .. 102999.
plt.figure(figsize=(20, 4))
plt.plot(labels[10**5+1500:10**5+3000])
# plt.plot(labels[250:400])

In [None]:
# Scratch check of the repeated-group pattern style used on the label string below.
re.search(r"(a+b+c+)+", "daaabbbccabc")

In [None]:
# Map every cluster id to one character so the label sequence can be searched with regexes.
# NOTE: both ranges stop before 'Z' / 'z', so `chars` is 'A'..'Y' followed by 'a'..'y'
# (50 symbols) and id 25 maps to 'a'. The regexes in later cells rely on this mapping.
chars = [chr(i) for i in range(ord('A'), ord('Z'))] + [chr(i) for i in range(ord('a'), ord('z'))]
labels_string = ''.join([chars[l] for l in labels])
labels_string[280:400]

In [None]:
# Find stretches made of one or more repetitions of the run chain K -> Q -> M -> d -> b.
pat = re.compile(r"(K+Q+M+d+b+)+")
# (start, end) indices into labels_string for every match, and the match lengths.
spans = [match.span() for match in re.finditer(pat, labels_string)]
span_lengths = [span[1] - span[0] for span in spans]
spans[3] 

In [None]:
# Runs of 'K' that are not followed by optional 'Q's and then another 'K'.
pat = re.compile(r"K+(?!Q*K+)")
fspans = [match.span() for match in re.finditer(pat, labels_string)]
# Widen every span by 30 labels on both sides (start clamped at 0).
fspans = [(max(0, s[0]-30), s[1]+30) for s in fspans]
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(18, 20))
# Plots the first 10 spans; raises IndexError if fewer than 10 matches exist.
for i in range(10):    
    for ipart, part in enumerate(landmarks_data.body_parts):
        # NOTE(review): 'forePawL' is cased differently from the other three names —
        # confirm it matches an entry of body_parts.
        if part in ['forepawR', 'forePawL', 'hindpawR', 'hindpawL']:
            # coords is read 15 rows later than the label span — presumably to centre the
            # 30-step window each label was computed from; TODO confirm.
            axes[i][0].plot(coords[fspans[i][0]+15: fspans[i][1]+15,ipart,0], label=f"{part}_x")
            axes[i][0].plot(coords[fspans[i][0]+15: fspans[i][1]+15,ipart,1], label=f"{part}_y")
    axes[i][1].plot(labels[slice(*fspans[i])])
    axes[i][0].legend(loc='right')

In [None]:
# Number of matches where the K -> Q -> M -> d -> b chain stops after each stage:
# a K run not followed by Q, K+Q+ not followed by M, K+Q+M+ not followed by d,
# and K+Q+M+d+ not followed by b.
print(len(re.findall(r"K+(?!K*Q+)", labels_string)))
print(len(re.findall(r"K+Q+(?!Q*M+)", labels_string)))
print(len(re.findall(r"K+Q+M+(?!M*d+)", labels_string)))
print(len(re.findall(r"K+Q+M+d+(?!d*b+)", labels_string)))


In [None]:
# Paw coordinates (left column) and cluster labels (right column) for the first 50 full
# K -> Q -> M -> d -> b matches in `spans`; raises IndexError if fewer than 50 exist.
fig, axes = plt.subplots(nrows=50, ncols=2, figsize=(18, 200))
for i in range(50):    
    for ipart, part in enumerate(landmarks_data.body_parts):
        # NOTE(review): 'forePawL' is cased differently from the other three names —
        # confirm it matches an entry of body_parts.
        if part in ['forepawR', 'forePawL', 'hindpawR', 'hindpawL']:
            # coords is read 15 rows later than the label span — presumably to centre the
            # 30-step window each label was computed from; TODO confirm.
            axes[i][0].plot(coords[spans[i][0]+15: spans[i][1]+15,ipart,0], label=f"{part}_x")
            axes[i][0].plot(coords[spans[i][0]+15: spans[i][1]+15,ipart,1], label=f"{part}_y")
    axes[i][1].plot(labels[slice(*spans[i])])
    axes[i][0].legend(loc='right')

In [None]:
# Cluster label over the first 1000 windows.
plt.figure(figsize=(20, 4))
plt.plot(labels[:1000])

In [None]:
# transition_matrix[i, j] counts how often label i is immediately followed by label j.
# Sized from the largest label id so that non-contiguous ids cannot index out of range.
n_clusters = int(np.max(labels)) + 1
transition_matrix = np.zeros((n_clusters, n_clusters))
# Unbuffered add so repeated (i, j) pairs accumulate, replacing the per-element Python loop.
np.add.at(transition_matrix, (labels[:-1], labels[1:]), 1.)

# Ignore self-transitions.
np.fill_diagonal(transition_matrix, val=0)

# Normalise every column to sum to 1 (rows = current label, columns = next label).
# Columns without any incoming transition stay 0 instead of turning into NaN.
col_totals = transition_matrix.sum(axis=0, keepdims=True)
np.divide(transition_matrix, col_totals, out=transition_matrix, where=col_totals > 0)
plt.imshow(transition_matrix)

In [None]:
def split(idx_arr):
    """Break ``idx_arr`` into pieces wherever neighbouring values differ by more than 1.

    Returns the list of sub-arrays produced by ``np.split``.
    """
    jumps = np.abs(np.diff(idx_arr))
    cut_points = np.nonzero(jumps > 1)[0] + 1
    return np.split(idx_arr, cut_points)
# NOTE(review): `y_gold` is not defined in any cell visible in this notebook — confirm
# where it comes from before running this cell on a fresh kernel.
# For every distinct value in y_gold: the runs of consecutive indices carrying that value.
behaviors = [split(np.where(y_gold==lbl)[0]) for lbl in set(y_gold)]
# Mean encoded vector of every run, stacked per value.
sections = [np.stack([np.mean(X_encoded[sec], axis=0) for sec in beh]) for beh in behaviors]
sections[1].shape