In [None]:
import os
import gc
import time

from IPython.display import clear_output
from tqdm import tqdm
from tqdm.contrib import tzip

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import umap

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Resize
import matplotlib.pyplot as plt

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [None]:
load_pretrained = True
load_embs = True
test_eval = False
find_threshold = False
get_dists = False

In [None]:
train_df = pd.read_csv("/kaggle/input/hms-harmful-brain-activity-classification/train.csv")

In [None]:
train_df

In [None]:
train_df.iloc[:,-6:] = train_df.iloc[:,-6:].values / train_df.iloc[:,-6:].sum(axis=1).values.reshape((-1, 1))

In [None]:
cols = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
train_df['entropy'] = -(train_df[cols] * np.log(train_df[cols])).sum(axis=1)

In [None]:
#Загружаю все спеки разом
if not load_embs:
    dct = np.load('/kaggle/input/default-specs/spectograms.npy', allow_pickle=True).item()

In [None]:
#Делю по пациентам на трейн, валидацию и тест
patients = train_df.patient_id.unique()
patients_train, patients_val_test, _, _ = train_test_split(patients, np.arange(len(patients)), test_size=0.3, random_state=123)
patients_val, patients_test, _, _ = train_test_split(patients_val_test, np.arange(len(patients_val_test)), test_size=0.5, random_state=123)

In [None]:
train_patients = train_df.loc[train_df.patient_id.isin(patients_train)].copy().reset_index(drop=True)
val_patients = train_df.loc[train_df.patient_id.isin(patients_val)].copy().reset_index(drop=True)
test_patients = train_df.loc[train_df.patient_id.isin(patients_test)].copy().reset_index(drop=True)

In [None]:
# Streaming mean / standard deviation: computed sample by sample because the
# vectorised numpy equivalents run out of memory on the full array.
def online_mean_std(data):
    """Accumulate element-wise mean and variance over ``data`` in one pass.

    Each item of ``data`` is passed through ``np.nan_to_num`` and folded into
    a running mean and a running sum of squared deviations (Welford-style
    update). The element-wise sample variance (divisor ``n - 1``) is then
    averaged over all positions before the square root is taken.

    Returns:
        ``(std, mean)`` -- note the order: the standard deviation comes first.
        Both are scalars obtained with ``.mean()`` over the element-wise
        statistics.

    NOTE(review): with fewer than two items the ``n - 1`` divisor is zero or
    negative; callers are presumably expected to pass many samples.
    """
    count = 0
    running_mean = 0
    sq_dev_sum = 0

    for count, sample in enumerate(tqdm(data), start=1):
        sample = np.nan_to_num(sample)
        # Deviation from the mean *before* this sample is folded in.
        dev_before = sample - running_mean
        running_mean = running_mean + dev_before / count
        # Second factor uses the mean *after* the update.
        sq_dev_sum = sq_dev_sum + dev_before * (sample - running_mean)

    elementwise_var = sq_dev_sum / (count - 1)
    std = np.sqrt(elementwise_var.mean())
    return std, running_mean.mean()

In [None]:
# Считаю средние и ст.отклонения для каждого типа спектограмм
if not load_pretrained:
    means = []
    stds = []
    for el in ['LL', 'RL', 'LP', 'RP']:
        res = np.concatenate([dct[sid][el][None, :, int(slos)//2: int(slos)//2+300] for sid, slos in zip(train_patients.spectrogram_id, train_patients.spectrogram_label_offset_seconds)], axis=0)
        std, mean = online_mean_std(res)
        means.append(mean)
        stds.append(std)
        del res
        gc.collect()

    norm_mean = np.array(means).reshape((4, 1, 1))
    norm_std = np.array(stds).reshape((4, 1, 1))

### Срез `[None, :, int(slos)//2: int(slos)//2+300]` нужен, чтобы вырезать из каждой спектрограммы окно в 10 минут (300 столбцов). Новая ось добавляется, чтобы по ней конкатенировать спектрограммы.

In [None]:
def normalize(x):
    '''Standardize ``x`` with the module-level ``norm_mean`` / ``norm_std``.

    ``x`` is described by the original author as ``[c, h, w]``; the statistics
    are reshaped to ``(4, 1, 1)`` where they are computed, so they broadcast
    over a 4-channel input.

    NOTE(review): ``norm_mean`` and ``norm_std`` are only assigned in the cell
    guarded by ``if not load_pretrained:``. With ``load_pretrained = True``
    this function raises ``NameError`` unless the statistics are provided
    some other way -- confirm before running the dataset in that mode.
    '''
    return (x - norm_mean) / norm_std

class SpecDataset(Dataset):
    """Dataset that yields one normalized, resized 4-channel spectrogram crop per row.

    Args:
        df: frame with ``spectrogram_id`` and
            ``spectrogram_label_offset_seconds`` columns (one sample per row).
        dct: mapping ``spectrogram_id -> {'LL', 'RL', 'LP', 'RP'}`` arrays.
        img_size: ``(height, width)`` the crop is resized to.
    """
    # Per the original note, a crop in the data is 99 x 300.
    def __init__(self, df, dct, img_size=(99, 300)):
        self.df, self.dct, self.image_size = df, dct, img_size

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        spec = self.dct[row.spectrogram_id]
        # Offset in seconds is halved to get the first column; 300 columns are taken.
        first_col = int(row.spectrogram_label_offset_seconds) // 2
        window = slice(first_col, first_col + 300)
        # Stack the four montage regions along a new leading channel axis.
        channels = [spec[region][:, window] for region in ('LL', 'RL', 'LP', 'RP')]
        x = np.stack(channels, axis=0)
        x = torch.from_numpy(normalize(x)).float()
        x = torch.nan_to_num(x, 0)
        resize = Resize([self.image_size[0], self.image_size[1]])
        return resize(x)

## Сеть представляет собой просто ResNet-блоки, которые уменьшают/увеличивают ширину и высоту в два раза и, соответственно, увеличивают/уменьшают число каналов в два раза

In [None]:
class ResNetBlock(nn.Module):
    """Two-convolution residual block that can optionally rescale the feature map.

    Args:
        in_channels: number of channels of the input tensor.
        kernel_size: kernel size of every convolution; padding is
            ``kernel_size // 2``.
        modify: ``'downsample'`` (stride-2 convolution, channels doubled),
            ``'upsample'`` (stride-2 transposed convolution with
            ``output_padding=1``, channels halved) or a falsy value for a
            shape-preserving block.
        bn: when False, both normalization layers are ``nn.Identity``. This is
            now honoured in every mode (previously only ``'downsample'``
            respected it).

    Raises:
        ValueError: if ``modify`` is truthy but not one of the two supported
            modes (previously this only failed later, inside ``forward``).

    Sub-module names (``conv1``, ``conv2``, ``bn1``, ``bn2``, ``proj``), their
    bias settings and their registration order are unchanged, so state dicts
    saved with the previous implementation keep the same keys.
    """

    def __init__(self, in_channels, kernel_size, modify=False, bn=True):
        super().__init__()
        if modify and modify not in ('downsample', 'upsample'):
            raise ValueError(
                f"modify must be 'downsample', 'upsample' or falsy, got {modify!r}"
            )
        self.modify = modify
        padding = kernel_size // 2

        if modify == 'downsample':
            out_channels = in_channels * 2
            self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, stride=2, kernel_size=kernel_size, padding=padding, bias=False)
            self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, bias=False)
        elif modify == 'upsample':
            out_channels = in_channels // 2
            self.conv1 = nn.ConvTranspose2d(in_channels=in_channels, out_channels=out_channels, stride=2, kernel_size=kernel_size, output_padding=1, padding=padding, bias=False)
            self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, bias=False)
        else:
            out_channels = in_channels
            # The plain block keeps the default conv bias, as in the original code.
            self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding)
            self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding)

        # Same normalization rule for every mode.
        self.bn1 = nn.BatchNorm2d(out_channels) if bn else nn.Identity()
        self.bn2 = nn.BatchNorm2d(out_channels) if bn else nn.Identity()
        self.act = nn.ReLU()

        # The skip connection must match the main path's output shape.
        if modify == 'downsample':
            self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, stride=2, kernel_size=kernel_size, padding=padding)
        elif modify == 'upsample':
            self.proj = nn.ConvTranspose2d(in_channels=in_channels, out_channels=out_channels, stride=2, kernel_size=kernel_size, output_padding=1, padding=padding)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.modify:
            x = self.proj(x)
        out = x + out
        out = self.act(out)
        return out

    
class Encoder(nn.Module):
    """Convolutional encoder: a 7x7 stem (4 -> 16 channels) followed by eight
    down-sampling ``ResNetBlock`` stages named ``rnb1`` .. ``rnb8``.

    Every stage halves the spatial size and doubles the channels, so the
    output has 4096 channels.
    """

    _STAGE_WIDTHS = (16, 32, 64, 128, 256, 512, 1024, 2048)

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=4, out_channels=16, kernel_size=7, stride=1, padding=7 // 2)
        # Registered under the same attribute names, in the same order, as before.
        for stage_no, width in enumerate(self._STAGE_WIDTHS, start=1):
            setattr(self, f'rnb{stage_no}', ResNetBlock(width, 3, modify='downsample'))

    def forward(self, x):
        x = self.conv(x)
        for stage_no in range(1, len(self._STAGE_WIDTHS) + 1):
            x = getattr(self, f'rnb{stage_no}')(x)
        return x
    
class Decoder(nn.Module):
    """Convolutional decoder mirroring ``Encoder``: eight up-sampling
    ``ResNetBlock`` stages named ``rnb1`` .. ``rnb8`` (4096 -> 16 channels)
    followed by a 3x3 convolution that maps 16 channels back to 4.
    """

    _STAGE_WIDTHS = (4096, 2048, 1024, 512, 256, 128, 64, 32)

    def __init__(self):
        super().__init__()
        # Registered under the same attribute names, in the same order, as before.
        for stage_no, width in enumerate(self._STAGE_WIDTHS, start=1):
            setattr(self, f'rnb{stage_no}', ResNetBlock(width, 3, modify='upsample'))
        self.conv = nn.Conv2d(in_channels=16, out_channels=4, kernel_size=3, stride=1, padding=3 // 2)

    def forward(self, x):
        for stage_no in range(1, len(self._STAGE_WIDTHS) + 1):
            x = getattr(self, f'rnb{stage_no}')(x)
        return self.conv(x)

In [None]:
class SimpleAE(nn.Module):
    """Autoencoder: ``Encoder`` followed by ``Decoder`` inside ``self.net``.

    ``self.net[0]`` is the encoder and ``self.net[1]`` the decoder; other
    code in this notebook indexes them this way.
    """

    def __init__(self):
        super().__init__()
        encoder = Encoder()
        decoder = Decoder()
        self.net = nn.Sequential(encoder, decoder)

    def forward(self, x):
        reconstruction = self.net(x)
        return reconstruction

In [None]:
def run_epoch(model, dataloader, loss_fn, optimizer, epoch, device, scaler):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    The model is moved to ``device`` and put in training mode. Every batch is
    used both as input and as reconstruction target (``loss_fn(x, model(x))``).

    ``epoch`` and ``scaler`` are accepted but not used: the mixed-precision
    path that needed the scaler is currently disabled.

    Returns:
        ``np.nanmean`` of the per-batch losses (NaN batches are ignored).
    """
    model = model.to(device)
    model.train()
    batch_losses = []
    for inputs in tqdm(dataloader, total=len(dataloader)):
        inputs = inputs.to(device)

        reconstruction = model(inputs)
        batch_loss = loss_fn(inputs, reconstruction)

        batch_loss.backward()
        optimizer.step()
        # Gradients are cleared after the step, ready for the next batch.
        optimizer.zero_grad()

        batch_losses.append(batch_loss.item())
    return np.nanmean(batch_losses)

In [None]:
def evaluate(model, dataloader, loss_fn, device, scaler):
    """Return the mean reconstruction loss of ``model`` over ``dataloader``.

    The model is moved to ``device`` and switched to eval mode; no gradients
    are recorded. ``scaler`` is accepted for signature compatibility but is
    not used (the mixed-precision path is disabled).

    Returns:
        ``np.nanmean`` of the per-batch losses (NaN batches are ignored).
    """
    model = model.to(device)
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for inputs in tqdm(dataloader, total=len(dataloader)):
            inputs = inputs.to(device)
            reconstruction = model(inputs)
            batch_losses.append(loss_fn(inputs, reconstruction).item())
    return np.nanmean(batch_losses)

In [None]:
if not load_pretrained:
    # NOTE: `scaler` (and `run_experiment`) exist only when training from scratch.
    scaler = torch.cuda.amp.GradScaler()
    def run_experiment(model, dataloader_train, dataloader_val, loss_fn, optimizer, num_epochs, device, stop_after=5, scaler=scaler):
        """Train with early stopping, checkpointing and a wall-clock budget.

        Each epoch calls ``run_epoch`` then ``evaluate``. Whenever the
        validation loss improves, the model state dict is written to
        ``best_model.pth`` and the whole optimizer object to ``optimizer.pth``.
        Training stops when the validation loss has not improved for
        ``stop_after`` consecutive epochs, when ``num_epochs`` is reached, or
        when the remaining time budget is shorter than the last epoch.

        Returns:
            ``(losses_train, losses_val, model)``. The returned model is in
            its final state, not reloaded from the best checkpoint.
        """
        losses_train = []
        losses_val = []
        best_loss_val = np.inf
        c = 0  # consecutive epochs without a validation improvement
        total_runtime = 0
        for epoch in range(num_epochs):
            start = time.time()

            # Early stopping is checked at the start of the next epoch.
            if c == stop_after:
                print(f'Обучение остановлено, так как лосс на валидации не падал {stop_after} эпох')
                break

            loss_train = run_epoch(model, dataloader_train, loss_fn, optimizer, epoch, device, scaler)
            loss_val = evaluate(model, dataloader_val, loss_fn, device, scaler)
            losses_train.append(loss_train)
            losses_val.append(loss_val)
            # Wipe the previous epoch's progress bars and plot from the cell output.
            clear_output()
            if best_loss_val > loss_val:
                torch.save(model.state_dict(), 'best_model.pth')
                torch.save(optimizer, 'optimizer.pth')
                best_loss_val = loss_val
                c = 0
            else:
                c += 1

            print(f"epoch: {str(epoch).zfill(3)} | loss_train: {loss_train:5.5f} | loss_val: {loss_val:5.5f} | best_loss: {best_loss_val:5.5f}")

            plt.plot(losses_train, label='Loss train')
            plt.plot(losses_val, label='Loss val')
            plt.legend()
            plt.show()

            stop = time.time()
            runtime = stop - start
            total_runtime += runtime
            # Budget: 12 hours minus a 10-minute margin (presumably the session limit -- confirm).
            # Stop if another epoch as long as this one would not fit.
            if 12*60*60 - 600 - total_runtime < runtime:
                break

        return losses_train, losses_val, model

In [None]:
if not load_embs:
    dataset_train = SpecDataset(train_patients, dct, img_size=(256, 256))
    dataset_val = SpecDataset(val_patients, dct, img_size=(256, 256))
    dataset_test = SpecDataset(test_patients, dct, img_size=(256, 256))

    dataloader_train = DataLoader(
        dataset=dataset_train,
        batch_size=128,
        shuffle=True,
        drop_last=True
    )

    dataloader_val = DataLoader(
        dataset=dataset_val,
        batch_size=128,
        shuffle=False,
        drop_last=False
    )

    dataloader_test = DataLoader(
        dataset=dataset_test,
        batch_size=128,
        shuffle=False,
        drop_last=False
    )

In [None]:
# small, _ = torch.utils.data.random_split(dataset_train, [256, len(dataset_train) - 256])
# small_val, _ = torch.utils.data.random_split(dataset_train, [256, len(dataset_train) - 256])
# small_dataloader_train = DataLoader(
#     dataset=small,
#     batch_size=128,
#     shuffle=True,
#     drop_last=True
# )

# small_dataloader_val = DataLoader(
#     dataset=small_val,
#     batch_size=128,
#     shuffle=False,
#     drop_last=False
# )

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 3e-4
model = SimpleAE()
model= nn.DataParallel(model)
loss_fn = nn.MSELoss()
if not load_pretrained:
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    num_epochs = 100

In [None]:
def init_weights(w):
    """Xavier-uniform initialise the weight of Linear / Conv2d / ConvTranspose2d modules.

    Intended for ``model.apply``; any other module type is left untouched.
    Biases are not modified.
    """
    initialisable = (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)
    if not isinstance(w, initialisable):
        return
    nn.init.xavier_uniform_(w.weight)
if not load_pretrained:   
    model.apply(init_weights);

In [None]:
if load_pretrained and not load_embs:
    model.load_state_dict(torch.load('/kaggle/input/autoencoder-weights/best_model.pth', map_location=torch.device(device)))
# optimizer = torch.load('/kaggle/input/autoencoder-weights/optimizer.pth', map_location=torch.device(device))

In [None]:
# def nan_hook(self, inp, output):
#     if not isinstance(output, tuple):
#         outputs = [output]
#     else:
#         outputs = output

#     for i, out in enumerate(outputs):
#         nan_mask = torch.isnan(out)
#         if nan_mask.any():
#             print("In", self.__class__.__name__)
#             raise RuntimeError(f"Found NAN in output {i} at indices: ", nan_mask.nonzero(), "where:", out[nan_mask.nonzero()[:, 0].unique(sorted=True)])

# for submodule in model.modules():
#     submodule.register_forward_hook(nan_hook)

In [None]:
if not load_pretrained:
    losses_train, losses_val, model = run_experiment(model, dataloader_train, dataloader_val, loss_fn, optimizer, num_epochs, device, stop_after=15)

In [None]:
if not load_pretrained:
    plt.plot(losses_train, label='Loss train')
    plt.plot(losses_val, label='Loss val')
    plt.legend()

In [None]:
if test_eval:
    # `evaluate` never uses its `scaler` argument, and the global `scaler` is
    # only created when load_pretrained is False. Passing None avoids a
    # NameError when evaluating pretrained weights.
    print('Test loss:', evaluate(model, dataloader_test, loss_fn, device, None))

### Что-то типа Аркфейса

In [None]:
def get_embs(model, dataloader):
    """Encode every batch and collect embeddings plus per-sample reconstruction losses.

    Uses the module-level ``device`` and ``loss_fn``. ``model`` must be a
    wrapper exposing ``.module.net`` (encoder at index 0, decoder at index 1).

    Returns:
        ``(losses, embs)``: a list with one float loss per sample and a numpy
        array of embeddings concatenated along the batch axis.

    NOTE(review): ``squeeze(2, 3)`` assumes the encoder output is 1x1
    spatially, which depends on the input size -- confirm.
    """
    model = model.to(device)
    encoder = nn.DataParallel(model.module.net[0])
    decoder = nn.DataParallel(model.module.net[1])
    encoder.eval()
    decoder.eval()

    sample_losses, emb_batches = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            inputs = batch.to(device)

            latent = encoder(inputs)
            recon = decoder(latent)
            emb_batches.append(latent.squeeze(2, 3))
            # One loss value per sample rather than one per batch.
            sample_losses.extend(
                loss_fn(sample, sample_recon).detach().cpu().item()
                for sample, sample_recon in zip(inputs, recon)
            )
    all_embs = torch.cat(emb_batches, dim=0)
    return sample_losses, all_embs.detach().cpu().numpy()

In [None]:
if not load_embs:
    dataloader_train = DataLoader(
        dataset=dataset_train,
        batch_size=128,
        shuffle=False,
        drop_last=False
    )
    losses_test, embs_test = get_embs(model, dataloader_test)
    losses_val, embs_val = get_embs(model, dataloader_val)
    losses_train, embs_train = get_embs(model, dataloader_train)
    
    with open(r'losses_test.txt', 'w') as fp:
        for item in losses_test:
            fp.write("%s\n" % item)
        print('Done')
        
    with open(r'losses_val.txt', 'w') as fp:
        for item in losses_val:
            fp.write("%s\n" % item)
        print('Done')
        
    with open(r'losses_train.txt', 'w') as fp:
        for item in losses_train:
            fp.write("%s\n" % item)
        print('Done')
        
    losses_test = np.array(losses_test, dtype=np.float64)
    losses_val = np.array(losses_val, dtype=np.float64)
    losses_train = np.array(losses_train, dtype=np.float64)
    
    np.save('specs_autoencoder_test_embs.npy', embs_test)
    np.save('specs_autoencoder_val_embs.npy', embs_val)
    np.save('specs_autoencoder_train_embs.npy', embs_train)

In [None]:
if load_embs:
    def _read_losses(path):
        """Read one float per line from a text file into a float64 array.

        Lines are stripped rather than sliced with ``line[:-1]``, so a final
        line without a trailing newline keeps its last digit and ``\\r\\n``
        endings are handled. Blank lines are skipped.
        """
        with open(path, 'r') as fp:
            values = [line.strip() for line in fp if line.strip()]
        return np.array(values, dtype=np.float64)

    losses_test = _read_losses('/kaggle/input/autoencoder-weights/losses_test.txt')
    embs_test = np.load('/kaggle/input/autoencoder-weights/specs_autoencoder_test_embs.npy')

    losses_val = _read_losses('/kaggle/input/autoencoder-weights/losses_val.txt')
    embs_val = np.load('/kaggle/input/autoencoder-weights/specs_autoencoder_val_embs.npy')

    losses_train = _read_losses('/kaggle/input/autoencoder-weights/losses_train.txt')
    embs_train = np.load('/kaggle/input/autoencoder-weights/specs_autoencoder_train_embs.npy')

In [None]:
if get_dists:
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler().fit(embs_train)
    embs_train = sc.transform(embs_train)
    embs_val = sc.transform(embs_val)
    embs_test = sc.transform(embs_test)

In [None]:
if get_dists:
    from sklearn.metrics import pairwise_distances
    if find_threshold:
        dists_train = pairwise_distances(embs_train).astype(np.float32)
    dists_val = pairwise_distances(embs_val).astype(np.float32)
    dists_test = pairwise_distances(embs_test).astype(np.float32)

In [None]:
# def get_acc_fpr_tpr(threshold, df, dists_matrix):
#     preds = np.less(dists_matrix, threshold)
#     targets = []
    
#     for patient_id in df.patient_id:
#         targets.append((df.patient_id == patient_id).values.reshape((1,-1)))
#     targets = np.concatenate(targets, axis=0)
    
#     tp = np.sum(np.logical_and(preds, targets), axis=1)
#     fp = np.sum(np.logical_and(preds, np.logical_not(targets)), axis=1)
#     tn = np.sum(np.logical_and(np.logical_not(preds), np.logical_not(targets)), axis=1)
#     fn = np.sum(np.logical_and(np.logical_not(preds), targets), axis=1)

#     tpr = np.where(tp + fn==0, 0, tp / (tp + fn))
#     fpr = np.where(fp + tn==0, 0, fp / (fp + tn))
#     acc = float(tp + tn) / dist.size
#     return tpr.mean(), fpr.mean(), acc.mean()

In [None]:
def get_acc_fpr_tpr(index, patient_id, threshold, df, dists_matrix):
    """Score one row of a distance matrix as a same-patient detector.

    Row ``index`` of ``dists_matrix`` is thresholded (``distance < threshold``
    counts as "same patient") and compared with the ground truth
    ``df.patient_id == patient_id``. Every column of the row is scored,
    including the query sample itself.

    Returns:
        ``(tpr, fpr, acc)``. ``tpr`` / ``fpr`` are the integer ``0`` when
        their denominator is zero.
    """
    row = dists_matrix[index]
    predicted_same = np.less(row, threshold)
    actually_same = (df.patient_id == patient_id).values

    tp = np.count_nonzero(predicted_same & actually_same)
    fp = np.count_nonzero(predicted_same & ~actually_same)
    tn = np.count_nonzero(~predicted_same & ~actually_same)
    fn = np.count_nonzero(~predicted_same & actually_same)

    positives = tp + fn
    negatives = fp + tn
    tpr = float(tp) / float(positives) if positives != 0 else 0
    fpr = float(fp) / float(negatives) if negatives != 0 else 0
    acc = float(tp + tn) / row.size
    return tpr, fpr, acc

In [None]:
# def find_opt_threshold(df, dists_matrix, thresholds):
#     for th in tqdm(thresholds):
#         tpr_mean = []
#         fpr_mean = []
#         acc_mean = []
#         for i in range(len(df)):
#             tpr, fpr, acc = get_acc_fpr_tpr(th, df, dists_matrix)
#             tpr_mean.append(tpr)
#             fpr_mean.append(fpr)
#             acc_mean.append(acc)
#     return tpr_mean, fpr_mean, acc_mean

In [None]:
def find_opt_threshold(df, dists_matrix, thresholds):
    """Average TPR / FPR / accuracy over all rows for every candidate threshold.

    Args:
        df: frame with a ``patient_id`` column; row ``i`` (by position) must
            correspond to row ``i`` of ``dists_matrix``.
        dists_matrix: square pairwise-distance matrix.
        thresholds: iterable of distance thresholds to evaluate.

    Returns:
        ``(tpr_mean, fpr_mean, acc_mean)``: three lists with one mean value
        per threshold, in the order of ``thresholds``.
    """
    # Read patient ids by position. The previous `df.patient_id[i]` was a
    # label lookup, which raises KeyError or picks the wrong row when the
    # frame's index is not 0..n-1 (e.g. after filtering).
    patient_ids = df.patient_id.to_numpy()

    tpr_mean = []
    fpr_mean = []
    acc_mean = []
    for th in tqdm(thresholds):
        tpr_ = []
        fpr_ = []
        acc_ = []
        for i, p_id in enumerate(patient_ids):
            tpr, fpr, acc = get_acc_fpr_tpr(i, p_id, th, df, dists_matrix)
            tpr_.append(tpr)
            fpr_.append(fpr)
            acc_.append(acc)
        tpr_mean.append(np.mean(tpr_))
        fpr_mean.append(np.mean(fpr_))
        acc_mean.append(np.mean(acc_))
    return tpr_mean, fpr_mean, acc_mean

In [None]:
thresholds_train = np.arange(0, 1.001, 0.001)
if find_threshold:
    tpr_train, fpr_train, acc_train = find_opt_threshold(train_patients, dists_train, thresholds_train)
    np.save('tpr_train', np.array(tpr_train))
    np.save('fpr_train', np.array(fpr_train))
    np.save('acc_train', np.array(acc_train))
else:
    tpr_train = np.load('/kaggle/input/autoencoder-weights/tpr_train.npy')
    fpr_train = np.load('/kaggle/input/autoencoder-weights/fpr_train.npy')
    acc_train = np.load('/kaggle/input/autoencoder-weights/acc_train.npy')

In [None]:
i = 30
thresholds_train[i], tpr_train[i], fpr_train[i], acc_train[i]

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
axs[0].plot(thresholds_train, tpr_train)
axs[1].plot(thresholds_train, fpr_train)
axs[2].plot(fpr_train, tpr_train)
plt.show()

In [None]:
if get_dists:
    tpr_val, fpr_val, acc_val = find_opt_threshold(val_patients, dists_val, [thresholds_train[i]])
    tpr_test, fpr_test, acc_test = find_opt_threshold(test_patients, dists_test, [thresholds_train[i]])

In [None]:
if get_dists:
    tpr_val, fpr_val, acc_val

In [None]:
if get_dists:    
    tpr_test, fpr_test, acc_test

### Поиск аномальных наблюдений или пациентов (для которых loss большой)

In [None]:
train_patients['loss'] = losses_train
val_patients['loss'] = losses_val
test_patients['loss'] = losses_test

In [None]:
def add_author_classes(df, high_level_of_agreement=0.6, edge_std=0.1, proto_std=0.1):
    '''
    Label every row by how ambiguous the expert votes are, following the
    dataset authors' description:

    We call segments where there are high levels of agreement "idealized" patterns.
    Cases where ~1/2 of experts give a label as "other" and ~1/2 give one of the remaining five labels, we call "proto patterns".
    Cases where experts are approximately split between 2 of the 5 named patterns, we call "edge cases".

    df -- frame with the six ``*_vote`` columns holding vote fractions;
          an ``authors_class`` column is added in place and the frame is returned.
    high_level_of_agreement -- fraction above which a named pattern makes the row "idealized".
    edge_std -- half-width of the open interval around 0.5 used for "edge":
                exactly two of the five named votes must lie in (0.5 - edge_std, 0.5 + edge_std).
    proto_std -- half-width of the open interval around 0.5 used for "proto":
                 ``other_vote`` and at least one named vote must lie in (0.5 - proto_std, 0.5 + proto_std).

    Rows matching none of the rules keep the class "other". Later rules win:
    idealized < proto < edge.
    '''
    # Select the five named-pattern votes by name. The previous positional
    # slice `df.iloc[:, -7:-2]` depended on how many columns had been appended
    # to the frame. Once `entropy`, `loss` and `authors_class` were present it
    # picked up `other_vote` and `entropy` and dropped `seizure_vote` and
    # `lpd_vote`.
    named_votes = df[['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote']]
    other_vote = df['other_vote']

    df['authors_class'] = 'other'
    # "other" is deliberately left out of the idealized case: it formally fits
    # the description, but an "idealized other" makes little sense.
    idealized = (named_votes > high_level_of_agreement).any(axis=1)

    named_near_half_proto = (named_votes > 0.5 - proto_std) & (named_votes < 0.5 + proto_std)
    other_near_half = (other_vote > 0.5 - proto_std) & (other_vote < 0.5 + proto_std)
    proto = named_near_half_proto.any(axis=1) & other_near_half

    named_near_half_edge = (named_votes > 0.5 - edge_std) & (named_votes < 0.5 + edge_std)
    edge = named_near_half_edge.sum(axis=1) == 2

    df.loc[idealized, 'authors_class'] = 'idealized'
    df.loc[proto, 'authors_class'] = 'proto'
    df.loc[edge, 'authors_class'] = 'edge'
    return df

In [None]:
train_patients = add_author_classes(train_patients)
val_patients = add_author_classes(val_patients)
test_patients = add_author_classes(test_patients)

In [None]:
def group(df):
    """Mean reconstruction loss per patient, spectrogram, consensus label and author class.

    Returns:
        Four two-column frames ``(by_patient, by_spec, by_illness, by_authors)``,
        each holding the grouping key and the mean ``loss``.
    """
    def _mean_loss_by(key):
        # Select `loss` before aggregating. The previous `.mean('loss')`
        # passed the string as `numeric_only` and averaged every numeric
        # column just to keep one.
        return df.groupby(key)['loss'].mean().reset_index()

    by_patient = _mean_loss_by('patient_id')
    by_spec = _mean_loss_by('spectrogram_id')
    by_illness = _mean_loss_by('expert_consensus')
    by_authors = _mean_loss_by('authors_class')
    return by_patient, by_spec, by_illness, by_authors

In [None]:
by_patient_train, by_spec_train, by_illness_train, by_authors_train = group(train_patients)
by_patient_val, by_spec_val, by_illness_val, by_authors_val = group(val_patients)
by_patient_test, by_spec_test, by_illness_test, by_authors_test = group(test_patients)

In [None]:
by_illness_train

In [None]:
by_illness_val

In [None]:
by_illness_test

In [None]:
by_authors_train

In [None]:
by_authors_val

In [None]:
by_authors_test

In [None]:
train_patients['entropy'].hist(bins=20)
def entropy_mapper(entropy):
    """Map a vote-entropy value to its bucket label.

    Exactly zero gets ``'0'``; otherwise the first upper bound that is not
    exceeded picks the label, and anything above 1.25 (or not comparable,
    e.g. NaN) falls into ``'(1.25: )'``.
    """
    if entropy == 0:
        return '0'
    for upper_bound, label in ((0.75, '(0: 0.75]'), (1.25, '(0.75: 1.25]')):
        if entropy <= upper_bound:
            return label
    return '(1.25: )'
train_patients['entropy_group'] = train_patients['entropy'].map(lambda x: entropy_mapper(x))
val_patients['entropy_group'] = val_patients['entropy'].map(lambda x: entropy_mapper(x))
test_patients['entropy_group'] = test_patients['entropy'].map(lambda x: entropy_mapper(x))

In [None]:
train_patients['loss'].quantile([0.65, 0.75, 0.85, 0.9, 0.95])

In [None]:
def loss_mapper(loss):
    """Map a reconstruction loss to a quantile-bucket label.

    The upper bounds are hard-coded numbers (presumably the training-loss
    quantiles printed in the previous cell -- confirm if the model or data
    change). The first bound that is not exceeded picks the label; larger
    values, or values that compare false such as NaN, get ``'0.95<=x'``.
    """
    buckets = (
        (0.000014, '<0.65'),
        (0.00002, '0.65<=x<0.75'),
        (0.0022, '0.75<=x<0.85'),
        (0.064518, '0.85<=x<0.9'),
        (0.935622, '0.9<=x<0.95'),
    )
    for upper_bound, label in buckets:
        if loss <= upper_bound:
            return label
    return '0.95<=x'
train_patients['loss_group'] = train_patients['loss'].map(lambda x: loss_mapper(x))
val_patients['loss_group'] = val_patients['loss'].map(lambda x: loss_mapper(x))
test_patients['loss_group'] = test_patients['loss'].map(lambda x: loss_mapper(x))

### UMAP

In [None]:
umap_obj = umap.UMAP(n_components=2, random_state=42)
umap_data_train = umap_obj.fit_transform(embs_train)
umap_data_val = umap_obj.transform(embs_val)
umap_data_test = umap_obj.transform(embs_test)

In [None]:
def plot_group(umap_data_train, umap_data_val, umap_data_test, df_train, df_val, df_test, group_col, hue_order):
    """Draw Train / Val / Test 2-D scatter plots side by side, coloured by ``group_col``.

    Each ``umap_data_*`` array supplies x (column 0) and y (column 1); the
    matching frame supplies the hue values. The three panels share the x axis
    and use the same ``hue_order``.
    """
    fig, axes = plt.subplots(1, 3, sharex=True, figsize=(16,8))
    panels = (
        ('Train', umap_data_train, df_train),
        ('Val', umap_data_val, df_val),
        ('Test', umap_data_test, df_test),
    )
    for ax, (title, points, frame) in zip(axes, panels):
        sns.scatterplot(ax=ax, x=points[:,0], y=points[:,1], hue=frame[group_col], hue_order=hue_order, s=7)
        ax.set_title(title)

    # Shrink the legend text so it does not cover the points.
    for ax in axes:
        plt.setp(ax.get_legend().get_texts(), fontsize=7)
    plt.show()

In [None]:
hue_order_cons = ['GPD', 'LRDA', 'Seizure', 'Other', 'GRDA', 'LPD']
plot_group(umap_data_train, umap_data_val, umap_data_test, train_patients, val_patients, test_patients, group_col='expert_consensus', hue_order=hue_order_cons)

In [None]:
hue_order_cls = ['idealized', 'proto', 'edge', 'other'] 
plot_group(umap_data_train, umap_data_val, umap_data_test, train_patients, val_patients, test_patients, group_col='authors_class', hue_order=hue_order_cls)

In [None]:
# `entropy_mapper` and the `entropy_group` columns are already created earlier
# in the notebook (anomaly-search section). The byte-identical redefinition and
# recomputation that used to sit here were removed so the function has a single
# definition. Only the histogram is kept for reference next to the UMAP plots.
train_patients['entropy'].hist(bins=20)

In [None]:
hue_order_ent = ['0', '(0: 0.75]', '(0.75: 1.25]', '(1.25: )'] 
plot_group(umap_data_train, umap_data_val, umap_data_test, train_patients, val_patients, test_patients, group_col='entropy_group', hue_order=hue_order_ent)

In [None]:
hue_order_loss = ['<0.65', '0.65<=x<0.75', '0.75<=x<0.85', '0.85<=x<0.9', '0.9<=x<0.95', '0.95<=x'] 
plot_group(umap_data_train, umap_data_val, umap_data_test, train_patients, val_patients, test_patients, group_col='loss_group', hue_order=hue_order_loss)

### Remove Other label

In [None]:
train_patients_no_other = train_patients[train_patients['authors_class'] != 'other'].copy()
val_patients_no_other = val_patients[val_patients['authors_class'] != 'other'].copy()
test_patients_no_other = test_patients[test_patients['authors_class'] != 'other'].copy()

In [None]:
umap_obj = umap.UMAP(n_components=2, random_state=42)
umap_data_train_no_other = umap_obj.fit_transform(embs_train[train_patients_no_other.index])
umap_data_val_no_other = umap_obj.transform(embs_val[val_patients_no_other.index])
umap_data_test_no_other = umap_obj.transform(embs_test[test_patients_no_other.index])

In [None]:
hue_order_cons = ['GPD', 'LRDA', 'Seizure', 'Other', 'GRDA', 'LPD']
plot_group(umap_data_train_no_other, umap_data_val_no_other, umap_data_test_no_other, train_patients_no_other, val_patients_no_other, test_patients_no_other, group_col='expert_consensus', hue_order=hue_order_cons)

In [None]:
hue_order_cls = ['idealized', 'proto', 'edge'] 
plot_group(umap_data_train_no_other, umap_data_val_no_other, umap_data_test_no_other, train_patients_no_other, val_patients_no_other, test_patients_no_other, group_col='authors_class', hue_order=hue_order_cls)

In [None]:
hue_order_ent = ['0', '(0: 0.75]', '(0.75: 1.25]', '(1.25: )'] 
plot_group(umap_data_train_no_other, umap_data_val_no_other, umap_data_test_no_other, train_patients_no_other, val_patients_no_other, test_patients_no_other, group_col='entropy_group', hue_order=hue_order_ent)

In [None]:
hue_order_loss = ['<0.65', '0.65<=x<0.75', '0.75<=x<0.85', '0.85<=x<0.9', '0.9<=x<0.95', '0.95<=x'] 
plot_group(umap_data_train_no_other, umap_data_val_no_other, umap_data_test_no_other, train_patients_no_other, val_patients_no_other, test_patients_no_other, group_col='loss_group', hue_order=hue_order_loss)

### Remove high entropy

In [None]:
train_patients_no_ent = train_patients[train_patients['entropy_group'] != '(1.25: )'].copy()
val_patients_no_ent = val_patients[val_patients['entropy_group'] != '(1.25: )'].copy()
test_patients_no_ent = test_patients[test_patients['entropy_group'] != '(1.25: )'].copy()

In [None]:
umap_obj = umap.UMAP(n_components=2, random_state=42)
umap_data_train_no_ent = umap_obj.fit_transform(embs_train[train_patients_no_ent.index])
umap_data_val_no_ent = umap_obj.transform(embs_val[val_patients_no_ent.index])
umap_data_test_no_ent = umap_obj.transform(embs_test[test_patients_no_ent.index])

In [None]:
hue_order_cons = ['GPD', 'LRDA', 'Seizure', 'Other', 'GRDA', 'LPD']
plot_group(umap_data_train_no_ent, umap_data_val_no_ent, umap_data_test_no_ent, train_patients_no_ent, val_patients_no_ent, test_patients_no_ent, group_col='expert_consensus', hue_order=hue_order_cons)

In [None]:
hue_order_cls = ['idealized', 'proto', 'edge'] 
plot_group(umap_data_train_no_ent, umap_data_val_no_ent, umap_data_test_no_ent, train_patients_no_ent, val_patients_no_ent, test_patients_no_ent, group_col='authors_class', hue_order=hue_order_cls)

In [None]:
hue_order_ent = ['0', '(0: 0.75]', '(0.75: 1.25]'] 
plot_group(umap_data_train_no_ent, umap_data_val_no_ent, umap_data_test_no_ent, train_patients_no_ent, val_patients_no_ent, test_patients_no_ent, group_col='entropy_group', hue_order=hue_order_ent)

In [None]:
hue_order_loss = ['<0.65', '0.65<=x<0.75', '0.75<=x<0.85', '0.85<=x<0.9', '0.9<=x<0.95', '0.95<=x'] 
plot_group(umap_data_train_no_ent, umap_data_val_no_ent, umap_data_test_no_ent, train_patients_no_ent, val_patients_no_ent, test_patients_no_ent, group_col='loss_group', hue_order=hue_order_loss)

## Clustering

In [None]:
from sklearn.cluster import Birch

In [None]:
db = Birch(n_clusters=None)

In [None]:
labels_train = db.fit_predict(umap_data_train)
labels_val = db.predict(umap_data_val)
labels_test = db.predict(umap_data_test)

db = Birch(n_clusters=None)
labels_train_no_other = db.fit_predict(umap_data_train_no_other)
labels_val_no_other = db.predict(umap_data_val_no_other)
labels_test_no_other = db.predict(umap_data_test_no_other)

db = Birch(n_clusters=None)
labels_train_no_ent = db.fit_predict(umap_data_train_no_ent)
labels_val_no_ent = db.predict(umap_data_val_no_ent)
labels_test_no_ent = db.predict(umap_data_test_no_ent)

In [None]:
# Full data: each split gets the labels predicted for its own UMAP projection.
# (Previously labels_val / labels_test were written to the *_no_other /
# *_no_ent train frames, so val_patients and test_patients never received the
# `labels_birch` column that the plots below read.)
train_patients['labels_birch'] = labels_train
val_patients['labels_birch'] = labels_val
test_patients['labels_birch'] = labels_test

# Subset without the "other" author class.
train_patients_no_other['labels_birch'] = labels_train_no_other
val_patients_no_other['labels_birch'] = labels_val_no_other
test_patients_no_other['labels_birch'] = labels_test_no_other

# Subset without the highest-entropy bucket.
train_patients_no_ent['labels_birch'] = labels_train_no_ent
val_patients_no_ent['labels_birch'] = labels_val_no_ent
test_patients_no_ent['labels_birch'] = labels_test_no_ent

In [None]:
hue_order = np.sort(np.unique(labels_train))
plot_group(umap_data_train, umap_data_val, umap_data_test, train_patients, val_patients, test_patients, group_col='labels_birch', hue_order=hue_order)

In [None]:
hue_order = np.sort(np.unique(labels_train_no_other))
plot_group(umap_data_train_no_other, umap_data_val_no_other, umap_data_test_no_other, train_patients_no_other, val_patients_no_other, test_patients_no_other, group_col='labels_birch', hue_order=hue_order)

In [None]:
hue_order = np.sort(np.unique(labels_train_no_ent))
plot_group(umap_data_train_no_ent, umap_data_val_no_ent, umap_data_test_no_ent, train_patients_no_ent, val_patients_no_ent, test_patients_no_ent, group_col='labels_birch', hue_order=hue_order)