In [1]:
from pathlib import Path
import pandas as pd

# Root folders holding the trained checkpoints and the CORD-19 datasets.
artifacts = Path("/media/cumulus/curation_data/modality_classifiers_production/models/cord19/")
data_path = Path("/media/cumulus/curation_data/modality_classifiers_production/data/cord19")


def _node(classifier, classname, *weights, children=()):
    """Build one entry of the classifier hierarchy.

    Args:
        classifier: Name of the classifier.
        classname: Dotted class prefix stored under the ``"classname"`` key
            (empty string for the root entry).
        *weights: Path components of the checkpoint file, relative to ``artifacts``.
        children: Entries nested under this one.

    Returns:
        dict with the keys ``classifier``, ``classname``, ``path`` and ``children``.
    """
    return {
        "classifier": classifier,
        "classname": classname,
        "path": artifacts.joinpath(*weights),
        "children": list(children),
    }


# Tree of classifiers: a root entry whose children (and grandchildren) each
# point at their own checkpoint below ``artifacts``.
classifiers = _node(
    "higher-modality",
    "",
    "higher-modality/efficientnet-b1_higher-modality_0.pt",
    children=[
        _node(
            "experimental",
            "exp",
            "experimental/efficientnet-b1_experimental_0.pt",
            children=[_node("gel", "exp.gel", "gel/efficientnet-b1_gel_0.pt")],
        ),
        _node("graphics", "gra", "graphics/efficientnet-b1_graphics_0.pt"),
        _node(
            "microscopy",
            "mic",
            "microscopy/efficientnet-b0_microscopy_0.pt",
            children=[_node("electron", "mic.ele", "electron/efficientnet-b1_electron_0.pt")],
        ),
        _node("molecular", "mol", "molecular", "efficientnet-b1_molecular_0.pt"),
        _node("radiology", "rad", "radiology", "efficientnet-b0_radiology_0.pt"),
        _node("photography", "pho", "photography/resnet34_photography_0.pt"),
    ],
)



# Despite its name, `model_path` points at the dataset file (a parquet table
# inside `data_path`), which is loaded into the working dataframe.
model_path = data_path.joinpath("cord19_microscopy_v1.parquet")
df = pd.read_parquet(model_path)
# df = df[:10]

In [2]:
from image_modalities_classifier.models.predict import ModalityPredictor, RunConfig, SingleModalityPredictor
from os import cpu_count

# Checkpoint of the microscopy classifier used by the single-modality predictor.
microscopy_weights = artifacts / "microscopy/efficientnet-b0_microscopy_0.pt"

# NOTE(review): the positional RunConfig arguments are presumably
# (batch size, number of workers, device) — confirm against its definition.
config = RunConfig(32, cpu_count(), "cuda:0")
predictor = SingleModalityPredictor(str(microscopy_weights), config)
# predictor = ModalityPredictor(classifiers, config)



In [3]:
# Directory handed to the predictor together with the dataframe.
# NOTE(review): presumably the relative `img_path` values in `df` are resolved
# against this directory — confirm in `SingleModalityPredictor.features`.
# NOTE(review): this is a user-specific absolute path, unlike the
# /media/cumulus roots used for the checkpoints and data — confirm it is intended.
base_img_path = "/home/jtt/Documents/datasets/curation_data/"
# Run the predictor's feature extraction over every row of `df`.
features = predictor.features(df, base_img_path)

In [4]:
# Materialise the extractor output as a list so that each row of the
# dataframe receives one entry, then preview the result.
df["features"] = [*features]
df.head(n=5)

Unnamed: 0,img,img_path,width,height,label,source,caption,is_gt,original,split_set,features
0,1471-2121-3-29-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,541.0,mic.flu,clef13,Fluorescence microscopy of a S2 cell transfect...,True,DMFL,TRAIN,"[-0.004090134, -0.1427809, -0.1732117, -0.1795..."
1,1471-2172-7-1-4.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,600.0,mic.flu,clef13,Distribution of Bu-1 + cells . PALT is seen b...,True,DMFL,TRAIN,"[0.7017001, -0.077621244, -0.13674453, -0.0627..."
2,1471-2180-10-283-1.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,444.0,mic.flu,clef13,Merged image of the phase contrast and fluores...,True,DMFL,TRAIN,"[-0.11191165, -0.17719942, -0.15846144, -0.137..."
3,1471-2180-5-17-9.jpg,subfigure-classification/2013/train/DMFL/1471-...,552.0,548.0,mic.flu,clef13,"An epifluorescence image of Thermus aquaticus .,",True,DMFL,TRAIN,"[1.7194173, -0.08525187, -0.12554392, -0.05411..."
4,1471-2199-11-11-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,576.0,mic.flu,clef13,Fluorescence microscopy of tagged-NrdB cells ....,True,DMFL,TRAIN,"[0.18778822, -0.09744128, -0.14088048, -0.1344..."


In [5]:
# Build a fresh predictor before classifying: per the original author's note,
# running the feature extractor affects the loaded model, so the instance
# used for `features` is not reused for predictions.
microscopy_checkpoint = artifacts / "microscopy/efficientnet-b0_microscopy_0.pt"
predictor = SingleModalityPredictor(str(microscopy_checkpoint), config)
outputs = predictor.predict_with_probs(df, base_img_path)
predictions, probabilities = outputs



In [8]:
# Attach the predicted labels and the probability vectors to the dataframe,
# then preview the result.
for column, values in (("prediction", predictions), ("probs", probabilities)):
    df[column] = values
df.head(n=5)

Unnamed: 0,img,img_path,width,height,label,source,caption,is_gt,original,split_set,features,prediction,probs
0,1471-2121-3-29-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,541.0,mic.flu,clef13,Fluorescence microscopy of a S2 cell transfect...,True,DMFL,TRAIN,"[-0.004090134, -0.1427809, -0.1732117, -0.1795...",mic.flu,"[0.001480210805311799, 0.9981943964958191, 0.0..."
1,1471-2172-7-1-4.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,600.0,mic.flu,clef13,Distribution of Bu-1 + cells . PALT is seen b...,True,DMFL,TRAIN,"[0.7017001, -0.077621244, -0.13674453, -0.0627...",mic.flu,"[2.457024311297573e-05, 0.999975323677063, 5.4..."
2,1471-2180-10-283-1.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,444.0,mic.flu,clef13,Merged image of the phase contrast and fluores...,True,DMFL,TRAIN,"[-0.11191165, -0.17719942, -0.15846144, -0.137...",mic.flu,"[0.08060691505670547, 0.873210072517395, 0.046..."
3,1471-2180-5-17-9.jpg,subfigure-classification/2013/train/DMFL/1471-...,552.0,548.0,mic.flu,clef13,"An epifluorescence image of Thermus aquaticus .,",True,DMFL,TRAIN,"[1.7194173, -0.08525187, -0.12554392, -0.05411...",mic.flu,"[2.004333810035064e-09, 1.0, 6.101986537582205..."
4,1471-2199-11-11-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,576.0,mic.flu,clef13,Fluorescence microscopy of tagged-NrdB cells ....,True,DMFL,TRAIN,"[0.18778822, -0.09744128, -0.14088048, -0.1344...",mic.flu,"[0.013884089887142181, 0.9833231568336487, 0.0..."


In [17]:
import numpy as np

def cal_margin_sampling(y_pred_prob):
    """Return the gap between the two largest probabilities of every row.

    Args:
        y_pred_prob: 2-D array-like, one row of class probabilities per sample.

    Returns:
        numpy array with one column holding ``largest - second largest`` for
        each row (shape ``(n_samples, 1)`` when there are at least two classes).
    """
    descending = np.sort(y_pred_prob)[:, ::-1]
    top_two = descending[:, :2]
    # diff of the negated pair is (-second) - (-first) == first - second
    return np.diff(-top_two)

def calc_entropy(y_pred_prob):
    """Return the Shannon entropy (natural log) of every row of probabilities.

    Entries that are not strictly positive (zeros, negatives, NaN) contribute
    nothing to the sum, which matches the previous ``nansum`` behaviour. Unlike
    the previous version, the logarithm is only evaluated on positive entries,
    so exact zeros no longer trigger divide-by-zero / invalid-value warnings.

    Args:
        y_pred_prob: 2-D array-like, one row of class probabilities per sample.

    Returns:
        1-D numpy array with ``-sum(p * log(p))`` over axis 1 for each row.
    """
    probs = np.asarray(y_pred_prob)
    if not np.issubdtype(probs.dtype, np.floating):
        # np.log needs a floating output buffer
        probs = probs.astype(float)

    positive = probs > 0

    # log(p) only where p > 0; the rest of the buffer stays at 0
    log_probs = np.zeros_like(probs)
    np.log(probs, out=log_probs, where=positive)

    # p * log(p) only where p > 0, so NaN inputs never reach the sum
    terms = np.zeros_like(probs)
    np.multiply(probs, log_probs, out=terms, where=positive)

    return -np.sum(terms, axis=1)

In [13]:
# Stack the per-row probability vectors into a single 2-D array
# (one row per dataframe row).
probs = np.vstack(df["probs"].tolist())

In [21]:
# `cal_margin_sampling` returns a column vector of shape (n, 1); flatten it so
# the column assignment receives a plain 1-D array instead of relying on
# pandas accepting a single-column 2-D array.
df["ms"] = np.ravel(cal_margin_sampling(probs))

In [22]:
# Entropy of each row's probability vector, stored next to the margin score.
row_entropy = calc_entropy(probs)
df["en"] = row_entropy

In [24]:
# Preview the first rows, now including the `ms` and `en` columns.
df.head(n=5)

Unnamed: 0,img,img_path,width,height,label,source,caption,is_gt,original,split_set,features,prediction,probs,ms,en
0,1471-2121-3-29-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,541.0,mic.flu,clef13,Fluorescence microscopy of a S2 cell transfect...,True,DMFL,TRAIN,"[-0.004090134, -0.1427809, -0.1732117, -0.1795...",mic.flu,"[0.001480210805311799, 0.9981943964958191, 0.0...",0.996714,0.01406192
1,1471-2172-7-1-4.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,600.0,mic.flu,clef13,Distribution of Bu-1 + cells . PALT is seen b...,True,DMFL,TRAIN,"[0.7017001, -0.077621244, -0.13674453, -0.0627...",mic.flu,"[2.457024311297573e-05, 0.999975323677063, 5.4...",0.999951,0.0002863747
2,1471-2180-10-283-1.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,444.0,mic.flu,clef13,Merged image of the phase contrast and fluores...,True,DMFL,TRAIN,"[-0.11191165, -0.17719942, -0.15846144, -0.137...",mic.flu,"[0.08060691505670547, 0.873210072517395, 0.046...",0.792603,0.4633903
3,1471-2180-5-17-9.jpg,subfigure-classification/2013/train/DMFL/1471-...,552.0,548.0,mic.flu,clef13,"An epifluorescence image of Thermus aquaticus .,",True,DMFL,TRAIN,"[1.7194173, -0.08525187, -0.12554392, -0.05411...",mic.flu,"[2.004333810035064e-09, 1.0, 6.101986537582205...",1.0,4.030027e-08
4,1471-2199-11-11-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,576.0,mic.flu,clef13,Fluorescence microscopy of tagged-NrdB cells ....,True,DMFL,TRAIN,"[0.18778822, -0.09744128, -0.14088048, -0.1344...",mic.flu,"[0.013884089887142181, 0.9833231568336487, 0.0...",0.969439,0.09234295


In [25]:
from sklearn.decomposition import PCA

# Fixed seed so the 2-D projection is reproducible between runs.
random_state = 42

# One row per sample: stack the stored feature vectors into a matrix.
feature_matrix = np.vstack(df["features"].tolist())

pca = PCA(n_components=2, random_state=random_state)
pca.fit(feature_matrix)

In [29]:
# Project every feature vector onto the two fitted principal components and
# keep the coordinates as separate columns.
embeddings_pca = pca.transform(np.vstack(df["features"].tolist()))
df["x_pca"] = embeddings_pca[:, 0]
df["y_pca"] = embeddings_pca[:, 1]

In [33]:
from sklearn.neighbors import NearestNeighbors

def calc_neighborhood_hit(df, x_col, y_col, n_neighbors=6, column_label='label'):
    """Compute the neighborhood hit of every point of a 2-D projection.

    For each row, the ``n_neighbors`` nearest *other* rows in the
    ``(x_col, y_col)`` plane are looked up, and the fraction of them whose
    label equals the row's own label is returned.

    Args:
        df: DataFrame holding the projection coordinates and the labels.
        x_col: Name of the column with the first coordinate.
        y_col: Name of the column with the second coordinate.
        n_neighbors: Number of neighbors inspected per row.
        column_label: Name of the column with the labels to compare.

    Returns:
        list of floats in ``[0, 1]``, one per row, in the row order of ``df``.

    Note:
        ``df`` needs more than ``n_neighbors`` rows; otherwise the
        scikit-learn query is expected to raise (TODO confirm exception type).
    """
    points = df[[x_col, y_col]].to_numpy()
    labels = df[column_label].to_numpy()

    neigh = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(points)

    # Querying without X makes scikit-learn leave each point out of its own
    # neighborhood. The previous version queried n_neighbors + 1 points and
    # assumed the first result was the query point itself, which does not hold
    # when several rows share the same coordinates.
    neighbor_idx = neigh.kneighbors(n_neighbors=n_neighbors, return_distance=False)

    # Compare every neighbor label with the row's own label in one shot
    # instead of slicing the dataframe once per row.
    hits = (labels[neighbor_idx] == labels[:, None]).mean(axis=1)
    return hits.tolist()

In [36]:
# from sklearn.preprocessing import LabelEncoder
# predictor.model.hparams

# Neighborhood hit of every row in the PCA plane, using its 6 nearest neighbors.
df["hits"] = calc_neighborhood_hit(
    df,
    x_col="x_pca",
    y_col="y_pca",
    n_neighbors=6,
    column_label="label",
)


In [37]:
# Preview the first rows, now including the PCA coordinates and `hits`.
df.head(n=5)

Unnamed: 0,img,img_path,width,height,label,source,caption,is_gt,original,split_set,features,prediction,probs,ms,en,x_pca,y_pca,hits
0,1471-2121-3-29-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,541.0,mic.flu,clef13,Fluorescence microscopy of a S2 cell transfect...,True,DMFL,TRAIN,"[-0.004090134, -0.1427809, -0.1732117, -0.1795...",mic.flu,"[0.001480210805311799, 0.9981943964958191, 0.0...",0.996714,0.01406192,4.251132,-1.009367,1.0
1,1471-2172-7-1-4.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,600.0,mic.flu,clef13,Distribution of Bu-1 + cells . PALT is seen b...,True,DMFL,TRAIN,"[0.7017001, -0.077621244, -0.13674453, -0.0627...",mic.flu,"[2.457024311297573e-05, 0.999975323677063, 5.4...",0.999951,0.0002863747,13.939093,-2.682397,1.0
2,1471-2180-10-283-1.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,444.0,mic.flu,clef13,Merged image of the phase contrast and fluores...,True,DMFL,TRAIN,"[-0.11191165, -0.17719942, -0.15846144, -0.137...",mic.flu,"[0.08060691505670547, 0.873210072517395, 0.046...",0.792603,0.4633903,-0.888648,-0.57051,0.666667
3,1471-2180-5-17-9.jpg,subfigure-classification/2013/train/DMFL/1471-...,552.0,548.0,mic.flu,clef13,"An epifluorescence image of Thermus aquaticus .,",True,DMFL,TRAIN,"[1.7194173, -0.08525187, -0.12554392, -0.05411...",mic.flu,"[2.004333810035064e-09, 1.0, 6.101986537582205...",1.0,4.030027e-08,28.581532,-1.882622,1.0
4,1471-2199-11-11-2.jpg,subfigure-classification/2013/train/DMFL/1471-...,600.0,576.0,mic.flu,clef13,Fluorescence microscopy of tagged-NrdB cells ....,True,DMFL,TRAIN,"[0.18778822, -0.09744128, -0.14088048, -0.1344...",mic.flu,"[0.013884089887142181, 0.9833231568336487, 0.0...",0.969439,0.09234295,1.939335,-1.219068,1.0
