In [9]:
!pip install fastai==1.0.61
!pip install timm



## Library & Framework

This project using fastaiv1

In [10]:
from fastai.vision import *
from fastai.callbacks.hooks import *
from fastprogress import progress_bar

from PIL import Image
import os
from glob import glob
import cv2
from tqdm import tqdm
import multiprocessing as mp
import pydicom
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import sklearn

from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from torchvision import transforms
import timm
import pandas as pd
import numpy as np


from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import torch
from torchvision import transforms

Variable config as a hyper-parametters object

In [11]:
config = {}

config['project_path'] = '/kaggle/input/rsna-pneumonia-detection-challenge'
config['img_input'] = os.path.join(config['project_path'],'stage_2_train_images')
config['working'] = '/kaggle/working'


config['size'] = 256

config['batch_size'] = 16

config['num_workers'] = 8
config['num_classes'] = 2
config['epochs'] = 1
config['lr'] = 5e-4
config['sch_lr'] = 0.9

config['pool'] = 6500

config['nb_iterations'] = 5

config['num_gist'] = 200

config['initial_decay_rate'] = 0.2
config['decay_rate'] = 0.2
config['thresh'] = None

config['discrim_ratio'] = 0.2

config['muy'] = 0.5

config['ite_momentum'] = 5

config['thres_momentum'] = 0.8

config['valid_ratio'] = 0.1

In [12]:
pd.options.mode.chained_assignment = None  # default='warn'

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Utils functions

In [13]:
def prepare_working(config):
    if os.path.isdir(os.path.join(config['working'],'used')):
        shutil.rmtree(os.path.join(config['working'],'used'))
        os.mkdir(os.path.join(config['working'],'used'))
    else:
        os.mkdir(os.path.join(config['working'],'used'))
        
    if os.path.isdir(os.path.join(config['working'],'unused')):
        shutil.rmtree(os.path.join(config['working'],'unused'))
        os.mkdir(os.path.join(config['working'],'unused'))
    else:
        os.mkdir(os.path.join(config['working'],'unused'))

    if os.path.isdir(os.path.join(config['working'],'pool')):
        shutil.rmtree(os.path.join(config['working'],'pool'))
        os.mkdir(os.path.join(config['working'],'pool'))
    else:
        os.mkdir(os.path.join(config['working'],'pool'))

    if os.path.isdir(os.path.join(config['working'],'test')):
        shutil.rmtree(os.path.join(config['working'],'test'))
        os.mkdir(os.path.join(config['working'],'test'))
    else:
        os.mkdir(os.path.join(config['working'],'test'))
        
    print('Prepare Working directory Done')

def image_resize(image, width = None, height = None, inter = cv2.INTER_AREA):
    # initialize the dimensions of the image to be resized and
    # grab the image size
    dim = None
    (h, w) = image.shape[:2]

    # if both the width and height are None, then return the
    # original image
    if width is None and height is None:
        return image

    # check to see if the width is None
    if width is None:
        # calculate the ratio of the height and construct the
        # dimensions
        r = height / float(h)
        dim = (int(w * r), height)

    # otherwise, the height is None
    else:
        # calculate the ratio of the width and construct the
        # dimensions
        r = width / float(w)
        dim = (width, int(h * r))

    # resize the image
    resized = cv2.resize(image, dim, interpolation = inter)

    # return the resized image
    return resized


def convert_dcm_png(zip_info, fold_dcm = config['img_input'], img_size = config['size']):

    fname, dest = zip_info
    
    path_dcm = os.path.join(fold_dcm, fname + '.dcm')
    
    dicom = pydicom.read_file(path_dcm)
    data = dicom.pixel_array
    
    data = data - np.min(data)
    if np.max(data) != 0:
        data = data / np.max(data)
    
    data = (data * 255).astype(np.uint8)
    
    img_rs = image_resize(data,img_size)
    
    path_png = os.path.join(dest,fname + '.png')
    
    cv2.imwrite(path_png,img_rs)    
    

def load_checkpoint(filepath):
    
    checkpoint = torch.load(filepath)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['model_state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False
    model.eval()
    return model



class poolDataset(Dataset):
    def __init__(self, dir_path, df_pool, transforms=None):
        """
        param:
            dir_path: is path of images
            df_pool: is dataframe of pool            
            transform:
            
        return:
            
        """
        self.transforms = transforms

        self.dir_path = dir_path

        self.imgs = list(df_pool['fname'])
        
    def __getitem__(self, item):
        
        img_path = os.path.join(self.dir_path, self.imgs[item])
        img = Image.open(img_path).convert('RGB')
        
        if self.transforms is not None:
            img = self.transforms(img)
        return img,self.imgs[item]
    
    def __len__(self):
        return len(self.imgs)
    
def entropy_rank(pred):
    en = np.zeros(len(pred))

    for i in range(0, len(pred)):
        en[i] = sum(-pred[i] * np.log(pred[i]))

    return np.argsort(en), en


def uncertain_set(en, nb_annotations):
    return en[0:nb_annotations]


def certain_set(en, thresh, initial_decay_rate, decay_rate):
    # Threshold updating <-- review
    if thresh == None:
        thresh = max(en) - ((max(en) - min(en)) * initial_decay_rate)
    else:
        thresh = thresh + (max(en) - thresh) * decay_rate

    return np.where(en < thresh)[0], thresh


def predictions_max_class(predictions):
    return np.argmax(predictions,axis=1)


def pseudo_label_error(pseudo_samples, true_samples):
    aux = 0
    for i in range(0, len(pseudo_samples)):
        if (pseudo_samples[i] == true_samples[i]).sum() != len(true_samples[i]):
            aux += 1
    return aux / len(true_samples)

def learn_get_preds(model, device, data_loader):

    confidences = []
    indices = []
    
    model.eval()
    
    with torch.no_grad():
        for batch in progress_bar(data_loader):
        
            imgs, fnames = batch
            
            imgs = imgs.to(device)
            
            logits = model(imgs)
            
            probabilities = torch.sigmoid(logits)
            
            confidences.extend(probabilities.cpu().tolist())
            
            indices.extend(fnames)
            
    conf = np.asarray(confidences)
    ind = np.asarray(indices)
    return conf, ind

def multi_process_copy_img(df):
    
    pool = mp.Pool(8)
    
    fnames = list(df['patientId'])
    dest = list(df['path'])

    zip_info = list(zip(fnames,dest))

    for _ in tqdm(pool.imap_unordered(convert_dcm_png,zip_info),total=len(zip_info)):
        pass 
    
def feature_extractor(learn, dataloader, is_train = True):
    
#     backbone = nn.Sequential(*list(list(learn.model.module[0].children())[:-1]))
#     backbone = nn.Sequential(*list(list(learn.model.module.children())[0][0].children())[:-1])
    backbone = nn.Sequential(*list(list(learn.model.module.children())[0].children())[:-1])
    
    list_features = []
    list_categories = []
    
    list_fnames = []
    
    if is_train:
        for x, y in progress_bar(dataloader):
            x, y = x.cuda(), y.cuda()
            backbone = backbone.cuda()

            feature = backbone(x).detach().cpu()
            list_features.append(feature)
            list_categories.append(y.detach().cpu())

        array_features = np.concatenate(list_features,axis=0)
        array_categories = np.concatenate(list_categories,axis=0)
        
        return array_features, array_categories
    
    else:
        for x,fname in progress_bar(dataloader):
            x  = x.cuda()
            backbone = backbone.cuda()

            feature = backbone(x).detach().cpu()
            
            list_features.append(feature)
            list_fnames.append(fname)
            
        array_features = np.concatenate(list_features,axis=0)
        array_fnames = np.concatenate(list_fnames,axis=0)
        
        return array_features, array_fnames
    
def query_update(df, fnames, labels):
    
    ## Update df_concat & move image from pool --> used
    
    for fname, label in tqdm(zip(fnames, labels),total = len(fnames)):

        df.loc[df['fname'] == fname, 'target'] = label
        df.loc[df['fname'] == fname, 'is_valid'] = False
        df.loc[df['fname'] == fname, 'is_anno'] = True
        
        move_source = os.path.join(config['working'],'pool',fname)
        move_dest = os.path.join(config['working'],'used',fname)
        
        shutil.move(move_source,move_dest)
        
        df.loc[df['fname'] == fname, 'path'] = os.path.join(config['working'],'used')
    
    print('Pseudo-labeling {} instances: Done'.format(len(fnames)))
    
    return df

def generate_gist(uncertain_samples):
    
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    
    df_gist = pd.DataFrame([])

    df_gist['fname'] = uncertain_samples
    
    pool_transforms = transforms.Compose([
                        transforms.Resize(config['size'],config['size']),
                        transforms.ToTensor(),
                        transforms.Normalize(mean, std)
                        ])

    gist_ds = poolDataset(
                        os.path.join(config['working'],'pool'),
                        df_gist,
                        pool_transforms
                        )

    dls_gist = DataLoader(
                        gist_ds, 
                        batch_size=config['batch_size'], 
                        num_workers=config['num_workers']
                        )
    return dls_gist


def gist_clustering(feature_gist, fnames_gist, feature_train, categories_train, discrim_ratio=config['discrim_ratio']):
    
    pos_idx = np.where(categories_train == 1)[0]
    neg_idx = np.where(categories_train == 0)[0]

    pos_feature = feature_train[pos_idx]
    neg_feature = feature_train[neg_idx]

    
    all_vectors = np.concatenate([pos_feature, neg_feature])
    
    kmeans = KMeans(n_clusters = 2 ,random_state=0).fit(all_vectors)
    labels = kmeans.predict(all_vectors)
    
    centroids = kmeans.cluster_centers_
    
    gist_distances= []
    
    pos_centroid = np.mean(pos_feature,axis=0)
    
    neg_centroid = np.mean(neg_feature,axis=0)
    
    ## verify centroids
    
    verify_centroids = {}
    
    if np.linalg.norm(pos_centroid-centroids[0]) < np.linalg.norm(pos_centroid-centroids[1]):
        pos_centroid = centroids[0]
        neg_centroid = centroids[1]
    else:
        pos_centroid = centroids[1]
        neg_centroid = centroids[0]
    
    kmeans_preds_class = kmeans.predict(feature_gist)
    
    dist_between_centroids = np.linalg.norm(neg_centroid - pos_centroid)
        
    decision_boundary = (dist_between_centroids/2)*discrim_ratio
    
    kmeans_preds_score = []
    
    for g in tqdm(feature_gist,total = feature_gist.shape[0]):
        dist_neg = np.linalg.norm(g - neg_centroid)
        dist_pos = np.linalg.norm(g - pos_centroid)
        
#         kmeans_preds_score.append((dist_neg,dist_pos))
        kmeans_preds_score.append(np.linalg.norm(dist_neg-dist_pos))
    
    kmeans_preds_score = np.array(kmeans_preds_score)
    
    fnames_array = np.array(fnames_gist)
    
#     indices_db = np.where(np.min(kmeans_preds_score,axis=1) <= decision_boundary)[0]

    indices_db = np.where(kmeans_preds_score > decision_boundary)[0]
    
    preds_class_db = kmeans_preds_class[indices_db]
    
    preds_score_db = kmeans_preds_score[indices_db]
    
    fnames_db = fnames_array[indices_db]

    return fnames_db.tolist(), preds_class_db.tolist(), kmeans_preds_score

def moving_avg(last_conf , curr_conf, muy = config['muy']):
    return muy*last_conf + (1-muy)*curr_conf

def move_to_unused(df, fnames):
    
    for fname in tqdm(fnames ,total = len(fnames)):

        df.loc[df['fname'] == fname, 'is_used'] = False
        
        move_source = os.path.join(config['working'],'used',fname)
        move_dest = os.path.join(config['working'],'unused',fname)
        
        shutil.move(move_source,move_dest)
        
        df.loc[df['fname'] == fname, 'path'] = os.path.join(config['working'],'used')
    
    print('Remove {} low confidence instances: Done'.format(len(fnames)))
    
    return df

def online_active_learning(df, 
                learn, 
                iterations = config['ite_momentum'], 
                thres = config['thres_momentum']):

    learn.unfreeze()
    
    print('Current iteration train: {0} neg / {1} pos'.format(
            learn.data.train_ds.y.items.tolist().count(0),
            learn.data.train_ds.y.items.tolist().count(1))
         )
    
    for epoch in range(config['ite_momentum']):
        print('Momentum at epoch: {}'.format(str(epoch)))
        learn.fit_one_cycle(config['epochs'], slice(config['lr']), pct_start=config['sch_lr'])
        preds_train = learn.get_preds(learn.data.train_dl)
        curr_conf = preds_train[0]
        if epoch != 0:
            moment_conf = moving_avg(last_conf, curr_conf)
            last_conf = moment_conf   
        else:
            last_conf = curr_conf      
            
            
    last_conf = np.array(last_conf)
    fnames_train = np.array([item.split('/')[-1] for item in learn.data.train_ds.x.items.tolist()])
    indices_low_conf = np.where(np.max(last_conf, axis=1) <= thres)[0]
    fnames_low_conf = fnames_train[indices_low_conf].tolist()
    df = move_to_unused(df, fnames_low_conf)
    dls_anno, dls_pool = create_dataloader(df)
    
    learn = Learner(data=dls_anno,                  
                    model=learn.model,
                    metrics=FBeta(beta=1))
    
    
    return df, learn

def gist_selection(X_gist_set, learn):
    
    dls_gist = generate_gist(X_gist_set)
            
    feature_train, categories_train = feature_extractor(learn, learn.data.train_dl.dl)

    feature_gist, fnames_gist = feature_extractor(learn, dls_gist, is_train=False)

    X_gist, y_gist, gist_sdb = gist_clustering(
                    feature_gist, 
                    fnames_gist, 
                    feature_train, 
                    categories_train)
    
    return X_gist, y_gist, gist_sdb

## Prepare Dataframe

In [14]:
def prepare_dataframe(config):
    
    df_ = pd.read_csv(os.path.join(config['project_path'],'stage_2_detailed_class_info.csv')).drop_duplicates(subset='patientId').reset_index(drop=True)
    
    df_['target'] = [1 if item == 'Lung Opacity' else 0 for item in list(df_['class'])]
    
    df_full, df_anno_valid = train_test_split(df_, test_size = config['valid_ratio'],random_state=0)

#     df_anno_full = df_full[(df_full['class']=='Normal')|(df_full['class']=='Lung Opacity')]
    
    ## fixed bug 
    
    df_anno_full, df_not_anno_full = train_test_split(df_full, test_size = config['anno_ratio'],random_state=0)
    
#     df_anno_full['target'] = [1 if item == 'Lung Opacity' else 0 for item in list(df_anno_full['class'])]
#     df_not_anno_full = df_full[df_full['class']=='No Lung Opacity / Not Normal']

    df_not_anno_full['target'] = 0

#     df_anno_used = df_anno_full.sample(config['used']).reset_index(drop=True)
    
    df_anno_used, _ = train_test_split(df_anno_full,test_size = (1 - config['used']/len(df_anno_full)))
    
    df_anno_train = df_anno_used.reset_index(drop=True)
    
#     df_anno_test = df_anno_full.sample(config['test']).reset_index(drop=True)

#     df_not_anno = df_not_anno_full.sample(config['pool']).reset_index(drop=True)
    df_not_anno = df_not_anno_full

#     df_anno_test = df_anno_test[~df_anno_test['patientId'].isin(df_anno_used['patientId'])]
#     df_anno_test['path'] = os.path.join(config['working'],'test')

#     df_anno_train, df_anno_valid = train_test_split(df_anno_used, test_size=0.2,random_state=0)

    df_anno_train['is_valid'] = False
    df_anno_train['is_used'] = True
    df_anno_train['is_anno'] = True
    df_anno_train['path'] = os.path.join(config['working'],'used')

    df_anno_valid['is_valid'] = True
    df_anno_valid['is_used'] = True
    df_anno_valid['is_anno'] =True
    df_anno_valid['path'] = os.path.join(config['working'],'used')

#     df_anno_test['is_valid'] = True
#     df_anno_test['is_used'] = True
#     df_anno_test['is_anno'] =True
#     df_anno_test['path'] = os.path.join(config['working'],'test')

    df_not_anno['is_valid'] = False
    df_not_anno['is_used'] = True
    df_not_anno['is_anno'] = False
    df_not_anno['path'] = os.path.join(config['working'],'pool')

    df_concat = pd.concat([df_anno_train,df_anno_valid,df_not_anno]).reset_index(drop=True)
    
    df_concat['fname'] = [item + '.png' for item in list(df_concat['patientId'])]
    
#     df_anno_test['fname'] = [item + '.png' for item in list(df_anno_test['patientId'])]


    ## copy image from Input to Working
    
    multi_process_copy_img(df_concat)
    
#     zip_concat = list(zip(list(df_concat['patientId']),list(df_concat['path'])))
#     for item in tqdm(zip_concat):
#         convert_dcm_png(item)

#     zip_test = list(zip(list(df_anno_test['patientId']),list(df_anno_test['path'])))
    
#     for item in tqdm(zip_test):
#         convert_dcm_png(item)
    
    print('Preparing Dataframe Done')
    
#     return df_concat, df_anno_test
    return df_concat

## Transform Augmentation & Create Dataloader

In [15]:
# def create_dataloader(df_concat, df_anno_test):
def create_dataloader(df_concat):

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    pool_transforms = transforms.Compose([
                            transforms.Resize(config['size'],config['size']),
                            transforms.ToTensor(),
                            transforms.Normalize(mean, std)
                            ])

    ex_p = 0.1

    extra = [*rand_resize_crop(
                config['size'], 
                max_scale=1.1),
             squish(
                scale=(0.975, 1.2), 
                p=ex_p),
             tilt(
                direction=(0, 3), 
                magnitude=(-0.3, 0.3), 
                p=ex_p),
             cutout(
                n_holes=(1, 10), 
                length=(3, 10), 
                p=ex_p*2)
            ]

    anno_transforms = get_transforms(
                max_rotate=15, 
                max_zoom=1.1, 
                max_lighting=0.2, 
                max_warp=0.15, 
                p_affine=0.5, 
                p_lighting=0.4, 
                xtra_tfms=extra)

    anno_transforms = list(anno_transforms)
    
    anno_transforms[1] = []
    
    dls_anno = (
                ImageList.from_df(
                    df=df_concat[
                        (df_concat['is_anno']==True)&(df_concat['is_used']==True)
                                ].reset_index(drop=True), 
                    path=os.path.join(config['working'],'used'), 
                    cols='fname')
                .split_from_df(col='is_valid')
                .label_from_df(cols=['target'], label_cls=CategoryList)
                .transform(anno_transforms, size=config['size'])
                .databunch(bs=config['batch_size'], num_workers=config['num_workers'])
                .normalize(imagenet_stats)
                )
    
#     df_pool_uniform = df_concat[df_concat['is_anno']==False].sample(config['pool']).reset_index(drop=True)

    df_pool = df_concat[df_concat['is_anno']==False]
    
    df_pool_uniform, _ = train_test_split(df_pool,test_size = (1 - config['pool']/len(df_pool)))
    
    df_pool_uniform = df_pool_uniform.reset_index(drop=True)

    pool_ds = poolDataset(
                    os.path.join(config['working'],'pool'),
                    df_pool_uniform,
                    pool_transforms
                    )

    dls_pool = DataLoader(
                    pool_ds, 
                    batch_size=config['batch_size'], 
                    num_workers=config['num_workers']
                    )
    
    print('Train {} samples'.format(str(dls_anno.train_ds.x.items.shape[0])))
    print('Valid {} samples'.format(str(dls_anno.valid_ds.x.items.shape[0])))
    print('Pool {} samples'.format(str(len(dls_pool.dataset.imgs))))
    
    return dls_anno, dls_pool

In [16]:
for ratio in sorted(list(np.array(list(range(3,10)))*0.1),reverse=True):

    config['used'] = int((1-ratio)*24000)

    config['anno_ratio'] = ratio

    prepare_working(config)

    config['thresh'] = 0.2

    print('='*80)
    print('Experiments with {} samples annotation'.format(str(config['used'])))

#     df_concat, df_anno_test = prepare_dataframe(config)

    df_concat = prepare_dataframe(config)

    fnames, labels = [], [] 

    for ite in range(config['nb_iterations']):

        if len(df_concat[df_concat['is_anno']==False]) != 0:

            print('*'*10)
            print('Iteration: {}'.format(str(ite)))

            df_concat = query_update(df_concat, fnames, labels)

            ## each iterations samples 6500 instances from the pool
    
            dls_anno, dls_pool = create_dataloader(df_concat)

            ## create Learner with dls_anno & model
            ## training with momentum Online Active learning
            
            if ite == 0:
                model = nn.Sequential(timm.create_model('resnext50_32x4d',pretrained=True,num_classes=config['num_classes']))
                learn = Learner(data=dls_anno,                  
                                model=model,
                                metrics=FBeta(beta=1))
                learn.model = nn.DataParallel(learn.model, device_ids=[0])
                
            else:
                learn = Learner(data=dls_anno,                  
                                model=learn.model,
                                metrics=FBeta(beta=1))
                
            df_concat, learn = online_active_learning(df_concat,learn)

            ## Query stratifies using pool dataloader

            confidence_scores, X_unlabeled = learn_get_preds(
                                model=learn.model,
                                device=device, 
                                data_loader=dls_pool
                              )

            predictions_rank, entropies = entropy_rank(confidence_scores)

            print('Entropy threshold for sampling: {}'.format(str(config['thresh'])))

            idx_certain_samples, config['thresh'] = certain_set(entropies, 
                            config['thresh'], 
                            config['initial_decay_rate'], 
                            config['decay_rate'])

            print('Certain {} instances'.format(str(idx_certain_samples.shape[0])))
            
            if idx_certain_samples.shape[0] != 0:
                ## Sampling high confidence score as certain instances

                X_certain = X_unlabeled[idx_certain_samples].tolist()

                X_gist, y_gist, gist_sdb = gist_selection(X_certain, learn)

                fnames = X_gist
                labels = y_gist

                print("Updating samples into Trainset: {0} neg / {1} pos".format(str(labels.count(0)),str(labels.count(1))))

                if labels.count(0) == labels.count(1) and labels.count(0) == 0:
                    break
            else:
                break
        else:
            break

Prepare Working directory Done
Experiments with 2399 samples annotation


100%|██████████| 26681/26681 [07:16<00:00, 61.19it/s]
0it [00:00, ?it/s]


Preparing Dataframe Done
**********
Iteration: 0
Pseudo-labeling 0 instances: Done
Train 2398 samples
Valid 2669 samples
Pool 6499 samples
Current iteration train: 1846 neg / 552 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.444713,0.43996,0.469565,01:11


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.423251,0.418104,0.560766,01:11


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.413282,0.475472,0.405213,01:10


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.395548,0.39292,0.534031,01:11


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.404933,0.428122,0.582369,01:10


100%|██████████| 748/748 [00:07<00:00, 105.93it/s]


Remove 748 low confidence instances: Done
Train 1650 samples
Valid 2669 samples
Pool 6499 samples


Entropy threshold for sampling: 0.2
Certain 2112 instances


100%|██████████| 2112/2112 [00:00<00:00, 29229.08it/s]
  3%|▎         | 6/223 [00:00<00:03, 59.46it/s]

Updating samples into Trainset: 0 neg / 223 pos
**********
Iteration: 1


100%|██████████| 223/223 [00:04<00:00, 54.20it/s]


Pseudo-labeling 223 instances: Done
Train 1873 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 1461 neg / 412 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.46695,0.829553,0.326019,00:58


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.461651,0.434957,0.278581,00:58


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.450697,0.534058,0.588798,00:58


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.444885,0.421354,0.545455,00:58


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.457755,0.467542,0.515522,00:58


100%|██████████| 847/847 [00:07<00:00, 116.12it/s]


Remove 847 low confidence instances: Done
Train 1026 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.3053207639590883
Certain 1 instances


100%|██████████| 1/1 [00:00<00:00, 2774.01it/s]


Updating samples into Trainset: 0 neg / 0 pos
Prepare Working directory Done
Experiments with 4799 samples annotation


100%|██████████| 26679/26679 [06:05<00:00, 72.99it/s]
0it [00:00, ?it/s]


Preparing Dataframe Done
**********
Iteration: 0
Pseudo-labeling 0 instances: Done
Train 4798 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 3706 neg / 1092 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.412095,0.37561,0.58309,02:08


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.448491,0.412813,0.55687,02:09


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.414341,0.394462,0.579817,02:08


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.440999,0.383883,0.533333,02:08


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.413364,0.427931,0.380265,02:08


100%|██████████| 1703/1703 [00:14<00:00, 115.17it/s]


Remove 1703 low confidence instances: Done
Train 3095 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.2
Certain 2 instances


100%|██████████| 2/2 [00:00<00:00, 6026.30it/s]
100%|██████████| 2/2 [00:00<00:00, 55.80it/s]


Updating samples into Trainset: 0 neg / 2 pos
**********
Iteration: 1
Pseudo-labeling 2 instances: Done
Train 3097 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 2827 neg / 270 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.259376,0.503973,0.0,01:27


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.239118,0.522251,0.009709,01:27


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.268951,0.477978,0.059375,01:27


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.284152,0.438922,0.306533,01:28


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.27577,0.455515,0.444191,01:27


100%|██████████| 159/159 [00:01<00:00, 113.16it/s]


Remove 159 low confidence instances: Done
Train 2938 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.3070630794069518
Certain 174 instances


100%|██████████| 174/174 [00:00<00:00, 25075.90it/s]
  4%|▎         | 6/161 [00:00<00:02, 59.84it/s]

Updating samples into Trainset: 161 neg / 0 pos
**********
Iteration: 2


100%|██████████| 161/161 [00:02<00:00, 56.21it/s]


Pseudo-labeling 161 instances: Done
Train 3099 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 2905 neg / 194 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.233126,0.590711,0.0,01:27


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.229968,0.535656,0.0,01:27


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.255665,0.622726,0.0,01:27


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.239977,0.665966,0.0,01:27


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.209759,0.567312,0.0,01:28


100%|██████████| 1/1 [00:00<00:00, 57.15it/s]


Remove 1 low confidence instances: Done
Train 3098 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.3874054637662085
Certain 2455 instances


100%|██████████| 2455/2455 [00:00<00:00, 26561.77it/s]
  0%|          | 7/2368 [00:00<00:39, 60.34it/s]

Updating samples into Trainset: 2145 neg / 223 pos
**********
Iteration: 3


100%|██████████| 2368/2368 [00:40<00:00, 58.40it/s]


Pseudo-labeling 2368 instances: Done
Train 5466 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 5049 neg / 417 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.272198,0.638052,0.0,02:24


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.270822,0.75546,0.0,02:24


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.282638,0.60633,0.0,02:24


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.260628,0.663962,0.0,02:25


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.266738,0.674317,0.0,02:24


0it [00:00, ?it/s]


Remove 0 low confidence instances: Done
Train 5466 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.45064401776655316
Certain 1604 instances


100%|██████████| 1604/1604 [00:00<00:00, 26671.25it/s]
  0%|          | 7/1562 [00:00<00:25, 60.83it/s]

Updating samples into Trainset: 1463 neg / 99 pos
**********
Iteration: 4


100%|██████████| 1562/1562 [00:26<00:00, 58.58it/s]


Pseudo-labeling 1562 instances: Done
Train 7028 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 6512 neg / 516 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.235097,0.661231,0.0,03:02


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.262073,0.729295,0.0,03:03


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.250562,0.645589,0.006483,03:02


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.254228,0.690608,0.0,03:01


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.242213,0.723943,0.0,03:01


100%|██████████| 110/110 [00:00<00:00, 115.97it/s]


Remove 110 low confidence instances: Done
Train 6918 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.49081094943036097
Certain 4618 instances


100%|██████████| 4618/4618 [00:00<00:00, 31542.54it/s]


Updating samples into Trainset: 4618 neg / 0 pos
Prepare Working directory Done
Experiments with 7199 samples annotation


100%|██████████| 26679/26679 [05:50<00:00, 76.11it/s]
0it [00:00, ?it/s]


Preparing Dataframe Done
**********
Iteration: 0
Pseudo-labeling 0 instances: Done
Train 7199 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 5588 neg / 1611 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.376936,0.393383,0.525994,03:06


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.406378,0.389713,0.472284,03:06


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.395092,0.399227,0.552529,03:05


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.394804,0.392288,0.508827,03:05


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.396369,0.392632,0.511771,03:06


100%|██████████| 2607/2607 [00:22<00:00, 113.72it/s]


Remove 2607 low confidence instances: Done
Train 4592 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.2
Certain 208 instances


100%|██████████| 208/208 [00:00<00:00, 22291.89it/s]
  3%|▎         | 6/208 [00:00<00:03, 59.12it/s]

Updating samples into Trainset: 0 neg / 208 pos
**********
Iteration: 1


100%|██████████| 208/208 [00:03<00:00, 59.15it/s]


Pseudo-labeling 208 instances: Done
Train 4800 samples
Valid 2669 samples
Pool 6499 samples
Current iteration train: 4196 neg / 604 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.343372,0.417794,0.358586,02:08


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.331962,0.411478,0.467169,02:08


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.355647,0.511075,0.207977,02:09


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.392295,0.478644,0.233803,02:08


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.339845,0.524373,0.276964,02:09


100%|██████████| 51/51 [00:00<00:00, 117.05it/s]


Remove 51 low confidence instances: Done
Train 4749 samples
Valid 2669 samples
Pool 6499 samples


Entropy threshold for sampling: 0.30007770510315135
Certain 106 instances


100%|██████████| 106/106 [00:00<00:00, 23634.91it/s]
  7%|▋         | 7/106 [00:00<00:01, 60.85it/s]

Updating samples into Trainset: 102 neg / 4 pos
**********
Iteration: 2


100%|██████████| 106/106 [00:01<00:00, 60.00it/s]


Pseudo-labeling 106 instances: Done
Train 4855 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 4282 neg / 573 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.362311,0.471139,0.210084,02:09


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.366522,0.514697,0.0,02:09


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.346945,0.436654,0.297927,02:09


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.343612,0.551451,0.006494,02:10


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.360337,0.500291,0.0,02:09


100%|██████████| 96/96 [00:00<00:00, 117.24it/s]


Remove 96 low confidence instances: Done
Train 4759 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.3779234507029655
Certain 32 instances


100%|██████████| 32/32 [00:00<00:00, 19723.40it/s]
 19%|█▉        | 6/32 [00:00<00:00, 59.88it/s]

Updating samples into Trainset: 32 neg / 0 pos
**********
Iteration: 3


100%|██████████| 32/32 [00:00<00:00, 60.07it/s]


Pseudo-labeling 32 instances: Done
Train 4791 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 4266 neg / 525 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.378073,0.562555,0.0,02:08


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.333619,0.689908,0.0,02:08


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.376811,0.55069,0.0,02:08


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.375375,0.60005,0.0,02:08


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.347311,0.609989,0.0,02:08


100%|██████████| 8/8 [00:00<00:00, 114.10it/s]


Remove 8 low confidence instances: Done
Train 4783 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.4393046523324651
Certain 5 instances


100%|██████████| 5/5 [00:00<00:00, 10586.33it/s]
100%|██████████| 5/5 [00:00<00:00, 60.05it/s]


Updating samples into Trainset: 0 neg / 5 pos
**********
Iteration: 4
Pseudo-labeling 5 instances: Done
Train 4788 samples
Valid 2669 samples
Pool 6499 samples
Current iteration train: 4259 neg / 529 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.338535,0.560527,0.0,02:08


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.340127,0.619456,0.0,02:08


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.338448,0.698571,0.0,02:08


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.383404,0.608902,0.0,02:08


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.358373,0.570526,0.0,02:08


0it [00:00, ?it/s]


Remove 0 low confidence instances: Done
Train 4788 samples
Valid 2669 samples
Pool 6499 samples


Entropy threshold for sampling: 0.4816388015521494
Certain 260 instances


100%|██████████| 260/260 [00:00<00:00, 24423.17it/s]


Updating samples into Trainset: 257 neg / 2 pos
Prepare Working directory Done
Experiments with 9599 samples annotation


100%|██████████| 26678/26678 [05:53<00:00, 75.37it/s]
0it [00:00, ?it/s]


Preparing Dataframe Done
**********
Iteration: 0
Pseudo-labeling 0 instances: Done
Train 9599 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 7461 neg / 2138 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.406738,0.39534,0.521739,04:04


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.400032,0.390171,0.550607,04:04


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.41314,0.38225,0.546371,04:05


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.386011,0.379377,0.546559,04:04


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.373,0.36951,0.572797,04:04


100%|██████████| 3687/3687 [00:33<00:00, 110.50it/s]


Remove 3687 low confidence instances: Done
Train 5912 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.2
Certain 141 instances


100%|██████████| 141/141 [00:00<00:00, 22933.91it/s]
  4%|▍         | 6/141 [00:00<00:02, 58.38it/s]

Updating samples into Trainset: 0 neg / 141 pos
**********
Iteration: 1


100%|██████████| 141/141 [00:02<00:00, 58.65it/s]


Pseudo-labeling 141 instances: Done
Train 6053 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 5364 neg / 689 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.302991,0.462387,0.152717,02:39


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.293354,0.387405,0.59415,02:38


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.297379,0.49574,0.460512,02:39


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.272771,0.384298,0.598499,02:38


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.285914,0.390849,0.617696,02:39


100%|██████████| 377/377 [00:03<00:00, 105.79it/s]


Remove 377 low confidence instances: Done
Train 5676 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.29873978333230883
Certain 450 instances


100%|██████████| 450/450 [00:00<00:00, 18560.87it/s]
  1%|▏         | 6/450 [00:00<00:07, 57.08it/s]

Updating samples into Trainset: 450 neg / 0 pos
**********
Iteration: 2


100%|██████████| 450/450 [00:07<00:00, 57.09it/s]


Pseudo-labeling 450 instances: Done
Train 6126 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 5532 neg / 594 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.25876,0.400122,0.534579,02:40


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.243877,0.419744,0.405728,02:40


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.235655,0.410309,0.45403,02:40


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.260554,0.412284,0.509724,02:40


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.261824,0.428094,0.327228,02:40


100%|██████████| 621/621 [00:05<00:00, 111.37it/s]


Remove 621 low confidence instances: Done
Train 5505 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.37752401314381057
Certain 1037 instances


100%|██████████| 1037/1037 [00:00<00:00, 23895.03it/s]
  1%|          | 6/1037 [00:00<00:17, 58.58it/s]

Updating samples into Trainset: 0 neg / 1037 pos
**********
Iteration: 3


100%|██████████| 1037/1037 [00:17<00:00, 58.08it/s]


Pseudo-labeling 1037 instances: Done
Train 6542 samples
Valid 2669 samples
Pool 6499 samples
Current iteration train: 5176 neg / 1366 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.527307,0.510362,0.222222,02:50


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.523794,0.596672,0.0,02:49


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.510808,0.522047,0.182344,02:50


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.49251,0.609123,0.0,02:50


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.509601,0.578838,0.071875,02:50


100%|██████████| 4565/4565 [00:41<00:00, 110.59it/s]


Remove 4565 low confidence instances: Done
Train 1977 samples
Valid 2669 samples
Pool 6499 samples


Entropy threshold for sampling: 0.4408827955843893
Certain 206 instances


100%|██████████| 206/206 [00:00<00:00, 28585.54it/s]
  3%|▎         | 6/181 [00:00<00:02, 59.34it/s]

Updating samples into Trainset: 99 neg / 82 pos
**********
Iteration: 4


100%|██████████| 181/181 [00:03<00:00, 56.72it/s]


Pseudo-labeling 181 instances: Done
Train 2158 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 1872 neg / 286 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.390515,0.554035,0.022152,01:05


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.342893,0.619087,0.125668,01:05


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.330323,0.658194,0.066482,01:05


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.332448,0.676225,0.131234,01:05


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.324344,0.699762,0.048433,01:05


100%|██████████| 46/46 [00:00<00:00, 113.47it/s]


Remove 46 low confidence instances: Done
Train 2112 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.4914024861218893
Certain 2405 instances


100%|██████████| 2405/2405 [00:00<00:00, 25544.00it/s]


Updating samples into Trainset: 0 neg / 2405 pos
Prepare Working directory Done
Experiments with 12000 samples annotation


100%|██████████| 26677/26677 [05:54<00:00, 75.33it/s]
0it [00:00, ?it/s]


Preparing Dataframe Done
**********
Iteration: 0
Pseudo-labeling 0 instances: Done
Train 12000 samples
Valid 2669 samples
Pool 6499 samples
Current iteration train: 9300 neg / 2700 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.436844,0.400586,0.505796,05:02


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.379581,0.389461,0.467269,05:01


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.372159,0.369712,0.56705,05:02


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.386411,0.373392,0.568401,05:01


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.411286,0.375351,0.59944,05:02


100%|██████████| 4171/4171 [00:37<00:00, 111.72it/s]


Remove 4171 low confidence instances: Done
Train 7829 samples
Valid 2669 samples
Pool 6499 samples


Entropy threshold for sampling: 0.2
Certain 0 instances
Prepare Working directory Done
Experiments with 14400 samples annotation


100%|██████████| 26674/26674 [05:55<00:00, 75.03it/s]
0it [00:00, ?it/s]


Preparing Dataframe Done
**********
Iteration: 0
Pseudo-labeling 0 instances: Done
Train 14399 samples
Valid 2669 samples
Pool 6500 samples
Current iteration train: 11154 neg / 3245 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.395313,0.375616,0.54715,05:59


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.410101,0.37447,0.584942,05:58


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.382819,0.380754,0.577778,06:01


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.389446,0.362965,0.574144,06:00


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.41644,0.368862,0.539583,06:00


100%|██████████| 4827/4827 [00:43<00:00, 110.20it/s]


Remove 4827 low confidence instances: Done
Train 9572 samples
Valid 2669 samples
Pool 6500 samples


Entropy threshold for sampling: 0.2
Certain 625 instances


100%|██████████| 625/625 [00:00<00:00, 26456.21it/s]
  1%|          | 6/625 [00:00<00:10, 59.41it/s]

Updating samples into Trainset: 625 neg / 0 pos
**********
Iteration: 1


100%|██████████| 625/625 [00:10<00:00, 57.96it/s]


Pseudo-labeling 625 instances: Done
Train 10197 samples
Valid 2669 samples
Pool 6499 samples
Current iteration train: 9206 neg / 991 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.247806,0.38197,0.553191,04:18


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.228422,0.385509,0.575397,04:17


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.231894,0.397737,0.495258,04:17


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.227151,0.384209,0.541922,04:17


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.202096,0.369775,0.584416,04:17


100%|██████████| 471/471 [00:04<00:00, 107.25it/s]


Remove 471 low confidence instances: Done
Train 9726 samples
Valid 2669 samples
Pool 6499 samples


Entropy threshold for sampling: 0.3006299222552766
Certain 1447 instances


100%|██████████| 1447/1447 [00:00<00:00, 24029.42it/s]
  0%|          | 6/1406 [00:00<00:23, 59.04it/s]

Updating samples into Trainset: 1392 neg / 14 pos
**********
Iteration: 2


100%|██████████| 1406/1406 [00:24<00:00, 56.91it/s]


Pseudo-labeling 1406 instances: Done
Train 11132 samples
Valid 2669 samples
Pool 6499 samples
Current iteration train: 10258 neg / 874 pos
Momentum at epoch: 0


epoch,train_loss,valid_loss,f_beta,time
0,0.162625,0.379813,0.571429,04:40


Momentum at epoch: 1


epoch,train_loss,valid_loss,f_beta,time
0,0.191342,0.367474,0.593665,04:40


Momentum at epoch: 2


epoch,train_loss,valid_loss,f_beta,time
0,0.187242,0.384758,0.583257,04:40


Momentum at epoch: 3


epoch,train_loss,valid_loss,f_beta,time
0,0.181694,0.367371,0.583643,04:40


Momentum at epoch: 4


epoch,train_loss,valid_loss,f_beta,time
0,0.196612,0.401991,0.559259,04:39


100%|██████████| 185/185 [00:01<00:00, 113.93it/s]


Remove 185 low confidence instances: Done
Train 10947 samples
Valid 2669 samples
Pool 6499 samples


Entropy threshold for sampling: 0.3791371253635001
Certain 2004 instances


100%|██████████| 2004/2004 [00:00<00:00, 22041.82it/s]
  0%|          | 6/1691 [00:00<00:31, 53.48it/s]

Updating samples into Trainset: 78 neg / 1613 pos
**********
Iteration: 3


100%|██████████| 1691/1691 [00:29<00:00, 56.90it/s]


Pseudo-labeling 1691 instances: Done


ValueError: test_size=-0.10469068660774994 should be either positive and smaller than the number of samples 5884 or a float in the (0, 1) range

# Investigate

# Evaluation

In [None]:
feature_train, categories_train = feature_extractor(learn, learn.data.train_dl.dl)

pca = PCA(n_components=2)

feature_train_pca = pca.fit_transform(sklearn.preprocessing.normalize(feature_train))

x = [item[0] for item in feature_train_pca.tolist()]
y = [item[1] for item in feature_train_pca.tolist()]
color = ['darkorange' if item == 1 else 'navy' for item in categories_train.squeeze().tolist()]
plt.figure(figsize=(5, 5), dpi=80)
plt.scatter(x, y, color=color, alpha=0.5)
plt.show()

In [None]:
# for x,y in dls_anno.valid_dl.dl:
#     print(x,y)
#     break

In [None]:
feature_train, categories_train = feature_extractor(learn, dls_anno.valid_dl.dl)

pca = PCA(n_components=2)

feature_train_pca = pca.fit_transform(sklearn.preprocessing.normalize(feature_train))

x = [item[0] for item in feature_train_pca.tolist()]
y = [item[1] for item in feature_train_pca.tolist()]
color = ['darkorange' if item == 1 else 'navy' for item in categories_train.squeeze().tolist()]
plt.figure(figsize=(5, 5), dpi=80)
plt.scatter(x, y, color=color, alpha=0.5)
plt.show()

In [None]:
# dls_anno

In [None]:
# learn.get_preds()

In [None]:
# test_predictions = []
# for test_image in progress_bar(dls_anno.test_dl.dl):
#     test_predictions.append(learn.predict(test_image)[0])

# print(classification_report(
#     y_true=list(df_anno_test['target']), 
#     y_pred=test_predictions))

In [None]:
# feature_test = feature_extractor(learn, dls_pool, is_train=False)

In [None]:
# train_diffs = []
# batch_size = config['batch_size']
# feature_train = torch.tensor(feature_train)
# train_diffs = torch.empty([feature_train.shape[0], feature_train.shape[0]])

# for i in tqdm(range((feature_train.shape[0]//batch_size) + 1)):
#     if i*batch_size > feature_train.shape[0]:
#         break
                      
#     end_idx = min(feature_train.shape[0], (i+1)*batch_size)
        
#     batch1 = feature_train[i*batch_size:end_idx]
    
#     batch2 = feature_train
#     diff = torch.norm(batch1[:, np.newaxis] - batch2, dim=-1)
    
#     train_diffs[i*batch_size:end_idx] = diff

In [None]:
# test_diffs = []
# batch_size = config['batch_size']

# feature_test = torch.tensor(feature_test)

# test_diffs = torch.empty([feature_test.shape[0], feature_train.shape[0]])

# for i in tqdm(range((feature_test.shape[0]//batch_size) + 1)):
#     if i*batch_size > feature_test.shape[0]:
#         break
                      
#     end_idx = min(feature_test.shape[0], (i+1)*batch_size)
        
#     batch1 = feature_test[i*batch_size:end_idx]
    
#     batch2 = feature_train
#     diff = torch.norm(batch1[:, np.newaxis] - batch2, dim=-1)
    
#     test_diffs[i*batch_size:end_idx] = diff

In [None]:
# test_diffs.shape

In [None]:
# test_diffs = test_diffs.cpu().numpy()
# test_diffs.sort()

In [None]:
# feature_train, categories_train = feature_extractor(learn, learn.data.valid_dl.dl)

# pca = PCA(n_components=2)

# feature_train_pca = pca.fit_transform(sklearn.preprocessing.normalize(feature_train))

# x = [item[0] for item in feature_train_pca.tolist()]
# y = [item[1] for item in feature_train_pca.tolist()]
# color = ['darkorange' if item == 1 else 'navy' for item in categories_train.squeeze().tolist()]

# plt.figure(figsize=(5, 5), dpi=80)

# plt.scatter(x, y, color=color, alpha=0.5)
# plt.show()

In [None]:
# import plotly.figure_factory as ff
# #
# hist_data = [test_diffs[:, 1:12].mean(axis=-1), np.median(test_diffs[:, 1:12], axis=-1)]
# group_labels = ['mean', 'median']
# ff.create_distplot(hist_data, group_labels).show('iframe')

In [None]:
# tmp = pd.DataFrame(test_diffs[:, :11])
# valid_index = tmp[tmp[0] < 3.15].index
# len(valid_index)