In [23]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

In [24]:
DATA_KAGGLE_DIR = '../../raw'
EXP_NAME = "ens_baseline"
LIST_CV = [
    'TS_5_4',
    'TS_69_2',
    'TS_6_4',
    'TS_6_6',
]

In [25]:
WANDB = False
WANDB_EXP_NAME = f"{EXP_NAME}"
# EXP_NAME = "try"

if WANDB:
    # !pip install wandb
    import wandb
    import os
    from dotenv import load_dotenv
    load_dotenv()
    wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [26]:
sub1 = pd.read_csv("../../proc/sub/submission_baseline_cv.csv")
sub2 = pd.read_csv("../../proc/sub/submission_cv_mod_resize.csv")
sub3 = pd.read_csv("../../proc/sub/submission_cv_mod_resize_35.csv")

list_sub = [sub1, sub2, sub3]

In [27]:
# https://www.kaggle.com/code/sachinkumar62/czii-cryo-s
df = pd.concat(list_sub, ignore_index=True)

particle_names = ['apo-ferritin', 'beta-amylase', 'beta-galactosidase', 'ribosome', 'thyroglobulin', 'virus-like-particle']
particle_radius = {
    'apo-ferritin': 60,
    'beta-amylase': 60,
    'beta-galactosidase': 90,
    'ribosome': 140,
    'thyroglobulin': 130,
    'virus-like-particle': 140,
}



final = []
for pidx, p in enumerate(particle_names):
    pdf = df[df['particle_type'] == p].reset_index(drop=True)
    p_rad = particle_radius[p]
    
    grouped = pdf.groupby(['experiment'])
    
    for exp, group in grouped:
        group = group.reset_index(drop=True)
        
        coords = group[['x', 'y', 'z']].values
        db = DBSCAN(eps=p_rad, min_samples=2, metric='euclidean').fit(coords)
        labels = db.labels_
        
        group['cluster'] = labels
        
        for cluster_id in np.unique(labels):
            if cluster_id == -1:
                continue
            
            cluster_points = group[group['cluster'] == cluster_id]
            
            avg_x = cluster_points['x'].mean()
            avg_y = cluster_points['y'].mean()
            avg_z = cluster_points['z'].mean()
            
            group.loc[group['cluster'] == cluster_id, ['x', 'y', 'z']] = avg_x, avg_y, avg_z
            group = group.drop_duplicates(subset=['x', 'y', 'z'])
        final.append(group)

df_save = pd.concat(final, ignore_index=True)
df_save = df_save.drop(columns=['cluster'])
df_save = df_save.sort_values(by=['experiment', 'particle_type']).reset_index(drop=True)
df_save['id'] = np.arange(0, len(df_save))
# df_save.to_csv('submission.csv', index=False)

In [28]:
df_save

Unnamed: 0,id,experiment,particle_type,x,y,z
0,0,TS_5_4,apo-ferritin,5880.101481,5132.953020,88.750000
1,1,TS_5_4,apo-ferritin,5748.147477,5107.293462,91.111111
2,2,TS_5_4,apo-ferritin,5471.871745,1521.511971,91.666667
3,3,TS_5_4,apo-ferritin,5720.456589,5002.732057,115.000000
4,4,TS_5_4,apo-ferritin,5300.916829,4170.392795,141.666667
...,...,...,...,...,...,...
1366,1366,TS_6_6,virus-like-particle,5211.253426,4068.538546,260.000000
1367,1367,TS_6_6,virus-like-particle,241.136249,3182.985952,492.500000
1368,1368,TS_6_6,virus-like-particle,336.642212,3173.884521,650.000000
1369,1369,TS_6_6,virus-like-particle,3665.172445,4075.987793,1105.000000


# Scoring

In [29]:
submission = df_save

In [30]:
import sys
sys.path.append('hengck')

from czii_helper import *
from dataset import *
from model2 import *
import numpy as np
from scipy.optimize import linear_sum_assignment

In [31]:
def do_one_eval(truth, predict, threshold):
    P=len(predict)
    T=len(truth)

    if P==0:
        hit=[[],[]]
        miss=np.arange(T).tolist()
        fp=[]
        metric = [P,T,len(hit[0]),len(miss),len(fp)]
        return hit, fp, miss, metric

    if T==0:
        hit=[[],[]]
        fp=np.arange(P).tolist()
        miss=[]
        metric = [P,T,len(hit[0]),len(miss),len(fp)]
        return hit, fp, miss, metric

    #---
    distance = predict.reshape(P,1,3)-truth.reshape(1,T,3)
    distance = distance**2
    distance = distance.sum(axis=2)
    distance = np.sqrt(distance)
    p_index, t_index = linear_sum_assignment(distance)

    valid = distance[p_index, t_index] <= threshold
    p_index = p_index[valid]
    t_index = t_index[valid]
    hit = [p_index.tolist(), t_index.tolist()]
    miss = np.arange(T)
    miss = miss[~np.isin(miss,t_index)].tolist()
    fp = np.arange(P)
    fp = fp[~np.isin(fp,p_index)].tolist()

    metric = [P,T,len(hit[0]),len(miss),len(fp)] #for lb metric F-beta copmutation
    return hit, fp, miss, metric


def compute_lb(submit_df, overlay_dir):
    valid_id = list(submit_df['experiment'].unique())
    print(valid_id)

    eval_df = []
    for id in valid_id:
        truth = read_one_truth(id, overlay_dir) #=f'{valid_dir}/overlay/ExperimentRuns')
        id_df = submit_df[submit_df['experiment'] == id]
        for p in PARTICLE:
            p = dotdict(p)
            print('\r', id, p.name, end='', flush=True)
            xyz_truth = truth[p.name]
            xyz_predict = id_df[id_df['particle_type'] == p.name][['x', 'y', 'z']].values
            hit, fp, miss, metric = do_one_eval(xyz_truth, xyz_predict, p.radius* 0.5)
            eval_df.append(dotdict(
                id=id, particle_type=p.name,
                P=metric[0], T=metric[1], hit=metric[2], miss=metric[3], fp=metric[4],
            ))
    print('')
    eval_df_all = pd.DataFrame(eval_df)
    gb_all = []
    lb_score_all = []
    for exp in LIST_CV:
        eval_df = eval_df_all[eval_df_all['id'] == exp]
        gb = eval_df.groupby('particle_type').agg('sum').drop(columns=['id'])
        gb.loc[:, 'precision'] = gb['hit'] / gb['P']
        gb.loc[:, 'precision'] = gb['precision'].fillna(0)
        gb.loc[:, 'recall'] = gb['hit'] / gb['T']
        gb.loc[:, 'recall'] = gb['recall'].fillna(0)
        gb.loc[:, 'f-beta4'] = 17 * gb['precision'] * gb['recall'] / (16 * gb['precision'] + gb['recall'])
        gb.loc[:, 'f-beta4'] = gb['f-beta4'].fillna(0)

        gb = gb.sort_values('particle_type').reset_index(drop=False)
        # https://www.kaggle.com/competitions/czii-cryo-et-object-identification/discussion/544895
        gb.loc[:, 'weight'] = [1, 0, 2, 1, 2, 1]
        lb_score = (gb['f-beta4'] * gb['weight']).sum() / gb['weight'].sum()
        gb_all.append(gb)
        lb_score_all.append(lb_score)
    return gb_all, lb_score_all


def score_submission(submission):
    #if 1:
    submit_df=submission.copy()
    gb_all, lb_score_all = compute_lb(submit_df, '../../raw/train/overlay/ExperimentRuns')
    for gb, lb_score in zip(gb_all, lb_score_all):
        display(gb)
        print(f'lb_score: {lb_score:.4f}')
        print('')
        print("--------------------------------")

    return lb_score_all


    #show one ----------------------------------
    # fig = plt.figure(figsize=(18, 8))

    # id = valid_id[0]
    # truth = read_one_truth(id,overlay_dir=f'{valid_dir}/overlay/ExperimentRuns')

    # submit_df = submit_df[submit_df['experiment']==id]
    # for p in PARTICLE:
    #     p = dotdict(p)
    #     xyz_truth = truth[p.name]
    #     xyz_predict = submit_df[submit_df['particle_type']==p.name][['x','y','z']].values
    #     hit, fp, miss, _ = do_one_eval(xyz_truth, xyz_predict, p.radius)
    #     print(id, p.name)
    #     print('\t num truth   :',len(xyz_truth) )
    #     print('\t num predict :',len(xyz_predict) )
    #     print('\t num hit  :',len(hit[0]) )
    #     print('\t num fp   :',len(fp) )
    #     print('\t num miss :',len(miss) )

    #     ax = fig.add_subplot(2, 3, p.label, projection='3d')
    #     if hit[0]:
    #         pt = xyz_predict[hit[0]]
    #         ax.scatter(pt[:, 0], pt[:, 1], pt[:, 2], alpha=0.5, color='r')
    #         pt = xyz_truth[hit[1]]
    #         ax.scatter(pt[:,0], pt[:,1], pt[:,2], s=80, facecolors='none', edgecolors='r')
    #     if fp:
    #         pt = xyz_predict[fp]
    #         ax.scatter(pt[:, 0], pt[:, 1], pt[:, 2], alpha=1, color='k')
    #     if miss:
    #         pt = xyz_truth[miss]
    #         ax.scatter(pt[:, 0], pt[:, 1], pt[:, 2], s=160, alpha=1, facecolors='none', edgecolors='k')

    #     ax.set_title(f'{p.name} ({p.difficulty})')

    # plt.tight_layout()
    # plt.show()
    
    # #--- 
    # zz=0

In [32]:
lb_score_all = score_submission(submission)

['TS_5_4', 'TS_69_2', 'TS_6_4', 'TS_6_6']
 TS_6_6 virus-like-particlee


Unnamed: 0,particle_type,P,T,hit,miss,fp,precision,recall,f-beta4,weight
0,apo-ferritin,59,46,38,8,21,0.644068,0.826087,0.812579,1
1,beta-amylase,0,10,0,10,0,0.0,0.0,0.0,0
2,beta-galactosidase,30,12,8,4,22,0.266667,0.666667,0.612613,2
3,ribosome,43,31,24,7,19,0.55814,0.774194,0.756957,1
4,thyroglobulin,128,30,24,6,104,0.1875,0.8,0.671053,2
5,virus-like-particle,16,11,11,0,5,0.6875,1.0,0.973958,1


lb_score: 0.7301

--------------------------------


Unnamed: 0,particle_type,P,T,hit,miss,fp,precision,recall,f-beta4,weight
0,apo-ferritin,59,35,35,0,24,0.59322,1.0,0.961228,1
1,beta-amylase,0,12,0,12,0,0.0,0.0,0.0,0
2,beta-galactosidase,44,16,10,6,34,0.227273,0.625,0.566667,2
3,ribosome,49,37,34,3,15,0.693878,0.918919,0.901716,1
4,thyroglobulin,125,34,27,7,98,0.216,0.794118,0.686099,2
5,virus-like-particle,20,9,9,0,11,0.45,1.0,0.932927,1


lb_score: 0.7573

--------------------------------


Unnamed: 0,particle_type,P,T,hit,miss,fp,precision,recall,f-beta4,weight
0,apo-ferritin,89,58,53,5,36,0.595506,0.913793,0.885939,1
1,beta-amylase,0,9,0,9,0,0.0,0.0,0.0,0
2,beta-galactosidase,43,12,9,3,34,0.209302,0.75,0.651064,2
3,ribosome,115,74,62,12,53,0.53913,0.837838,0.811393,1
4,thyroglobulin,149,30,26,4,123,0.174497,0.866667,0.702703,2
5,virus-like-particle,20,10,8,2,12,0.4,0.8,0.755556,1


lb_score: 0.7372

--------------------------------


Unnamed: 0,particle_type,P,T,hit,miss,fp,precision,recall,f-beta4,weight
0,apo-ferritin,69,41,40,1,29,0.57971,0.97561,0.937931,1
1,beta-amylase,0,14,0,14,0,0.0,0.0,0.0,0
2,beta-galactosidase,45,11,8,3,37,0.177778,0.727273,0.615385,2
3,ribosome,22,23,16,7,6,0.727273,0.695652,0.697436,1
4,thyroglobulin,212,35,29,6,183,0.136792,0.828571,0.638601,2
5,virus-like-particle,34,19,17,2,17,0.5,0.894737,0.85503,1


lb_score: 0.7141

--------------------------------


In [33]:
# wandbの初期化
if WANDB:
    wandb_config = {
        # ... 既存の設定 ...
        # "epochs": CONFIG['epochs'],
        # "learning_rate": CONFIG['learning_rate'],
        # "min_lr": CONFIG["min_lr"],
        # "weight_decay": CONFIG["weight_decay"],
        # "mixup_alpha": CONFIG["mixup_alpha"],
        # "mixup_epochs": CONFIG["mixup_epochs"],  # 新しく追加
    }
    wandb.init(project="CZII", name=WANDB_EXP_NAME, config=wandb_config)

for exp, score in zip(LIST_CV, lb_score_all):
    print(f'lb_score: {score:.4f}')
    if WANDB:
        wandb.log({f"lb_score_{exp}": score})
print(f'mean: {np.mean(lb_score_all):.4f}')
if WANDB:
    wandb.log({"mean_lb_score": np.mean(lb_score_all)})
    wandb.finish()

lb_score: 0.7301
lb_score: 0.7573
lb_score: 0.7372
lb_score: 0.7141
mean: 0.7347
