In [1]:
import os

from utils.data_func import *
from utils.auxiliary_func import *
from utils.plots_func import *
import config.const as const
import config.params as params

from scipy.stats import ttest_ind, mannwhitneyu
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

## Define Parameters

In [2]:
# Root folder under which this notebook stores its outputs.
figure_results_dir = os.path.join(const.base_dir, 'figure_results')

# One sub-folder for intermediate tables, one named 'Figure 4'.
data_preparation_dir = os.path.join(figure_results_dir, 'data_preparation')
figure_4_dir = os.path.join(figure_results_dir, 'Figure 4')

# Create both folders up front; exist_ok=True makes re-running the cell harmless.
for output_dir in (data_preparation_dir, figure_4_dir):
    os.makedirs(output_dir, exist_ok=True)

### Read files

In [3]:
# Read the RCB file through the project helper, which returns three objects.
# NOTE(review): the helper lives in utils; presumably slide_num=True is what
# adds the 'slide_num' column used for pairing further down -- confirm.
tnbc_tables = extract_core_resection_from_tnbc(const.rcb_file, slide_num=True)
df_category_with_rcb, all_leap_list, df_category_all = tnbc_tables

# Preview the first of the three returned tables.
df_category_with_rcb.head()

Core Responder Count: 30
Core Non-Responder Count: 23
Resection Count: 17


Unnamed: 0,leap_ID,slide_num,sample_type,category,RCB_group
14,15,7,core,non responder,3.0
15,16,7,resection,non responder,3.0
16,17,8,core,non responder,2.0
17,18,8,resection,non responder,2.0
18,19,9,core,non responder,3.0


### Create median lifetime data frame from all the sample types

In [4]:
# Load the per-sample feature table ('features_median_data.csv') of the
# resection samples.
sample_type = 'resection'
resection_dir = os.path.join(const.full_tissue_dir, sample_type)

# Build the file path with os.path.join, like every other path in this
# notebook, instead of hard-coding a '/' separator in an f-string.
resection_median_path = os.path.join(resection_dir, 'features_median_data.csv')

# Read leap_ID as text so the IDs are kept as strings rather than parsed as
# integers (they are compared against string IDs in later cells).
resection_median_df = pd.read_csv(resection_median_path, dtype={'leap_ID': str})
resection_median_df.head()


Unnamed: 0,leap_ID,lifetime_mean,area,extent,solidity,perimeter,diameter_area,convex_hull_area,minor_axis_length,perimeter_crofton,major_axis_length,orientation,diameter_max,eccentricity,density_radius_20,density_radius_40,density_radius_60,density_radius_80,category
0,16,3.975238,17.0,0.76,0.954545,13.071068,4.652426,18.0,3.993285,15.073666,5.602338,0.0,5.830952,0.68313,0.011141,0.009748,0.00893,0.008356,non responder
1,18,4.234513,16.0,0.75,0.947368,12.828427,4.513517,18.0,3.679465,15.073666,5.782733,0.0,6.0,0.739975,0.007958,0.006764,0.006101,0.005769,non responder
2,20,4.22754,29.0,0.75,0.947368,17.899495,6.076508,31.0,5.212951,19.976625,7.276737,0.0,7.615773,0.673317,0.011937,0.012136,0.012113,0.012086,non responder
3,29,2.878154,14.0,0.75,0.952381,11.656854,4.222008,15.0,3.289022,13.732908,5.326819,0.0,5.385165,0.752023,0.007958,0.00756,0.00725,0.007063,non responder
4,33,3.444826,19.0,0.75,0.947368,14.242641,4.918491,21.0,3.938749,16.414425,6.286994,0.050623,6.324555,0.74455,0.010345,0.009748,0.009372,0.009052,non responder


In [5]:
# Load the per-sample feature table ('features_median_data.csv') of the
# core samples.
sample_type = 'core'
core_dir = os.path.join(const.full_tissue_dir, sample_type)

# Build the file path with os.path.join, like every other path in this
# notebook, instead of hard-coding a '/' separator in an f-string.
core_median_path = os.path.join(core_dir, 'features_median_data.csv')

# Read leap_ID as text so the IDs are kept as strings rather than parsed as
# integers (they are compared against string IDs in later cells).
core_median_df = pd.read_csv(core_median_path, dtype={'leap_ID': str})
core_median_df.head()


Unnamed: 0,leap_ID,lifetime_mean,area,extent,solidity,perimeter,diameter_area,convex_hull_area,minor_axis_length,perimeter_crofton,major_axis_length,orientation,diameter_max,eccentricity,density_radius_20,density_radius_40,density_radius_60,density_radius_80,category
0,15,3.915147,16.0,0.75,0.954545,12.242641,4.513517,17.0,3.669612,14.518306,5.449354,0.0,5.385165,0.71318,0.007958,0.005769,0.00504,0.004526,non responder
1,17,4.1773,18.0,0.75,0.944444,13.656854,4.787307,20.0,3.991464,16.184387,6.06544,0.084918,6.324555,0.722862,0.019099,0.018104,0.016977,0.016015,non responder
2,19,4.465636,17.0,0.755102,0.958333,13.071068,4.652426,18.0,3.894303,15.073666,5.602338,0.0,5.830952,0.694588,0.016711,0.01691,0.0168,0.016662,non responder
3,21,4.353615,14.0,0.75,0.96,11.071068,4.222008,14.0,3.337716,13.732908,5.149449,0.0,5.385165,0.729661,0.012732,0.011738,0.011406,0.011091,responder
4,23,3.970538,11.0,0.75,1.0,9.414214,3.74241,11.0,2.970079,11.836789,4.418072,0.0,4.472136,0.720941,0.015915,0.014125,0.013528,0.013081,responder


In [6]:
# Only the sample ID and its lifetime value are carried forward.
lifetime_columns = ['leap_ID', 'lifetime_mean']

# Split the core samples by their response category.
is_core_responder = core_median_df['category'] == 'responder'
is_core_non_responder = core_median_df['category'] == 'non responder'
core_responder = core_median_df.loc[is_core_responder, lifetime_columns]
core_non_responder = core_median_df.loc[is_core_non_responder, lifetime_columns]

# Resection samples are restricted to the 'non responder' category.
# (Unfiltered alternative: resection_median_df[['leap_ID', 'lifetime_mean']].)
is_resection_non_responder = resection_median_df['category'] == 'non responder'
resection = resection_median_df.loc[is_resection_non_responder, lifetime_columns]


In [7]:
# Tag every group with its label. assign() returns a new frame, so the
# filtered inputs from the previous cell are left untouched.
df_responder = core_responder.assign(sample_type='core_responder')
df_non_responder = core_non_responder.assign(sample_type='core_non_responder')
df_resection = resection.assign(sample_type='resection')

# Stack the three groups, put the columns in a fixed order, and expose the
# lifetime column under the name 'median_lifetime'.
labelled_groups = [df_responder, df_non_responder, df_resection]
median_df_all_samples = (
    pd.concat(labelled_groups, ignore_index=True)
    .loc[:, ['leap_ID', 'sample_type', 'lifetime_mean']]
    .rename(columns={'lifetime_mean': 'median_lifetime'})
)



In [8]:
# Preview the first rows of the combined table (leap_ID, sample_type, median_lifetime).
median_df_all_samples.head()

Unnamed: 0,leap_ID,sample_type,median_lifetime
0,21,core_responder,4.353615
1,23,core_responder,3.970538
2,36,core_responder,4.0532
3,38,core_responder,2.602444
4,41,core_responder,2.059636


In [9]:
# Persist the combined median-lifetime table in the data-preparation folder.
output_all_median_path = os.path.join(
    data_preparation_dir,
    'median_lifetime_df_core_and_resection_samples.csv',
)

# index=False: the positional index carries no information, so leave it out.
median_df_all_samples.to_csv(path_or_buf=output_all_median_path, index=False)

## Create the core-resection matched pair similarity rank

In [10]:
def _has_core_and_resection(slide_rows):
    """Return True when the rows of one slide_num include both a 'core' and a 'resection' sample."""
    return {'core', 'resection'}.issubset(slide_rows['sample_type'])


# Keep only the slides that carry both sample types ...
grouped = df_category_with_rcb.groupby('slide_num')
filtered_groups = grouped.filter(_has_core_and_resection)

# ... and, within them, only the samples that have a median-lifetime entry.
filtered_leap_ids = median_df_all_samples['leap_ID'].unique()
has_median_lifetime = filtered_groups['leap_ID'].isin(filtered_leap_ids)
filtered_groups = filtered_groups.loc[has_median_lifetime]

# Attach each sample's median lifetime; a left join keeps every filtered row.
filtered_groups_with_median = pd.merge(
    filtered_groups,
    median_df_all_samples[['leap_ID', 'median_lifetime']],
    how='left',
    on='leap_ID',
)

# Show the first rows of the merged table.
filtered_groups_with_median.head()


Unnamed: 0,leap_ID,slide_num,sample_type,category,RCB_group,median_lifetime
0,15,7,core,non responder,3.0,3.915147
1,16,7,resection,non responder,3.0,3.975238
2,17,8,core,non responder,2.0,4.1773
3,18,8,resection,non responder,2.0,4.234513
4,19,9,core,non responder,3.0,4.465636


In [11]:
def _abs_diffs_to_other_cores(core_rows, matched_core_id, reference_lifetime):
    """Absolute lifetime differences between a reference value and the other cores.

    Parameters
    ----------
    core_rows : DataFrame with 'leap_ID' and 'median_lifetime' columns.
    matched_core_id : leap_ID of the matched core, which is left out.
    reference_lifetime : lifetime value every remaining core is compared with.

    Returns
    -------
    list of float
        |median_lifetime - reference_lifetime| for each remaining row, in the
        row order of core_rows.
    """
    other_lifetimes = core_rows.loc[core_rows['leap_ID'] != matched_core_id, 'median_lifetime']
    return (other_lifetimes - reference_lifetime).abs().tolist()


# The split of the cores by label does not depend on the pair being processed,
# so do it once here instead of re-scanning the whole table for every pair.
sample_labels = median_df_all_samples['sample_type']
core_nr_rows = median_df_all_samples[sample_labels == 'core_non_responder']
core_r_rows = median_df_all_samples[sample_labels == 'core_responder']

# (core leap_ID, resection leap_ID) -> differences for the matched pair and
# for every other core against the same resection.
pair_dict = {}
for slide_num, group in filtered_groups_with_median.groupby('slide_num'):
    # Rows of this slide, separated by sample type
    core_group = group[group['sample_type'].str.contains("core")]
    resection_group = group[group['sample_type'] == "resection"]

    # A slide can lose one side in the earlier leap_ID filtering; skip it then.
    if core_group.empty or resection_group.empty:
        continue

    # When a slide has several rows of one type, the first one is used.
    core_leap_id, core_lifetime = core_group['leap_ID'].values[0], core_group['median_lifetime'].values[0]
    resection_leap_id, resection_lifetime = resection_group['leap_ID'].values[0], resection_group['median_lifetime'].values[0]

    # Absolute lifetime difference inside the matched pair
    diff_couple = abs(core_lifetime - resection_lifetime)
    print(f"core: {core_leap_id}, resection: {resection_leap_id}, difference: {diff_couple}")

    # Same difference, but between this resection and every *other* core,
    # separately for non-responder and responder cores.
    other_core_nr = _abs_diffs_to_other_cores(core_nr_rows, core_leap_id, resection_lifetime)
    other_core_r = _abs_diffs_to_other_cores(core_r_rows, core_leap_id, resection_lifetime)

    pair_dict[(core_leap_id, resection_leap_id)] = {
        'couple_diff': diff_couple,
        'other_core_nr_diff': other_core_nr,
        'other_core_r_diff': other_core_r,
    }

core: 015, resection: 016, difference: 0.06009103641456637
core: 017, resection: 018, difference: 0.05721282051282017
core: 019, resection: 020, difference: 0.23809636363636333
core: 028, resection: 029, difference: 0.9069538461538462
core: 032, resection: 033, difference: 0.9290448369565212
core: 034, resection: 035, difference: 0.1350630252100844
core: 039, resection: 040, difference: 0.028833333333333933
core: 046, resection: 047, difference: 0.6533571428571427
core: 048, resection: 049, difference: 0.24651044932079325
core: 050, resection: 051, difference: 0.0836607142857142
core: 064, resection: 065, difference: 0.058428322152856094
core: 073, resection: 074, difference: 0.16764674922600653
core: 093, resection: 094, difference: 0.013509677419354826
core: 106, resection: 107, difference: 0.06623976608187165


In [12]:
# matched_rank_dict_path = os.path.join(data_preparation_dir, 'matched_couple_diff_and_other_couple_diff_paired_dict.pkl')

# with open(matched_rank_dict_path, 'wb') as f:
#     pickle.dump(pair_dict, f)

In [14]:
# One record per matched (core, resection) pair.
rank_data = []

for (core_id, resection_id), values in pair_dict.items():
    couple_diff = values['couple_diff']
    other_core_nr_diff = values['other_core_nr_diff']

    # 1-based rank of the matched pair among the non-responder cores:
    # one plus the number of other cores that are strictly closer to the
    # resection. For ordinary numbers this equals the position of couple_diff
    # in the sorted list of all differences, but unlike sorting it stays well
    # defined when a difference is NaN (NaN never compares as smaller).
    closer_cores = sum(1 for other_diff in other_core_nr_diff if other_diff < couple_diff)
    nr_rank = closer_cores + 1

    rank_data.append({
        'core_leap_id': core_id,
        'resection_leap_id': resection_id,
        'nr_rank': nr_rank,
    })

# Collect the ranks in a DataFrame (it is written to disk in a later cell).
rank_df = pd.DataFrame(rank_data)
rank_df.head()


Unnamed: 0,core_leap_id,resection_leap_id,nr_rank
0,15,16,1
1,17,18,3
2,19,20,4
3,28,29,17
4,32,33,18


In [15]:
# Write the matched-pair ranking next to the other data-preparation outputs.
core_resection_ranking_path = os.path.join(
    data_preparation_dir,
    'core_resection_matched_pair_ranking.csv',
)
rank_df.to_csv(path_or_buf=core_resection_ranking_path, index=False)

# Report where the file was written.
print(f"CSV saved to {core_resection_ranking_path}")

CSV saved to /sise/assafzar-group/assafzar/reut/Test_FLIM/from_FLIM_dir/figure_results/data_preparation/core_resection_matched_pair_ranking.csv
