In [26]:
import pandas as pd
import os
from transformers import set_seed
import numpy as np

In [27]:
set_seed(42)

In [28]:
comp_train = pd.read_csv('../data/clean/non_synoptic/complete/complete_train_data.csv')

In [30]:
directory = 'U:\Documents\Breast_Non_Synoptic\src'
os.chdir(directory)
foi_lookup_table = pd.read_csv('../results/EDA/FOI_lookup_table.csv',
                         usecols=['label','label_key', 'label_value', 'question'])

In [31]:
# add the label_keys to comp_train from foi_lookup_table
new_lookup = foi_lookup_table.drop_duplicates(subset = ["label_key", "question"])
merged_df = pd.merge(comp_train, new_lookup[['question', 'label_key']], on='question', how='left')

new_comp_train = comp_train.copy()

new_comp_train['label_key'] = merged_df['label_key_y']


In [32]:
desired_label_keys = ["DCIS Margins", "ER Status", "Extranodal Extension", "HER2 Status", "Insitu Component", "Invasive Carcinoma", "Invasive Carcinoma Margins", "Lymphovascular Invasion", "Necrosis", "PR Status", "Tumour Focality"]
sub_comp_train = new_comp_train[new_comp_train['label_key'].isin(desired_label_keys)]

In [33]:
# Create a new column 'mention' based on whether 'answer' is NaN or not
sub_comp_train['mention'] = sub_comp_train['answer'].apply(lambda x: 'No' if pd.isna(x) else 'Yes')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_comp_train['mention'] = sub_comp_train['answer'].apply(lambda x: 'No' if pd.isna(x) else 'Yes')


In [34]:
# Group by 'label_key' and 'mention' columns and count the occurrences
mention_counts = sub_comp_train.groupby(['label_key', 'mention']).size().reset_index(name='count')

# Pivot the DataFrame to have 'mention' as columns and the count as values
mention_counts_table = mention_counts.pivot_table(index='label_key', columns='mention', values='count', fill_value=0)

In [35]:
mention_counts_table

mention,No,Yes
label_key,Unnamed: 1_level_1,Unnamed: 2_level_1
DCIS Margins,71,160
ER Status,157,68
Extranodal Extension,183,46
HER2 Status,166,61
Insitu Component,22,205
Invasive Carcinoma,14,212
Invasive Carcinoma Margins,59,168
Lymphovascular Invasion,60,165
Necrosis,106,119
PR Status,167,58


In [36]:
# Calculate the total number of mentions for each label_key
total_mentions = mention_counts_table['No'] + mention_counts_table['Yes']

# get the total number of no + yes
mention_counts_table['Total'] = mention_counts_table['No'] + mention_counts_table['Yes']

# if Yes is larger, then Yes/No, otherwise No is larger, and No/Yes
mention_counts_table['imbalance ratio'] = np.where(
    mention_counts_table['Yes'] > mention_counts_table['No'],
    mention_counts_table['Yes'] / mention_counts_table['No'],
    mention_counts_table['No'] / mention_counts_table['Yes']
)

mention_counts_table_sorted = mention_counts_table.sort_values(by='imbalance ratio', ascending=False)


In [37]:
mention_counts_table_sorted

mention,No,Yes,Total,imbalance ratio
label_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Invasive Carcinoma,14,212,226,15.142857
Insitu Component,22,205,227,9.318182
Extranodal Extension,183,46,229,3.978261
PR Status,167,58,225,2.87931
Invasive Carcinoma Margins,59,168,227,2.847458
Lymphovascular Invasion,60,165,225,2.75
HER2 Status,166,61,227,2.721311
Tumour Focality,158,67,225,2.358209
ER Status,157,68,225,2.308824
DCIS Margins,71,160,231,2.253521


In [38]:
# Split DataFrame into parts based on conditions
invasive_carcinoma_condition = (sub_comp_train['label_key'] == 'Invasive Carcinoma') & (sub_comp_train['mention'] == 'Yes')
insitu_component_condition = (sub_comp_train['label_key'] == 'Insitu Component') & (sub_comp_train['mention'] == 'Yes')
# no_condition = (sub_comp_train['label_key'] == 'Extranodal Extension') & (sub_comp_train['mention'] == 'No')

# Select the % of rows you want to remove
invasive_to_remove = sub_comp_train[invasive_carcinoma_condition].sample(frac=0.7721)
insitu_to_remove = sub_comp_train[insitu_component_condition].sample(frac=0.6903)
# no_to_remove = sub_comp_train[no_condition].sample(frac=0.5)

# Create a mask that identifies the indices of the rows to keep
mask_to_keep = ~sub_comp_train.index.isin(invasive_to_remove.index) & \
               ~sub_comp_train.index.isin(insitu_to_remove.index) # & \
#               ~sub_comp_train.index.isin(no_to_remove.index)

# Concatenate the kept rows with the rest of the DataFrame
downsampled_sub_comp_train = sub_comp_train[mask_to_keep]

# Optionally reset the index
downsampled_sub_comp_train.reset_index(drop=True, inplace=True)

In [39]:
# save downsampled 11 to csv
downsampled_sub_comp_train.to_csv('../data/clean/non_synoptic/complete/sub_11/downsampled_sub_11_comp_train.csv', index=False)

# save non-downsampled 11 to csv 
sub_comp_train.reset_index(drop=True, inplace=True)
sub_comp_train.to_csv('../data/clean/non_synoptic/complete/sub_11/sub_11_comp_train.csv')

In [40]:
# Read validation data
comp_val = pd.read_csv('../data/clean/non_synoptic/complete/complete_val_data.csv')

# Change directory if needed
directory = 'U:\Documents\Breast_Non_Synoptic\src'
os.chdir(directory)

# Read FOI lookup table
foi_lookup_table = pd.read_csv('../results/EDA/FOI_lookup_table.csv', usecols=['label','label_key', 'label_value', 'question'])

# Add the label_keys to comp_val from foi_lookup_table
new_lookup = foi_lookup_table.drop_duplicates(subset = ["label_key", "question"])
merged_df = pd.merge(comp_val, new_lookup[['question', 'label_key']], on='question', how='left')

new_comp_val = comp_val.copy()

new_comp_val['label_key'] = merged_df['label_key_y']

desired_label_keys = ["DCIS Margins", "ER Status", "Extranodal Extension", "HER2 Status", "Insitu Component", "Invasive Carcinoma", "Invasive Carcinoma Margins", "Lymphovascular Invasion", "Necrosis", "PR Status", "Tumour Focality"]
sub_comp_val = new_comp_val[new_comp_val['label_key'].isin(desired_label_keys)]

# Create a new column 'mention' based on whether 'answer' is NaN or not
sub_comp_val['mention'] = sub_comp_val['answer'].apply(lambda x: 'No' if pd.isna(x) else 'Yes')

# Split DataFrame into parts based on conditions
invasive_carcinoma_condition = (sub_comp_val['label_key'] == 'Invasive Carcinoma') & (sub_comp_val['mention'] == 'Yes')
insitu_component_condition = (sub_comp_val['label_key'] == 'Insitu Component') & (sub_comp_val['mention'] == 'Yes')
# no_condition = (sub_comp_val['label_key'] == 'Extranodal Extension') & (sub_comp_val['mention'] == 'No')

# Select the % of rows you want to remove
invasive_to_remove = sub_comp_val[invasive_carcinoma_condition].sample(frac=0.7721)
insitu_to_remove = sub_comp_val[insitu_component_condition].sample(frac=0.6903)
# no_to_remove = sub_comp_val[no_condition].sample(frac=0.5)

# Create a mask that identifies the indices of the rows to keep
mask_to_keep = ~sub_comp_val.index.isin(invasive_to_remove.index) & \
               ~sub_comp_val.index.isin(insitu_to_remove.index) # & \
#               ~sub_comp_val.index.isin(no_to_remove.index)

# Concatenate the kept rows with the rest of the DataFrame
downsampled_sub_comp_val = sub_comp_val[mask_to_keep]

# Optionally reset the index
downsampled_sub_comp_val.reset_index(drop=True, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_comp_val['mention'] = sub_comp_val['answer'].apply(lambda x: 'No' if pd.isna(x) else 'Yes')


In [41]:
# save downsampled 11 to csv
downsampled_sub_comp_val.to_csv('../data/clean/non_synoptic/complete/sub_11/downsampled_sub_11_comp_val.csv', index=False)

# save non-downsampled 11 to csv 
sub_comp_val.reset_index(drop=True, inplace=True)
sub_comp_val.to_csv('../data/clean/non_synoptic/complete/sub_11/sub_11_comp_val.csv')