# Part 2 

## Sample 100 data points requiring the fewest images

To keep the data consistent, we sample the 70 data points that require the fewest images and concatenate them with the 30 data points filtered in the first part.

In [2]:
import json
from pathlib import Path

# Resolve the dataset directory relative to the parent of the current working directory.
root = Path.cwd().parent
dataset_path = root / 'dataset' / 'mimic_iv_cxr'
filtered_path = dataset_path / 'filtered_test_with_scope_preprocessed.json'

# Read the filtered test split into memory.
with filtered_path.open('r') as f:
    filtered_test_data = json.load(f)

In [3]:
# Number of records in the filtered test set.
len(filtered_test_data)

2295

### We need to sample 70 more data points and add them to the 30 data points we already have, for 100 in total.

#### Steps:
1. Remove the 30 data points from the filtered dataset
2. Sample 70 data points from the filtered dataset after removing the 30 data points
3. Group the data points by category and re-sort them by the number of required images
4. Re-concatenate the grouped data

In [4]:
# Load the 30 samples selected in the first part so they can be excluded
# from the candidate pool.
sampled_test_30 = dataset_path / 'sampled_test_with_scope_preprocessed_balenced_answer.json'
with open(sampled_test_30, 'r') as f:
    sampled_test_30_data = json.load(f)

sampled_test_30_data_ids = [d['id'] for d in sampled_test_30_data]

# Drop every record whose id already belongs to the 30 samples. Membership is
# checked against a set so the filter is one pass over the pool rather than a
# rescan of the id list for each record (assumes ids are hashable, e.g. int/str).
_sampled_test_30_id_lookup = set(sampled_test_30_data_ids)
left_filtered_test_data = [
    d for d in filtered_test_data if d['id'] not in _sampled_test_30_id_lookup
]

In [5]:
# Size of the remaining candidate pool next to the size of the full filtered set.
print(f"{len(left_filtered_test_data)} {len(filtered_test_data)}")

2265 2295


In [12]:
# Pick the additional samples with a fixed quota per scope (20 / 20 / 30), taking
# the records that need the fewest images first, then merge them with the 30
# samples that were already selected.
res = []
# Per-scope quotas, aligned index-by-index with `categories`. The misspelled name
# is kept because it is a notebook-level variable.
max_sapmles_all_categories = [20, 20, 30]
# Kept for compatibility; the selection below only needs the per-scope quota.
max_samples_all_answertype = [int(m) for m in max_sapmles_all_categories]
categories = ["IMAGE-SINGLE-1", "IMAGE-SINGLE-2", "MULTIMODAL-SINGLE"]

for scope, max_sapmles_per_category in zip(categories, max_sapmles_all_categories):
    # Candidates: not-yet-sampled records of this scope with a non-empty answer,
    # ordered by ascending image requirement. The sort is stable, so records with
    # the same requirement keep their original relative order.
    data_per_scope = sorted(
        (d for d in left_filtered_test_data if d['scope'] == scope and len(d['answer']) > 0),
        key=lambda x: x['num_required_images'],
    )
    # Single-value pool: exactly one answer and that answer is an int
    # (note that bool values also pass this isinstance check).
    data_per_scope_single_numeric_value = [
        d for d in data_per_scope
        if len(d['answer']) == 1 and isinstance(d['answer'][0], int)
    ]
    # List-value pool: more than one answer.
    data_per_scope_list_value = [d for d in data_per_scope if len(d['answer']) > 1]
    print(scope, len(data_per_scope_single_numeric_value), len(data_per_scope_list_value))

    # List-valued answers are preferred: fill the quota with them first and top
    # up with single-value answers only when there are not enough.
    num_list_value = min(max_sapmles_per_category, len(data_per_scope_list_value))
    num_single_value = max_sapmles_per_category - num_list_value
    # Fail loudly instead of silently producing fewer samples than the quota.
    assert num_single_value <= len(data_per_scope_single_numeric_value), (
        f"{scope}: only {num_list_value + len(data_per_scope_single_numeric_value)} "
        f"eligible records for a quota of {max_sapmles_per_category}"
    )
    print(scope, num_single_value, num_list_value)
    res.extend(data_per_scope_single_numeric_value[:num_single_value])
    res.extend(data_per_scope_list_value[:num_list_value])

# Add the 30 samples selected earlier.
res = res + sampled_test_30_data

# Sort by image requirement, then lay the records out scope by scope
# (records whose scope is not in `categories` are dropped here).
res = sorted(res, key=lambda x: x['num_required_images'])
res_grouped = {scope: [d for d in res if d['scope'] == scope] for scope in categories}

# Re-concatenate the groups in `categories` order.
res = [d for scope in categories for d in res_grouped[scope]]



IMAGE-SINGLE-1 395 89
IMAGE-SINGLE-1 0 20
IMAGE-SINGLE-2 225 119
IMAGE-SINGLE-2 0 20
MULTIMODAL-SINGLE 977 0
MULTIMODAL-SINGLE 30 0


In [13]:
# Order all samples by id, then bucket them per scope.
res = sorted(res, key=lambda sample: sample['id'])
res_grouped = {scope: [d for d in res if d['scope'] == scope] for scope in categories}

# Per-scope sample counts, in `categories` order.
print([len(res_grouped[scope]) for scope in categories])

# Flatten the buckets back into a single list, scope by scope.
res = [d for scope in categories for d in res_grouped[scope]]

[30, 30, 40]


In [14]:
# Write the combined sample list to disk as indented JSON.
sampled_test = dataset_path / 'sampled_test_with_scope_preprocessed_balenced_answer_100.json'
with sampled_test.open('w') as f:
    json.dump(res, f, indent=4)

In [1]:
import json
from pathlib import Path

# Resolve the dataset directory relative to the parent of the current working directory.
root = Path.cwd().parent
dataset_path = root / 'dataset' / 'mimic_iv_cxr'
sampled_test_100 = dataset_path / 'sampled_test_with_scope_preprocessed_balenced_answer_100.json'
sampled_test_30 = dataset_path / 'sampled_test_with_scope_preprocessed_balenced_answer.json'
sampled_test_70 = dataset_path / 'sampled_test_with_scope_preprocessed_balenced_answer_70.json'

# Load the combined sample file and the original 30-sample file.
with open(sampled_test_100, 'r') as f:
    sampled_test_100_data = json.load(f)
with open(sampled_test_30, 'r') as f:
    sampled_test_30_data = json.load(f)

idx_30 = [d['id'] for d in sampled_test_30_data]
idx_100 = [d['id'] for d in sampled_test_100_data]
# Ids present in the combined file but not in the 30-sample file. A set is used
# for the membership test so the id list is not rescanned for every record
# (assumes ids are hashable, e.g. int/str).
_idx_30_lookup = set(idx_30)
idx_70 = [d['id'] for d in sampled_test_100_data if d['id'] not in _idx_30_lookup]



In [4]:
# Keep the records whose id is in `idx_70` and save them as indented JSON.
sample_test_70_data = [record for record in sampled_test_100_data if record['id'] in idx_70]
with sampled_test_70.open('w') as f:
    json.dump(sample_test_70_data, f, indent=4)
