In [5]:
import json
import pandas as pd
from pathlib import Path

# load the test data
root_path = Path.cwd()
data_path = root_path / "dataset"
train_file = data_path / "mimic_iv_cxr"/ "train.json"
test_file = data_path / "mimic_iv_cxr"/ "sampled_test_with_scope_preprocessed_balenced_answer_100.json"

train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)

In [7]:
train_df.shape, test_df.shape

((36174, 15), (100, 18))

In [14]:
test_df.head()

Unnamed: 0,db_id,split,id,question,template,query,value,q_tag,t_tag,o_tag,v_tag,tag,para_type,is_impossible,answer,scope,tables,num_required_images
0,mimic_iv_cxr,test,5,given the last study of patient 10284038 in 21...,given the last study of patient 10284038 in 21...,"select func_vqa(""is the cardiac silhouette's w...",{'patient_id': 10284038},given the [time_filter_exact1] study of patien...,"[abs-year-in, , , exact-last, ]",{},"{'object': [], 'category': [], 'attribute': []}",given the [time_filter_exact1:exact-last] stud...,machine,False,[0],IMAGE-SINGLE-1,[tb_cxr],1
1,mimic_iv_cxr,test,21,given the last study of patient 19243401 this ...,given the last study of patient 19243401 this ...,"select (func_vqa(""can you confirm the presence...",{'patient_id': 19243401},given the [time_filter_exact1] study of patien...,"[rel-year-this, , , exact-last, ]",{},"{'object': [], 'category': ['anatomicalfinding...",given the [time_filter_exact1:exact-last] stud...,machine,False,[1],IMAGE-SINGLE-1,[tb_cxr],1
2,mimic_iv_cxr,test,49,given the last study of patient 15491652 in 06...,given the last study of patient 15491652 in 06...,"select (func_vqa(""can you identify any abnorma...",{'patient_id': 15491652},given the [time_filter_exact1] study of patien...,"[abs-month-in, , , exact-last, ]",{},"{'object': ['abdomen', 'right chest wall'], 'c...",given the [time_filter_exact1:exact-last] stud...,machine,False,[1],IMAGE-SINGLE-1,[tb_cxr],1
3,mimic_iv_cxr,test,66,have any tubes/lines related to both the abdom...,given the first study of patient 19243401 on 1...,"select (func_vqa(""have any tubes/lines related...",{'patient_id': 19243401},given the [time_filter_exact1] study of patien...,"[mix-month-this_year, , , exact-first, ]",{},"{'object': ['abdomen', 'right lung'], 'categor...",given the [time_filter_exact1:exact-first] stu...,machine,False,[0],IMAGE-SINGLE-1,[tb_cxr],1
4,mimic_iv_cxr,test,71,"given the study 55277734, can you find either ...","given the study 55277734, is there either airs...","select (func_vqa(""can you find airspace opacit...",{'study_id': 55277734},"given the study {study_id}, is there either ${...","[, , , , ]",{},"{'object': [], 'category': [], 'attribute': ['...","given the study {study_id}, is there either ai...",machine,False,[0],IMAGE-SINGLE-1,[tb_cxr],1


In [30]:
# get all categories in the test_df
all_scopes = test_df["scope"].unique().tolist()
scope2templates = {scope: [] for scope in all_scopes}
for _, row in test_df.iterrows():
    if row["q_tag"] not in scope2templates[row["scope"]]:
        scope2templates[row["scope"]].append(row["q_tag"])

# revert keya and values
template2scopes = {}
for scope, templates in scope2templates.items():
    for template in templates:
        if template not in template2scopes:
            template2scopes[template] = []
        template2scopes[template].append(scope)
assert len([v for v in template2scopes.values() if len(v) > 1]) == 0
for k, v in template2scopes.items():
    template2scopes[k] = v[0]

In [35]:
# filter out the train_df if the row's q_tag is in the template2scopes
train_df_filtered = train_df[train_df["q_tag"].isin(template2scopes.keys())]
train_df_filtered.shape
# add q_tag to the train_df_filtered
train_df_filtered["scope"] = train_df_filtered["q_tag"].apply(lambda x: template2scopes[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_filtered["scope"] = train_df_filtered["q_tag"].apply(lambda x: template2scopes[x])


In [36]:
train_df_filtered.shape

(7224, 16)

In [40]:
train_df_filtered.to_json(data_path / "mimic_iv_cxr" / "train_filtered.json", orient="records", indent=2) 

In [41]:
template2scopes

{'given the [time_filter_exact1] study of patient {patient_id} [time_filter_global1], is the width of the cardiac silhouette wider than 1/2 of the thorax width?': 'IMAGE-SINGLE-1',
 'given the [time_filter_exact1] study of patient {patient_id} [time_filter_global1], are there any ${category_1} or ${category_2}?': 'IMAGE-SINGLE-1',
 'given the [time_filter_exact1] study of patient {patient_id} [time_filter_global1], are there any abnormality in either the ${object_1} or the ${object_2}?': 'IMAGE-SINGLE-1',
 'given the [time_filter_exact1] study of patient {patient_id} [time_filter_global1], are there any ${category} related to the ${object_1} and the ${object_2}?': 'IMAGE-SINGLE-1',
 'given the study {study_id}, is there either ${attribute_1} or ${attribute_2}?': 'IMAGE-SINGLE-1',
 'given the [time_filter_exact1] study of patient {patient_id} [time_filter_global1], list all anatomical locations related to any ${category}.': 'IMAGE-SINGLE-1',
 'given the study {study_id}, list all abnorm

In [80]:
# let's extract the keywords from the q_tag which are rounded by {} or []
import re
def extract_keywords(q_tag):
    keywords = re.findall(r'\{(.+?)\}', q_tag)
    keywords += re.findall(r'\[(.+?)\]', q_tag)
    return sorted(list(set(keywords)))

train_df_filtered["keywords"] = train_df_filtered["q_tag"].apply(extract_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_filtered["keywords"] = train_df_filtered["q_tag"].apply(extract_keywords)


In [50]:
train_df_filtered["keywords"]

3        [category, diagnosis_name, patient_id, time_fi...
4        [category, diagnosis_name, object, patient_id,...
10       [patient_id, procedure_name, time_filter_global1]
25            [category, comparison, study_id1, study_id2]
28       [category, object, patient_id, procedure_name,...
                               ...                        
36134    [diagnosis_name, object, patient_id, time_filt...
36147    [category, patient_id, time_filter_exact1, tim...
36148    [patient_id, procedure_name, time_filter_globa...
36152    [diagnosis_name, object, patient_id, time_filt...
36171    [category, comparison, patient_id, time_filter...
Name: keywords, Length: 7224, dtype: object

In [61]:
scope2keywords = {scope: [] for scope in all_scopes}
for k, v in scope2templates.items():
    for q_tag in v:
        keyword = extract_keywords(q_tag)
        if keyword not in scope2keywords[k]:
            scope2keywords[k].append(keyword)

In [62]:
scope2keywords

{'IMAGE-SINGLE-1': [['patient_id',
   'time_filter_exact1',
   'time_filter_global1'],
  ['category_1',
   'category_2',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1'],
  ['object_1',
   'object_2',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1'],
  ['category',
   'object_1',
   'object_2',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1'],
  ['attribute_1', 'attribute_2', 'study_id'],
  ['category', 'patient_id', 'time_filter_exact1', 'time_filter_global1'],
  ['study_id'],
  ['category', 'study_id'],
  ['attribute',
   'object_1',
   'object_2',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1'],
  ['object_1', 'object_2', 'study_id'],
  ['object', 'patient_id', 'time_filter_exact1', 'time_filter_global1']],
 'IMAGE-SINGLE-2': [['attribute', 'comparison', 'study_id1', 'study_id2'],
  ['comparison', 'study_id1'],
  ['category',
   'comparison',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1

In [77]:
# find 3 longest keywords in each scope
scope2typic_keywords = {scope: [] for scope in all_scopes}
for scope, keywords in scope2keywords.items():
    keywords = sorted(keywords, key=lambda x: len(x), reverse=True)
    scope2typic_keywords[scope] = keywords[-3:-1]+keywords[:3]


In [78]:
scope2typic_keywords

{'IMAGE-SINGLE-1': [['object_1', 'object_2', 'study_id'],
  ['category', 'study_id'],
  ['category',
   'object_1',
   'object_2',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1'],
  ['attribute',
   'object_1',
   'object_2',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1'],
  ['category_1',
   'category_2',
   'patient_id',
   'time_filter_exact1',
   'time_filter_global1']],
 'IMAGE-SINGLE-2': [['attribute', 'comparison', 'study_id1'],
  ['comparison', 'study_id1', 'study_id2'],
  ['attribute',
   'comparison',
   'object',
   'patient_id',
   'time_filter_exact1',
   'time_filter_exact2',
   'time_filter_global1',
   'time_filter_global2'],
  ['category',
   'comparison',
   'patient_id',
   'time_filter_exact1',
   'time_filter_exact2',
   'time_filter_global1',
   'time_filter_global2'],
  ['comparison',
   'patient_id',
   'time_filter_exact1',
   'time_filter_exact2',
   'time_filter_global1',
   'time_filter_global2']],
 'MULTIMODAL-SINGLE

In [97]:
# create scope2typic_keywords_str to store the string of keywords list 
scope2typic_keywords_str = {scope: [] for scope in all_scopes}
for scope, keywords in scope2typic_keywords.items():
    scope2typic_keywords_str[scope] = [", ".join(keyword) for keyword in keywords]

scope2typic_keywords_str

{'IMAGE-SINGLE-1': ['object_1, object_2, study_id',
  'category, study_id',
  'category, object_1, object_2, patient_id, time_filter_exact1, time_filter_global1',
  'attribute, object_1, object_2, patient_id, time_filter_exact1, time_filter_global1',
  'category_1, category_2, patient_id, time_filter_exact1, time_filter_global1'],
 'IMAGE-SINGLE-2': ['attribute, comparison, study_id1',
  'comparison, study_id1, study_id2',
  'attribute, comparison, object, patient_id, time_filter_exact1, time_filter_exact2, time_filter_global1, time_filter_global2',
  'category, comparison, patient_id, time_filter_exact1, time_filter_exact2, time_filter_global1, time_filter_global2',
  'comparison, patient_id, time_filter_exact1, time_filter_exact2, time_filter_global1, time_filter_global2'],
 'MULTIMODAL-SINGLE': ['diagnosis_name, patient_id, time_filter_global1',
  'patient_id, procedure_name, time_filter_global1',
  'category, drug_name, object, patient_id, time_filter_global1, time_filter_within',


In [96]:
scope2typic_keywords.items()

dict_items([('IMAGE-SINGLE-1', [['object_1', 'object_2', 'study_id'], ['category', 'study_id'], ['category', 'object_1', 'object_2', 'patient_id', 'time_filter_exact1', 'time_filter_global1'], ['attribute', 'object_1', 'object_2', 'patient_id', 'time_filter_exact1', 'time_filter_global1'], ['category_1', 'category_2', 'patient_id', 'time_filter_exact1', 'time_filter_global1']]), ('IMAGE-SINGLE-2', [['attribute', 'comparison', 'study_id1'], ['comparison', 'study_id1', 'study_id2'], ['attribute', 'comparison', 'object', 'patient_id', 'time_filter_exact1', 'time_filter_exact2', 'time_filter_global1', 'time_filter_global2'], ['category', 'comparison', 'patient_id', 'time_filter_exact1', 'time_filter_exact2', 'time_filter_global1', 'time_filter_global2'], ['comparison', 'patient_id', 'time_filter_exact1', 'time_filter_exact2', 'time_filter_global1', 'time_filter_global2']]), ('MULTIMODAL-SINGLE', [['diagnosis_name', 'patient_id', 'time_filter_global1'], ['patient_id', 'procedure_name', 'tim

In [94]:
scope2typic_keywords_str

{'IMAGE-SINGLE-1': 'h, a, s, _, v, e, r, b,  , p, a, t, i, e, n, t,  , {, p, a, t, i, e, n, t, _, i, d, },  , h, a, d,  , a,  , c, h, e, s, t,  , x, -, r, a, y,  , s, t, u, d, y,  , i, n, d, i, c, a, t, i, n, g,  , $, {, a, t, t, r, i, b, u, t, e, },  , [, t, i, m, e, _, f, i, l, t, e, r, _, w, i, t, h, i, n, ],  , a, f, t, e, r,  , h, a, v, i, n, g,  , b, e, e, n,  , d, i, a, g, n, o, s, e, d,  , w, i, t, h,  , {, d, i, a, g, n, o, s, i, s, _, n, a, m, e, },  , [, t, i, m, e, _, f, i, l, t, e, r, _, g, l, o, b, a, l, 1, ], ?',
 'IMAGE-SINGLE-2': 'h, a, s, _, v, e, r, b,  , p, a, t, i, e, n, t,  , {, p, a, t, i, e, n, t, _, i, d, },  , h, a, d,  , a,  , c, h, e, s, t,  , x, -, r, a, y,  , s, t, u, d, y,  , i, n, d, i, c, a, t, i, n, g,  , $, {, a, t, t, r, i, b, u, t, e, },  , [, t, i, m, e, _, f, i, l, t, e, r, _, w, i, t, h, i, n, ],  , a, f, t, e, r,  , h, a, v, i, n, g,  , b, e, e, n,  , d, i, a, g, n, o, s, e, d,  , w, i, t, h,  , {, d, i, a, g, n, o, s, i, s, _, n, a, m, e, },  ,

In [107]:
# sample the data the train_df_filtered by the scope2typic_keywords
train_df_filtered["keywords_str"] = train_df_filtered["keywords"].apply(lambda x: ", ".join(x))
train_df_sampled = pd.DataFrame()
for scope, keywords in scope2typic_keywords_str.items():
    for keyword in keywords:
        _df = train_df_filtered[train_df_filtered["keywords_str"] == keyword]
        _df = _df.sample(1, random_state=43)
        train_df_sampled = pd.concat([train_df_sampled, _df])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_filtered["keywords_str"] = train_df_filtered["keywords"].apply(lambda x: ", ".join(x))


In [110]:
from preprocess.utils import sql_parser
def clean_query(q):
    res = q.replace("%y", "%Y").replace(
        "current_time", "strftime('2105-12-31 23:59:00')"
    )
    return res

train_df_sampled["query"] = train_df_sampled["query"].apply(clean_query)
train_df_sampled["tables"] = train_df_sampled["query"].apply(lambda x: sql_parser(x).tables)

In [None]:
# train_df_sampled = train_df_sampled.drop(columns=["keywords_str", "keywords"])
# train_df_sampled.to_json(data_path / "mimic_iv_cxr" / "few_shot_sampled.json", orient="records", indent=2)

In [None]:
# train_df_sampled.to_csv(data_path / "mimic_iv_cxr" / "few_shot_sampled.csv", index=False)