In [1]:
import json
from collections import defaultdict

In [2]:
# Load the NEJM case records: the file holds one JSON object per line (JSONL).
# The encoding is pinned to UTF-8 because the records contain non-ASCII
# punctuation (e.g. curly quotes) that a platform-default codec may not decode.
with open('./NEJM_case_test.jsonl', 'r', encoding='utf-8') as jsonl_file:
    # Blank lines (such as a stray empty line at the end of the file) are
    # skipped, because json.loads raises on an empty string.
    data_list = [json.loads(line) for line in jsonl_file if line.strip()]

# Preview only: report how many records were loaded and show the first one,
# instead of dumping every record into the cell output.
print("Loaded {} cases".format(len(data_list)))
if data_list:
    print(data_list[0])

{'source': 'NEJM case records of the massachusetts general hospital', 'year': '2021', 'case_num': '40', 'id': 'nejm-case-2021-40', 'title': 'A 9-Year-Old Boy with Transient Weakness, Facial Droop, and Slurred Speech', 'presentation_of_case': 'Dr. Sagar M. Raju (Pediatrics): A 9-year-old boy was transferred to this hospital because of transient weakness and facial droop on the left side and slurred speech. The patient had been well until 5 hours before this evaluation, when his parents observed that he had a facial droop on the left side and slurred speech. In addition, the patient noticed that his left arm and leg felt “limp.” When he moved to get out of his bed, he could not stand on his own or grip using the left hand. There was no confusion, loss of consciousness, shaking in the arms or legs, urinary or fecal incontinence, headache, nausea, vomiting, recent illness, or trauma. The symptoms lasted approximately 2 minutes and then resolved completely. Emergency medical services were c

### Presentation of Case (poc)

In [5]:
## presentation of case
# Cases whose 'presentation_of_case' is missing (None) or blank.
cases_without_poc = [
    data for data in data_list
    if data['presentation_of_case'] is None
    or data['presentation_of_case'].strip() == ""
]

# Year of each such case, read from the record's 'year' field.
# The case id has the form 'nejm-case-<year>-<number>', so its last four
# characters (e.g. '1-40') are never all digits; slicing the id therefore
# dropped every case from the tally.
years_no_poc = []
for case in cases_without_poc:
    year_no_poc = case['year']
    if year_no_poc.isdigit():
        years_no_poc.append(int(year_no_poc))

# Number of cases without a presentation of case, per year.
year_count_no_poc = defaultdict(int)
for year in years_no_poc:
    year_count_no_poc[year] += 1

# Report the counts from the most recent year to the oldest.
for year in sorted(year_count_no_poc, reverse=True):
    print("Year: {} Number of cases without presentation of case: {}".format(year, year_count_no_poc[year]))

### Differential diagnosis (dd)

In [6]:
## differential diagnosis
# Cases whose 'differential_diagnosis' container is empty.
cases_without_dd = [
    record for record in data_list
    if len(record['differential_diagnosis']) == 0
]

# Year of each such case, converted from the record's 'year' field.
years_no_dd = [int(record['year']) for record in cases_without_dd]

# Number of cases without a differential diagnosis, per year.
year_count_no_dd = defaultdict(int)
for yr in years_no_dd:
    year_count_no_dd[yr] += 1

# Report the counts from the most recent year to the oldest.
for yr in sorted(year_count_no_dd, reverse=True):
    print("Year: {} Number of cases without differential diagnosis: {}".format(yr,year_count_no_dd[yr]))

Year: 2022 Number of cases without differential diagnosis: 1
Year: 2021 Number of cases without differential diagnosis: 3
Year: 2020 Number of cases without differential diagnosis: 9
Year: 2019 Number of cases without differential diagnosis: 7
Year: 2018 Number of cases without differential diagnosis: 5
Year: 2017 Number of cases without differential diagnosis: 4
Year: 2016 Number of cases without differential diagnosis: 5
Year: 2015 Number of cases without differential diagnosis: 4
Year: 2014 Number of cases without differential diagnosis: 4
Year: 2013 Number of cases without differential diagnosis: 6
Year: 2012 Number of cases without differential diagnosis: 6
Year: 2011 Number of cases without differential diagnosis: 10
Year: 2010 Number of cases without differential diagnosis: 4
Year: 2009 Number of cases without differential diagnosis: 9
Year: 2008 Number of cases without differential diagnosis: 12


In [16]:
## list the ids of the cases without a differential diagnosis
missing_dd_ids = [case['id'] for case in cases_without_dd]
for case_id in missing_dd_ids:
    print(case_id)

## save those ids to a text file, one id per line
with open('./case_id_without_dd.txt', 'w') as txt_file:
    txt_file.writelines(case_id + '\n' for case_id in missing_dd_ids)

## save the full records to a separate jsonl file, one JSON object per line
with open('./cases_without_dd.jsonl', 'w') as jsonl_file:
    for case in cases_without_dd:
        jsonl_file.write(json.dumps(case) + '\n')
    

nejm-case-2009-29
nejm-case-2017-37
nejm-case-2017-21
nejm-case-2018-12
nejm-case-2012-22
nejm-case-2009-25
nejm-case-2011-12
nejm-case-2020-20
nejm-case-2014-11
nejm-case-2018-32
nejm-case-2008-24
nejm-case-2013-02
nejm-case-2011-29
nejm-case-2013-14
nejm-case-2012-39
nejm-case-2011-05
nejm-case-2020-21
nejm-case-2011-13
nejm-case-2020-17
nejm-case-2008-05
nejm-case-2009-28
nejm-case-2012-12
nejm-case-2019-35
nejm-case-2021-27
nejm-case-2009-19
nejm-case-2010-15
nejm-case-2019-39
nejm-case-2008-14
nejm-case-2009-15
nejm-case-2008-38
nejm-case-2016-26
nejm-case-2018-38
nejm-case-2013-08
nejm-case-2009-02
nejm-case-2009-14
nejm-case-2019-14
nejm-case-2020-07
nejm-case-2011-23
nejm-case-2008-15
nejm-case-2010-34
nejm-case-2008-03
nejm-case-2008-23
nejm-case-2018-35
nejm-case-2021-26
nejm-case-2015-17
nejm-case-2008-19
nejm-case-2013-28
nejm-case-2022-35
nejm-case-2011-16
nejm-case-2011-20
nejm-case-2014-35
nejm-case-2017-33
nejm-case-2008-01
nejm-case-2020-29
nejm-case-2017-24
nejm-case-

### Final diagnosis (fd)

In [5]:
# Cases whose 'final_diagnosis' text is empty once surrounding whitespace is removed.
cases_without_finaldiagnosis = [
    record for record in data_list
    if not record['final_diagnosis'].strip()
]

# Year of each such case, read from the record's 'year' field; records whose
# year is not purely numeric are left out of the tally.
years_no_fd = [
    int(record['year'])
    for record in cases_without_finaldiagnosis
    if record['year'].isdigit()
]

# Number of cases without a final diagnosis, per year.
year_count = defaultdict(int)
for yr in years_no_fd:
    year_count[yr] += 1

# Report the counts from the most recent year to the oldest.
for yr in sorted(year_count, reverse=True):
    print("Year: {} Number of cases without final diagnosis: {}".format(yr, year_count[yr]))


Year: 2022 Number of cases without final diagnosis: 2
Year: 2021 Number of cases without final diagnosis: 3
Year: 2020 Number of cases without final diagnosis: 7
Year: 2019 Number of cases without final diagnosis: 16
Year: 2018 Number of cases without final diagnosis: 14
Year: 2017 Number of cases without final diagnosis: 20
Year: 2016 Number of cases without final diagnosis: 18
Year: 2015 Number of cases without final diagnosis: 20
Year: 2014 Number of cases without final diagnosis: 22
Year: 2013 Number of cases without final diagnosis: 33
Year: 2012 Number of cases without final diagnosis: 31
Year: 2011 Number of cases without final diagnosis: 36
Year: 2010 Number of cases without final diagnosis: 38
Year: 2009 Number of cases without final diagnosis: 40
Year: 2008 Number of cases without final diagnosis: 40


### Dr's diagnosis - serves as an alternative to the final diagnosis

In [6]:
# Cases whose 'drs_diagnosis' text is empty once surrounding whitespace is removed.
cases_without_drd = [
    record for record in data_list
    if not record['drs_diagnosis'].strip()
]

# Year of each such case, read from the record's 'year' field; records whose
# year is not purely numeric are left out of the tally.
years_no_drd = [
    int(record['year'])
    for record in cases_without_drd
    if record['year'].isdigit()
]

# Number of cases without a doctor's diagnosis, per year.
year_count_no_drd = defaultdict(int)
for yr in years_no_drd:
    year_count_no_drd[yr] += 1

# Report the counts from the most recent year to the oldest.
for yr in sorted(year_count_no_drd, reverse=True):
    print("Year: {} Number of cases without alternative diagnosis options: {}".format(yr, year_count_no_drd[yr]))


Year: 2023 Number of cases without alternative diagnosis options: 2
Year: 2022 Number of cases without alternative diagnosis options: 7
Year: 2021 Number of cases without alternative diagnosis options: 9
Year: 2020 Number of cases without alternative diagnosis options: 13
Year: 2019 Number of cases without alternative diagnosis options: 6
Year: 2018 Number of cases without alternative diagnosis options: 3
Year: 2017 Number of cases without alternative diagnosis options: 7
Year: 2016 Number of cases without alternative diagnosis options: 9
Year: 2015 Number of cases without alternative diagnosis options: 5
Year: 2014 Number of cases without alternative diagnosis options: 5
Year: 2013 Number of cases without alternative diagnosis options: 7
Year: 2012 Number of cases without alternative diagnosis options: 5
Year: 2011 Number of cases without alternative diagnosis options: 6
Year: 2010 Number of cases without alternative diagnosis options: 6
Year: 2009 Number of cases without alternative 

## combined diagnosis outcome

In [7]:
# Cases without a combined final diagnosis. The text is stripped before the
# emptiness test so that this cell agrees with the usable-case filter, which
# also treats a whitespace-only 'final_diagnosis_comb' as missing.
cases_without_cd = [
    data for data in data_list
    if len(data['final_diagnosis_comb'].strip()) == 0
]

# Year of each such case, read from the record's 'year' field; records whose
# year is not purely numeric are left out of the tally.
years_no_cd = []
for case in cases_without_cd:
    year_no_cd = case['year']
    if year_no_cd.isdigit():
        years_no_cd.append(int(year_no_cd))

# Number of cases without a combined final diagnosis, per year.
year_count_no_cd = defaultdict(int)
for year in years_no_cd:
    year_count_no_cd[year] += 1

# Always show a row for 2023, even when it has no such case. setdefault only
# inserts the zero when the year is absent, so a real 2023 count is never
# overwritten (a plain assignment would reset it to 0).
year_count_no_cd.setdefault(2023, 0)

# Report the counts from the most recent year to the oldest. The label names
# the combined final diagnosis; it previously repeated the wording of the
# doctor's-diagnosis report.
for year in sorted(year_count_no_cd, reverse=True):
    print("Year: {} Number of cases without combined final diagnosis: {}".format(year, year_count_no_cd[year]))


Year: 2023 Number of cases without alternative diagnosis options: 0
Year: 2022 Number of cases without alternative diagnosis options: 1
Year: 2020 Number of cases without alternative diagnosis options: 2
Year: 2019 Number of cases without alternative diagnosis options: 1
Year: 2018 Number of cases without alternative diagnosis options: 1
Year: 2017 Number of cases without alternative diagnosis options: 3
Year: 2016 Number of cases without alternative diagnosis options: 5
Year: 2015 Number of cases without alternative diagnosis options: 3
Year: 2014 Number of cases without alternative diagnosis options: 3
Year: 2013 Number of cases without alternative diagnosis options: 6
Year: 2012 Number of cases without alternative diagnosis options: 4
Year: 2011 Number of cases without alternative diagnosis options: 6
Year: 2010 Number of cases without alternative diagnosis options: 5
Year: 2009 Number of cases without alternative diagnosis options: 12
Year: 2008 Number of cases without alternative 

## Usable cases

### Usable cases 1 - differential diagnosis

In [8]:
## usable cases
# A case is dd-usable when it has a non-empty differential diagnosis and a
# non-blank presentation of case. Cases are taken in order of case id.
dd_usable_cases = [
    record
    for record in sorted(data_list, key=lambda rec: rec['id'])
    if len(record['differential_diagnosis']) > 0
    and len(record['presentation_of_case'].strip()) > 0
]

## save the usable cases to a jsonl file, one JSON object per line
with open('./NEJM_case_test_differential_diagnosis.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in dd_usable_cases)

len(dd_usable_cases)

552

In [9]:
## manual review
# Ids of the cases excluded from the differential-diagnosis set; the
# filtering cell skips any case whose id appears in this list.
dd_useless_case_id_manual_check = [
    'nejm-case-2009-10',
    'nejm-case-2009-31',
    'nejm-case-2009-35',
    'nejm-case-2010-18',
    'nejm-case-2010-21',
    'nejm-case-2010-25',
    'nejm-case-2011-04',
    'nejm-case-2011-08',
    'nejm-case-2011-26',
    'nejm-case-2012-08',
    'nejm-case-2012-09',
    'nejm-case-2012-30',
    'nejm-case-2013-05',
    'nejm-case-2013-10',
    'nejm-case-2013-12',
    'nejm-case-2014-10',
    'nejm-case-2014-14',
    'nejm-case-2014-26',
    'nejm-case-2014-27',
    'nejm-case-2014-29',
    'nejm-case-2014-36',
    'nejm-case-2014-40',
    'nejm-case-2015-01',
    'nejm-case-2015-04',
    'nejm-case-2015-21',
    'nejm-case-2015-30',
    'nejm-case-2015-32',
    'nejm-case-2015-40',
    'nejm-case-2016-15',
    'nejm-case-2016-22',
    'nejm-case-2016-29',
    'nejm-case-2017-04',
    'nejm-case-2017-10',
    'nejm-case-2017-11',
    'nejm-case-2017-12',
    'nejm-case-2017-18',
    'nejm-case-2017-28',
    'nejm-case-2019-37',
    'nejm-case-2020-04',
    'nejm-case-2020-16',
    'nejm-case-2020-28',
    'nejm-case-2021-17',
    'nejm-case-2021-19',
    'nejm-case-2021-24',
    'nejm-case-2022-18',
    'nejm-case-2023-03',
    'nejm-case-2023-34',
]


# Keys of a case's 'differential_diagnosis' mapping that are to be discarded:
# the filtering cell deletes every key listed here from each kept case.
# Judging by the variable name, the list was compiled by manual review.
# Matching is exact and case-sensitive, so 'Diagnostic testing' and
# 'Diagnostic Testing' are two distinct entries.
# NOTE: 'Summary', 'Important Features of the Case' and
# 'Narrowing the Differential Diagnosis' each appear twice; the repeats are
# harmless because a key is only deleted when it is still present.
useless_keys_in_dd_manual_check = [
    'Summary',
    'Treatment for this patient',
    'Differential diagnosis of abdominal findings',
    'Other Features of This Case',
    'Summary and conclusions',
    'Sources of rectal bleeding',
    'Surgical Approach',
    'Resuscitation',
    'Diagnostic Studies',
    'Critical Care during Transport',
    'Abnormal laboratory-test results',
    'Approach to Diagnosis and Initial Therapy',
    'Duration of kidney insufficiency',
    'Important Features of the Case',
    'Approach to diagnosis',
    'Diagnostic procedure',
    'Conclusions',
    'Clinical Approach to Confusional States',
    'Summary',
    'Evaluation of Infertility in Women',
    'Prevention of Complications of Radiation Therapy',
    'Assessment of altered mental status',
    'Microbiology',
    'A',
    'B',
    'Measurement Errors',
    'Important Features of the Case',
    'Electrodiagnostic Testing',
    'Neuroimaging Studies',
    'Diagnostic considerations',
    'Syndromic Diagnosis',
    'Diagnostic testing',
    'Loose ends',
    'Location of the process',
    'The cause of the process',
    'The problem list',
    'Initial management and transport',
    'Management and Prevention',
    'Clues to the diagnosis',
    'Medical History',
    'Clinical Examination',
    'Summary of the Present Illness',
    'History and Epidemiology',
    'The Differential Diagnosis',
    'Using a Global Lens',
    'Symptoms',
    'Epidemiologic Clues',
    'Diagnostic Testing',
    'Abnormal Laboratory Findings',
    '“For” and “Against” Lists',
    'Diagnostic Approach',
    'Findings on Imaging Studies',
    'Clues from the Case History and Laboratory Data',
    'Recommended Testing',
    'Anatomical Localization',
    'Additional Diagnostic Considerations',
    'Is the Mother’s Story Relevant?',
    'Examining Primary Data',
    'Diagnosis and Management',
    'First Episode',
    'Second Episode',
    'History',
    'Laboratory Test Results',
    'Narrowing the Differential Diagnosis',
    'Next Steps',
    'Imaging Studies and Biopsy',
    'Clinical Impression',
    'Findings on MRI',
    'Time Course of Symptoms',
    'Patient History',
    'Diagnostic Challenges',
    'Evaluation of First Seizure',
    'Making a Diagnosis',
    'Additional Information Obtained after the First Admission',
    'Reframing the Initial Presentation',
    'Narrowing the Differential Diagnosis',
    'CSF Profile',
    'Travel',
    'Immune Status',
    'Neuroimaging',
    'Review of Chest Imaging',
    'Background',
    'Midground',
    'Foreground',
    'Putting It All Together',
    'Refining the Differential Diagnosis'
    ]

In [10]:
## drop the manually excluded cases and remove the useless keys from the rest
# Sets give constant-time membership tests and absorb the duplicate entries
# in the manual-check lists.
excluded_case_ids = set(dd_useless_case_id_manual_check)
useless_dd_keys = set(useless_keys_in_dd_manual_check)

dd_usable_filtered_cases = []

for case in dd_usable_cases:
    if case['id'] in excluded_case_ids:
        continue

    # Build a filtered copy instead of deleting keys in place. The records in
    # dd_usable_cases are the same dict objects as those in data_list, so an
    # in-place delete would also strip these sections from every later cell
    # that reads data_list, and would make this cell depend on how often it
    # has been run.
    filtered_case = dict(case)
    filtered_case['differential_diagnosis'] = {
        key: value
        for key, value in case['differential_diagnosis'].items()
        if key not in useless_dd_keys
    }
    # NOTE(review): a case whose keys are all in the removal list is still
    # kept, with an empty 'differential_diagnosis' - confirm this is intended.
    dd_usable_filtered_cases.append(filtered_case)

len(dd_usable_filtered_cases)
            

504

In [11]:
## save the filtered cases to a jsonl file, one JSON object per line
with open('./NEJM_case_test_differential_diagnosis_filtered.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(filtered_case) + '\n' for filtered_case in dd_usable_filtered_cases
    )

### Usable cases 2 - final diagnosis combined

In [12]:
## usable cases
# A case is fd-usable when both its combined final diagnosis and its
# presentation of case are non-blank. Cases are taken in order of case id.
fd_usable_cases = [
    record
    for record in sorted(data_list, key=lambda rec: rec['id'])
    if len(record['final_diagnosis_comb'].strip()) > 0
    and len(record['presentation_of_case'].strip()) > 0
]

574

### Manual review of the combined final diagnosis
**Problematic cases and their revisions**
* `nejm-case-2008-01` with title `A 45-Year-Old Man with Sudden Onset of Abdominal Pain and Hypotension`
    * final diagnosis comb should be : `Ruptured pancreaticoduodenal-artery aneurysm` 
* `nejm-case-2008-37` with title `A 17-Year-Old Boy with a Pituitary Tumor and Skull Abnormalities`
    * final diagnosis comb should be : `Prolactinoma, probably invasive; rule out a primary osseous lesion` 
* `nejm-case-2010-04` with title `A 53-Year-Old Man with Arthralgias, Oral Ulcers, Vision Loss, and Vocal-Cord Paralysis`
    * final diagnosis comb should be : `Giant-cell arteritis with central retinal artery occlusion. Drug-induced lupus erythematosus. ` 
* `nejm-case-2010-13` with title `An 18.5-Month-Old Girl with Watery Diarrhea and Poor Weight Gain`
    * final diagnosis comb should be : `Paraneoplastic diarrhea, probably resulting from catecholamines and VIP secreted from a ganglio-neuroblastoma or ganglioneuroma.` 
* `nejm-case-2015-37` with title `A 76-Year-Old Man with Fevers, Leukopenia, and Pulmonary Infiltrates`
    * final diagnosis comb should be : `Disseminated Mycobacterium bovis infection`
* `nejm-case-2016-11` with title `A 12-Year-Old Boy with Malaise, Fevers, Abdominal Pain, and Pallor`
    * final diagnosis comb should be : `Combined anemia of chronic disease and iron deficiency in a patient with inflammatory bowel disease.` 
* `nejm-case-2021-07` with title `A 19-Year-Old Man with Shock, Multiple Organ Failure, and Rash`
    * final diagnosis comb should be : `Meningococcal purpura fulminans.`

In [61]:
# manually revised combined final diagnoses, keyed by case id
revised_final_diagnosis_comb = {
    'nejm-case-2008-01': 'Ruptured pancreaticoduodenal-artery aneurysm',
    'nejm-case-2008-37': 'Prolactinoma, probably invasive; rule out a primary osseous lesion',
    'nejm-case-2010-04': 'Giant-cell arteritis with central retinal artery occlusion. Drug-induced lupus erythematosus.',
    'nejm-case-2010-13': 'Paraneoplastic diarrhea, probably resulting from catecholamines and VIP secreted from a ganglio-neuroblastoma or ganglioneuroma.',
    'nejm-case-2015-37': 'Disseminated Mycobacterium bovis infection',
    'nejm-case-2016-11': 'Combined anemia of chronic disease and iron deficiency in a patient with inflammatory bowel disease.',
    'nejm-case-2021-07': 'Meningococcal purpura fulminans.',
}

# overwrite the diagnosis of every usable case that has a revision; the case
# dicts are updated in place
for case in fd_usable_cases:
    if case['id'] in revised_final_diagnosis_comb:
        case['final_diagnosis_comb'] = revised_final_diagnosis_comb[case['id']]
    

In [66]:
## save the usable cases to a jsonl file, one JSON object per line
with open('./NEJM_case_test_final_diagnosis_combined.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in fd_usable_cases)

len(fd_usable_cases)

574