In [1]:
import json
import time
import sys

import pandas as pd
import requests
import tqdm

sys.path.insert(0, '../../')
import map_modifiers

In [2]:
def get_c2q_mapping(string):
    post_data = {
        'inc': string,
        'exc': '',
        'initialevent': '',
        'rule': True,
        'ml': True,
        'abb': True,
        'obstart': '',
        'obend': '',
        'daysbefore': '0',
        'daysafter': '0',
        'limitto': 'All'
    }
    base = 'http://www.ohdsi.org/web/criteria2query/'
    with requests.Session() as s:
        # Clear cookies
        s.cookies.clear()
        
        # Post the data first
        post_response = s.post(
            base + 'main/autoparse',
            data=post_data,
            timeout=10,
        )

        get_response = s.get(
            base + 'queryformulate/formulateCohort',
            data={},
            timeout=10,
        )
    
    if get_response.status_code in {500, 504}:
        return post_response, get_response

    responses_list = (
        json.loads(get_response.json()['jsonResult'])
        ['ConceptSets']
    )
    return responses_list


def format_results(results, criteria_dict):
    """Format results depending on whether successful or erroneous"""
    if isinstance(results, tuple):
        return [], _format_error(results, criteria_dict)
    return _format_correct(results, criteria_dict), []


def _format_error(results, criteria_dict):
    criteria_dict.update({
        'post_request': results[0],
        'get_request': results[1],
    })
    return [criteria_dict,]


def _format_correct(results, criteria_dict):
    outputs = list()
    for res in results:
        if not res.get('name'):
            continue
        result_dict = {
            **criteria_dict.copy(),
            'cohort_name': res['name'],
        }
        for item in res['expression']['items']:
            outputs.append({
                **result_dict,
                'concept': item['concept'],
            })
    return outputs

In [3]:
uog_df = pd.read_csv('../../data/annotations/annotate_notes_uog.csv')

uog_criteria_strings = (
    uog_df
    .filter(items=['NCT_id', 'matched_string', 'criteria_string'])
    .dropna()
    .drop_duplicates()
    .assign(
        criteria_string=lambda df: df['criteria_string'].apply(map_modifiers.utils.normalize_text),
        matched_string=lambda df: df['matched_string'].apply(map_modifiers.utils.normalize_text),
    )
    .assign(
        criteria_string=lambda df: df.apply(
            lambda row: map_modifiers.recognize_parents.get_word_margin(row['criteria_string'],
                                                                        row['matched_string'], 3)[0],
            axis=1),
        source='uog'
    )
    .to_dict('records')
)

print(len(uog_criteria_strings), '\n', uog_criteria_strings[0])

for criteria_dict in uog_criteria_strings:
    assert criteria_dict['criteria_string'].index(criteria_dict['matched_string'])

420 
 {'NCT_id': 'NCT03937804', 'matched_string': 'scoliosis', 'criteria_string': 'bronchitis lung transplant kyphoscoliosis sarcoidosis bronchopulmonary dysplasia', 'source': 'uog'}


In [4]:
hrn_df = pd.read_csv('../../data/annotations/annotate_notes_hr2479.csv')

hrn_criteria_strings = (
    hrn_df
    .filter(items=['NCT_id', 'matched_string', 'criteria_string'])
    .dropna()
    .drop_duplicates()
    .assign(
        criteria_string=lambda df: df['criteria_string'].apply(map_modifiers.utils.normalize_text),
        matched_string=lambda df: df['matched_string'].apply(map_modifiers.utils.normalize_text),
    )
    .assign(
        criteria_string=lambda df: df.apply(
            lambda row: map_modifiers.recognize_parents.get_word_margin(
                row['criteria_string'], row['matched_string'], 3)[0] 
            if row['matched_string'] in row['criteria_string'] else row['criteria_string'],
            axis=1),
        source='hrn'
    )
    .to_dict('records')
)

print(len(hrn_criteria_strings), '\n', hrn_criteria_strings[0])

420 
 {'NCT_id': 'NCT00000456', 'matched_string': 'panic', 'criteria_string': 'of major depression panic disorder obsessive-compulsive disorder', 'source': 'hrn'}


In [5]:
# Combine criteria strings from Harry and Undina
all_criteria_string = [*uog_criteria_strings, *hrn_criteria_strings]

In [6]:
correct_results = list()
error_results = list()
for criteria_dict in tqdm.tqdm_notebook(all_criteria_string):
    results = get_c2q_mapping(criteria_dict['criteria_string'])
    
    correct, error = format_results(results, criteria_dict)
    
    correct_results.extend(correct)
    error_results.extend(error)
    time.sleep(1.5)

HBox(children=(IntProgress(value=0, max=840), HTML(value='')))




In [7]:
# Dump correct results to JSON file
with open('../../data/c2q/c2q_mappings_correct.json', 'w') as f:
    json.dump(correct_results, f, indent=2)

# Dump errors to a separate JSON file
#  Errors include no mappings, rate throttling, etc.
for error_result in error_results:
    error_result['post_request'] = error_result['post_request'].decode('utf-8')
    error_result['get_request'] = error_result['get_request'].decode('utf-8')

with open('../../data/c2q/c2q_mappings_error.json', 'w') as f:
    json.dump(error_results, f, indent=2)

AttributeError: 'Response' object has no attribute 'decode'

In [None]:
# Save correct results to a .csv also, for easier evaluation later
correct_results_df = (
    pd.DataFrame(correct_results)
    .assign(
        concept_code=lambda df: df['concept'].apply(lambda x: x['CONCEPT_CODE']),
        concept_name=lambda df: df['concept'].apply(lambda x: x['CONCEPT_NAME']),
        vocabulary=lambda df: df['concept'].apply(lambda x: x['VOCABULARY_ID']),
        concept_class=lambda df: df['concept'].apply(lambda x: x['CONCEPT_CLASS_ID']),
    )
    .loc[lambda df: df['vocabulary'] == 'SNOMED', ['NCT_id', 'matched_string', 'criteria_string',
                                                   'source', 'cohort_name', 'concept_code',
                                                   'concept_name']]
)

correct_results_df.to_csv('../../data/c2q/results_table.csv', index=False)

correct_results_df.head(2)