In [1]:
import os
import csv
from tqdm import tqdm
import pandas as pd
import numpy as np
from evaluation_functions import evaluate

## Set Up

### Variables

In [19]:
# Pick the data split to run on; both known splits are listed so switching
# is a one-word change instead of commenting lines in and out.
DATA_SPLIT = 'test'  # 'train' or 'test'

INPUT_FILES = {
    'train': (
        'data/train_drug_label_text.csv',
        'data/train_drug_label_text_manual_ades.csv',
    ),
    'test': (
        'data/test_drug_label_text.csv',
        'data/test_drug_label_text_manual_ades.csv',
    ),
}

# drug_file: per-section label text; manual_file: the matching "manual_ades" CSV.
drug_file, manual_file = INPUT_FILES[DATA_SPLIT]

# my_max = 10000

In [20]:
# Load the per-section label text and the reference annotations used by evaluate().
drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)

# Split name is the file-name prefix before the first underscore
# (e.g. "train_..." -> "train", "test_..." -> "test"). Using the basename
# keeps this correct no matter how many directories precede the file.
set_type = os.path.basename(drug_file).split('_')[0]

# Add one synthetic "all-concat" section per drug: every section's text for
# that drug joined with single spaces.
all_sections = drugs.groupby('drug_name')['section_text'].apply(' '.join).reset_index()
all_sections.insert(1, "section_name", "all-concat")

# ignore_index avoids duplicate row labels between the original rows and the
# appended "all-concat" rows.
drugs = pd.concat([drugs, all_sections], ignore_index=True)

## Run Outputs

In [21]:
# Maps run key -> DataFrame of that run's predictions; filled in by the cells below.
outputs = dict()

## Exact Match Algorithm

In [22]:
# Exact-match baseline: for every (drug, section) row, list the MedDRA terms
# that occur verbatim (case-insensitively) in the section text.
run_key = 'exact_{}'.format(set_type)
results_path = 'results/{}.csv'.format(run_key)

if not os.path.exists(results_path):
    # Load the MedDRA terms. The map file is pipe-delimited with a header row;
    # column 1 is collected as the LLT term and column 4 as the PT term.
    meddra_llt_terms = set()
    meddra_pt_terms = set()

    # `with` guarantees the file is closed even if a row is malformed.
    with open('data/meddra_llt_pt_map.txt') as fh:
        reader = csv.reader(fh, delimiter='|')
        next(reader, None)  # skip the header row
        for row in reader:
            meddra_llt_terms.add(row[1].lower())
            meddra_pt_terms.add(row[4].lower())

    meddra_terms = meddra_llt_terms | meddra_pt_terms

    # Only terms longer than 3 characters are searched for. Filter once here
    # rather than once per term per row, and sort so the emitted term order is
    # reproducible across kernel restarts (set order is not).
    candidate_terms = sorted(term for term in meddra_terms if len(term) > 3)

    results = []
    for _, row in tqdm(drugs.iterrows(), total=drugs.shape[0]):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text'].lower()

        # NOTE: plain substring match, so a term can also match inside a
        # longer word.
        found_terms = [term for term in candidate_terms if term in text]

        results.append([name, section, ', '.join(found_terms)])

    # NOTE(review): the 'gpt_output' column name is presumably what evaluate()
    # expects for predictions, even though no GPT is involved here — confirm.
    exact_output = pd.DataFrame(
        results,
        columns=['drug_name', 'section_name', 'gpt_output']
    )

    # Make sure the output directory exists before writing the cache.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    exact_output.to_csv(results_path)
else:
    # Cached run: load it so the evaluation cells have something to score.
    # index_col=0 drops the index column written by to_csv above;
    # keep_default_na=False keeps empty predictions as '' instead of NaN.
    exact_output = pd.read_csv(results_path, index_col=0, keep_default_na=False)

outputs[run_key] = exact_output

100%|██████████| 336/336 [02:50<00:00,  1.97it/s]


## Evaluation

In [23]:
# List the runs that will be evaluated, in alphabetical order.
for run_key in sorted(outputs):
    print(run_key)

exact_test


In [24]:
# Score every collected run against the reference annotations, first with the
# 'strict' mode and then with the 'lenient' mode.
for eval_mode in ('strict', 'lenient'):
    evaluate(outputs, manual_ades, eval_mode)

Running strict evaluation and saving results to disk.
exact_test


100%|██████████| 99/99 [00:02<00:00, 37.40it/s]


Running lenient evaluation and saving results to disk.
exact_test


100%|██████████| 99/99 [00:36<00:00,  2.68it/s]
