In [30]:
import pandas as pd


# Load in the excel data
excel_filepath = 'rare-and-inherited-disease-national-genomics-test-directory-v7.xlsx'

df = pd.read_excel(excel_filepath, sheet_name="R&ID indications", header=1)
df.head()

Unnamed: 0,Clinical indication ID,Test ID,Clinical Indication,Target/Genes,Test Method,Commissioning category,Specialist test group,Part in the eligibility criteria document,Changes since January 2024 publication
0,R24,R24.1,Achondroplasia,FGFR3 c.1138,Targeted variant testing,Core,Core,XVI Musculoskeletal,No change
1,R169,R169.1,Acute intermittent porphyria,HMBS (1207),Single gene sequencing >=10 amplicons,Highly specialised,Gastrohepatology,VII Gastrohepatology,No change
2,R419,R419.1,Acute Rhabdomyolysis,Acute Rhabdomyolysis (1141),Medium panel,Highly Specialised,Neurology,XVII Neurology,No change
3,R14,R14.1,Acutely unwell children with a likely monogeni...,Trio gene agnostic or appropriate panels in si...,WGS,Core,Multi specialty,I Acutely unwell children,No change
4,R56,R56.3,"Adult onset dystonia, chorea or related moveme...",Adult onset movement disorder (540),WGS,Specialised,Neurology,XVII Neurology,No change


In [33]:
r_codes = df["Clinical indication ID"].unique()
associated_clinical_indications = []
new_r_codes = []
for r_code in r_codes:
    associated_clinical_indications.append(df[df["Clinical indication ID"] == r_code]["Clinical Indication"].tolist()[0])
    new_r_codes.append(r_code)


In [37]:
r_codes

['R24',
 'R169',
 'R419',
 'R14',
 'R56',
 'R60',
 'R62',
 'R58',
 'R233',
 'R39',
 'R293',
 'R191',
 'R106',
 'R330',
 'R314',
 'R340',
 'R65',
 'R47',
 'R414',
 'R23',
 'R446',
 'R133',
 'R83',
 'R294',
 'R295',
 'R201',
 'R19',
 'R155',
 'R413',
 'R167',
 'R422',
 'R107',
 'R391',
 'R49',
 'R31 ',
 'R90',
 'R43',
 'R128',
 'R337',
 'R319',
 'R156',
 'R246',
 'R244',
 'R362',
 'R129',
 'R333',
 'R84',
 'R87',
 'R336',
 'R196',
 'R57',
 'R61',
 'R109',
 'R359',
 'R171',
 'R265',
 'R343',
 'R415',
 'R124',
 'R123',
 'R445',
 'R401',
 'R99',
 'R263',
 'R443',
 'R229',
 'R181',
 'R180',
 'R150',
 'R199',
 'R46',
 'R137',
 'R144',
 'R145',
 'R28',
 'R79',
 'R80',
 'R81',
 'R262',
 'R237',
 'R185',
 'R184',
 'R253',
 'R193',
 'R334',
 'R258',
 'R91',
 'R447',
 'R449',
 'R450',
 'R240',
 'R451',
 'R364',
 'R146',
 'R132',
 'R346',
 'R73',
 'R59',
 'R163',
 'R101',
 'R140',
 'R217',
 'R255',
 'R164',
 'R335',
 'R74',
 'R345',
 'R112',
 'R118',
 'R115',
 'R116',
 'R117',
 'R119',
 'R120',
 'R

In [35]:
r_codes = new_r_codes

In [36]:
associated_clinical_indications

['Achondroplasia',
 'Acute intermittent porphyria',
 'Acute Rhabdomyolysis ',
 'Acutely unwell children with a likely monogenic disorder',
 'Adult onset dystonia, chorea or related movement disorder',
 'Adult onset hereditary spastic paraplegia',
 'Adult onset leukodystrophy',
 'Adult onset neurodegenerative disorder',
 'Agammaglobulinaemia with absent BTK expression',
 'Albinism or congenital nystagmus',
 'Albright hereditary osteodystrophy, pseudohypoparathyroidism, pseudopseudohypoparathyroidism, acrodysostosis and osteoma cutis',
 'Alpha-1-antitrypsin deficiency',
 'Alstrom syndrome',
 'Alveolar capillary dysplasia with misalignment of pulmonary veins',
 'Ambiguous genitalia presenting neonatally',
 'Amelogenesis imperfecta',
 'Aminoglycoside exposure posing risk to hearing',
 'Angelman syndrome',
 'APC associated Polyposis ',
 'Apert syndrome',
 'APOL1 kidney donor testing',
 'Arrhythmogenic right ventricular cardiomyopathy',
 'Arthrogryposis',
 'Ataxia telangiectasia - DNA repair

In [38]:
code_map = dict(zip(r_codes, associated_clinical_indications))

In [39]:
code_map["R208"]

'Inherited breast cancer and ovarian cancer'

In [42]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import PyPDF2
import json

load_dotenv()

test_directory = "national-genomic-test-directory-rare-and-inherited-disease-eligibility-criteria-v7.pdf"
output_json_file = "extracted_tests.json"
output_csv_file = "extracted_tests.csv"
prompt_file = "extract_r_code.txt"

specific_tests = r_codes[0:20] + ["R208"]


with open(prompt_file, "r") as f:
    prompt = f.read()

def run_prompt_on_extracted_text():
    client = OpenAI()


    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": prompt
            },
            {
                "role": "user",
                "content": extracted_text
            }
        ],
        model="gpt-4o",
    )
    return chat_completion.choices[0].message.content


# creating a pdf reader object
reader = PyPDF2.PdfReader(test_directory)

all_extracted_tests = []

for j, specific_test in enumerate(specific_tests):
    specific_test_texts = []
    for i, page in enumerate(reader.pages):
        extracted_text = page.extract_text()
        if specific_test in extracted_text:
            specific_test_texts.append((i, extracted_text))

    start_section = False
    relevant_pages = []
    for text in specific_test_texts:
        if start_section and "R" in text[1][0:40]:
            break
        if specific_test + " " in text[1][0:100]:
            start_section = True
        if start_section:
            relevant_pages.append(text)
        
    extracted_text = ' '.join([text[1] for text in relevant_pages])	

    extracted_test = run_prompt_on_extracted_text()

    extracted_test_json = json.loads(extracted_test)
    extracted_test_json["test_code"] = specific_test
    extracted_test_json["clinical_indication_name"] = specific_test + ' - ' + code_map[specific_test]

    all_extracted_tests.append(extracted_test_json)

with open(output_file, "w") as f:
    json.dump(all_extracted_tests, f)

df = pd.read_json(output_file)

df.to_csv(output_csv_file, index=False)

In [5]:
output = pd.read_json(output_file)

In [6]:
output

Unnamed: 0,criteria,requesting_specialties,test_code
0,[Clinical features strongly suggestive of acho...,"[Clinical Genetics, Neonatology, Paediatrics]",R24
1,[Clinical features of acute intermittent porph...,"[Clinical Genetics, Dermatology, Gastroenterol...",R169


In [11]:
output['criteria'][0]

['Testing can be offered ---- when paternal exclusion testing can be offered in families at risk of a recessive disorder',
 'Testing can be offered ---- when parents carry different variants',
 'Testing can be offered ---- where the father has an autosomal dominant variant',
 'Testing can be offered ---- where the father is known mosaic for a variant',
 'Testing can be offered ---- NIPD should only be offered for conditions where invasive testing would otherwise be offered and following discussion with the testing laboratory',
 'Testing can be offered ---- Referrals for testing will be triaged by the Genomic Laboratory; testing should be targeted at those where a genetic or genomic diagnosis will guide management for the proband or family',
 'Testing can be offered ---- Testing should be discussed in advance with the testing laboratory to ensure that necessary samples and validation work has been performed',
 'Testing can be offered ---- Testing may not be possible in multiple pregnanc