In [1]:
from langchain_ollama import ChatOllama

In [79]:
import pandas as pd
# Raise pandas' display.max_rows option to 100 for DataFrames shown in this notebook.
pd.set_option('display.max_rows', 100)

In [34]:
# Model name handed to ChatOllama(model=...) by get_llm().
LLM_MODEL="deepseek-r1:14b"

In [35]:
def get_llm():
    """Build a ``ChatOllama`` client for the model named by ``LLM_MODEL``.

    Returns:
        A newly constructed ``ChatOllama`` instance (one per call).
    """
    return ChatOllama(model=LLM_MODEL)

def query_llama(prompt):
    """Send ``prompt`` to the client from ``get_llm()`` and return the reply text.

    Args:
        prompt: Passed unchanged to the client's ``invoke`` method.

    Returns:
        The ``content`` attribute of the object returned by ``invoke``.
    """
    reply = get_llm().invoke(prompt)
    return reply.content

In [2]:
import os
import csv
import json
import pandas as pd
import re
from docling.document_converter import DocumentConverter


# Fields to request from the model for each report type. The outer key is the
# name create_extraction_prompt() looks the list up by; the list itself is
# interpolated into the prompt verbatim, so every spelling is kept exactly.
ITEMS_DICT = {
    'signalment_physical': [
        'age',
        'breed',
        'gender',
        'neuter_status',
        'vomit_nausea',
        'lethargy_weakness',
        'appetite_loss',
        'diarrhea_melena',
        'abdominal_pain',
        'weight_loss',
        'duration',
        'bw',
        'temp',
        'hr',
        'rr',
        'bcs',
        'hydration_status',
    ],
    'cbc': [
        'wbc',
        'red_blood_cell_count',
        'hemoglobin',
        'packed_cell_volume',
        'mean_corpuscular_volume',
        'mean_corpuscular_hemoglobin_concentration',
        'plasma_protein',
        'platelet_count',
        'absolute_neutrophil',
        'absolute_bands',
        'absolute_lymphocyte',
        'absolute_monocyte',
        'absolute_eosinophil',
        'absolute_basophil',
        'absolute_other',
    ],
    'chem': [
        'glucose',
        'lactic_acid',
        'blood_urea_nitrogen',
        'creatinine',
        'sodium',
        'potassium',
        'enzymatic_carbon_dioxide',
        'chloride',
        'anion_gap_calculated',
        'calcium',
        'phosphorus',
        'magnesium',
        'total_protein',
        'albumin',
        'globulin',
        'total_bilirubin',
        'gamma_glutamyltransferase',
        'alanine_aminotransferase',
        'alkaline_phosphatase',
        'cholesterol',
    ],
    'cpli': [
        'spec_cpli',
    ],
    'aus': [
        'size',
        'echogenecity_of_pancreatic_parenchyma',
        'echogenecity_of_peripancreatic_mesentery',
        'pancreatic_echotexture',
        'free_fluid_effusion',
        'conclusions',
    ],
}


def create_extraction_prompt(file_name, content):
    """Build the extraction prompt for one report.

    Args:
        file_name: Key into ``ITEMS_DICT``; also named in the prompt as the
            single top-level key the model should use.
        content: Text of the document, inserted verbatim into the prompt.

    Returns:
        str: The complete prompt.

    Raises:
        KeyError: If ``file_name`` is not a key of ``ITEMS_DICT``.
    """
    # The second template's code fence is now closed; previously it was left
    # open, so the instructions and document text that follow it read as part
    # of the example block.
    return f"""
    Analyze this veterinary medical document and extract structured data in JSON format.
    Extract the details with the following categories as keys: {ITEMS_DICT[file_name]}, with the main key being '{file_name}'.
    Follow this exact structure (ONLY AN EXAMPLE, NOT THE ACTUAL DATA. MODIFY IT TO FIT THE ACTUAL DATA, WITH ONLY ONE MAIN KEY BEING THE FILE NAME AND OTHER KEYS (INSIDE THE MAIN KEY) BEING THE CATEGORIES):
    ```{{
        "signalment_physical": {{
            "age": "<value>",
            "breed": "<value>",
            ...
        }},
        "cbc": {{
            "wbc": "<value>",
            ...
        }},
        ...
    }}```

    PLEASE KEEP THE BELOW STRUCTURE INTACT:
    ```{{
        "MAIN_KEY": {{
            "category1": "<value>",
            "category2": "<value>",
            ...
        }}
    }}```

    DO NOT CREATE MULTIPLE SUB KEYS. 
    If there are any units, please include them in the extracted values. example: glucose: "100 mg/dL"
    Extract ALL available information from this text:
    {content}

    Return ONLY the JSON with extracted values. Use empty strings for missing information.
    """

  from .autonotebook import tqdm as notebook_tqdm


In [73]:
def _extract_json_payload(response):
    """Locate and parse the JSON object contained in a raw model reply.

    Tries, in order: a ```json fenced block, any other fenced block, and
    finally the outermost ``{...}`` span of the reply.

    Args:
        response: Reply text returned by the model.

    Returns:
        dict: The parsed JSON object.

    Raises:
        ValueError: If no JSON object can be located, if the located text is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``), or
            if the parsed value is not a JSON object.
    """
    # Drop any <think>...</think> section first (reasoning models may emit
    # one) so fences or braces inside it are never mistaken for the answer.
    text = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

    # Prefer an explicit ```json fence, then accept any other fence.
    for pattern in (r'```json(.*?)```', r'```[A-Za-z]*(.*?)```'):
        match = re.search(pattern, text, re.DOTALL)
        if match:
            candidate = match.group(1)
            break
    else:
        # No fence at all: fall back to the outermost brace-delimited span.
        start, end = text.find('{'), text.rfind('}')
        if start == -1 or end < start:
            raise ValueError("no JSON object found in model response")
        candidate = text[start:end + 1]

    payload = json.loads(candidate)
    if not isinstance(payload, dict):
        raise ValueError(
            f"expected a JSON object, got {type(payload).__name__}"
        )
    return payload


def process_file(file_path):
    """Convert one document to markdown, query the model, and parse its JSON reply.

    The text before the first '.' in the file's base name selects the field
    list used to build the prompt.

    Args:
        file_path: Path of the document handed to ``DocumentConverter.convert``.

    Returns:
        dict: The JSON object parsed from the model reply, or ``{}`` if any
        step fails (the error is printed, not raised).
    """
    try:
        file_name = os.path.basename(file_path).split('.')[0]

        converter = DocumentConverter()
        result = converter.convert(file_path)
        content = result.document.export_to_markdown()

        prompt = create_extraction_prompt(file_name, content)
        response = query_llama(prompt)

        return _extract_json_payload(response)

    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # carry on with an empty result.
        print(f"Error processing {file_path}: {str(e)}")
        return {}

In [74]:
def generate_data(file):
    """Run ``process_file`` on one document and return what it extracted.

    Args:
        file: Path of the document to process; echoed to stdout first.

    Returns:
        dict: Whatever ``process_file(file)`` returns, or ``{}`` if it raises.
        The failure value used to be a list, which did not match the success
        type and would break callers that iterate ``.items()``.
    """
    print("file = ", file)
    try:
        return process_file(file)
    except Exception as e:
        print(f"Error processing {file}: {str(e)}")
        return {}

In [76]:
# Directory (relative path) that holds the input PDFs.
data_directory = 'data'

# One input path per report. process_file() takes the text before the first
# '.' of each base name as the lookup key into ITEMS_DICT.
signalment_physical = os.path.join(data_directory, 'signalment_physical.pdf')
cbc = os.path.join(data_directory, 'cbc.pdf')
chem = os.path.join(data_directory, 'chem.pdf')
cpli = os.path.join(data_directory, 'cpli.pdf')
aus = os.path.join(data_directory, 'aus.pdf')

# Earlier list-based variant, kept for reference (generate_data takes one path).
# files = [signalment_physical, cbc, chem, cpli, aus]

# Path the final table is written to by the df.to_csv(...) cell.
output_csv = 'extracted_data.csv'
# data = generate_data(files)
# Extract each report separately; every call prints the path it is processing
# and yields the parsed result for that file.
signalment_physical_data = generate_data(signalment_physical)
cbc_data = generate_data(cbc)
chem_data = generate_data(chem)
cpli_data = generate_data(cpli)
aus_data = generate_data(aus)

file =  data/signalment_physical.pdf


Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom ke

file =  data/cbc.pdf


Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom ke

file =  data/chem.pdf


Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom ke

file =  data/cpli.pdf


Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom ke

file =  data/aus.pdf


Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom ke

In [77]:
# Flatten the per-file extraction results into one row per extracted item.
data = []

files_data = {
    'signalment_physical': signalment_physical_data,
    'cbc': cbc_data,
    'chem': chem_data,
    'cpli': cpli_data,
    'aus': aus_data
}

for file_name, file_data in files_data.items():
    # A failed extraction can leave an empty or non-dict result; skip it
    # instead of crashing on .items().
    if not isinstance(file_data, dict):
        continue
    for key, value in file_data.items():
        if isinstance(value, dict):
            # Expected shape: {main_key: {item: result, ...}}.
            for key2, value2 in value.items():
                data.append({
                    'filename': key,
                    'items': key2,
                    'results': value2,
                    'details': ''
                })
        else:
            # Flat reply without the main-key wrapper: treat the entry as an
            # item of the file it came from.
            data.append({
                'filename': file_name,
                'items': key,
                'results': value,
                'details': ''
            })

# Passing columns= fixes the column order and keeps the frame well-formed
# (empty but with headers) when nothing was extracted.
df = pd.DataFrame(data, columns=['filename', 'items', 'results', 'details'])

In [80]:
# Show the assembled table as this cell's output.
df

Unnamed: 0,filename,items,results,details
0,signalment_physical,age,13 years,
1,signalment_physical,breed,Bichon Frise,
2,signalment_physical,gender,male,
3,signalment_physical,neuter_status,neutered,
4,signalment_physical,vomit_nausea,Present,
5,signalment_physical,lethargy_weakness,Present,
6,signalment_physical,appetite_loss,Present,
7,signalment_physical,diarrhea_melena,Present,
8,signalment_physical,abdominal_pain,Present,
9,signalment_physical,weight_loss,,


In [82]:
# Write the table to output_csv as comma-separated text without the index column.
df.to_csv(path_or_buf=output_csv, sep=',', index=False)

In [6]:
# from docling.document_converter import DocumentConverter

# cbc_test = os.path.join('data', 'cbc.pdf')

# converter = DocumentConverter()
# result = converter.convert(cbc_test)
# content = result.document.export_to_markdown()
# print(type(content))