In [1]:
from langchain_ollama import ChatOllama

In [6]:
LLM_MODEL="qwen2.5:14b"

In [7]:
def get_llm():
    llm = ChatOllama(model=LLM_MODEL)
    return llm

def query_llama(prompt):
    llm = get_llm()
    res = llm.invoke(prompt)
    return res.content

In [43]:
import os
import csv
import json
import pandas as pd
from docling.document_converter import DocumentConverter


ITEMS_DICT = {
    'signalment_physical': [
        'age', 'breed', 'gender', 'neuter_status', 'vomit_nausea',
        'lethargy_weakness', 'appetite_loss', 'diarrhea_melena',
        'abdominal_pain', 'weight_loss', 'duration', 'bw', 'temp',
        'hr', 'rr', 'bcs', 'hydration_status'
    ],
    'cbc': [
        'wbc', 'red_blood_cell_count', 'hemoglobin', 'packed_cell_volume',
        'mean_corpuscular_volume', 'mean_corpuscular_hemoglobin_concentration',
        'plasma_protein', 'platelet_count', 'absolute_neutrophil',
        'absolute_bands', 'absolute_lymphocyte', 'absolute_monocyte',
        'absolute_eosinophil', 'absolute_basophil', 'absolute_other'
    ],
    'chem': [
        'glucose', 'lactic_acid', 'blood_urea_nitrogen', 'creatinine',
        'sodium', 'potassium', 'enzymatic_carbon_dioxide', 'chloride',
        'anion_gap_calculated', 'calcium', 'phosphorus', 'magnesium',
        'total_protein', 'albumin', 'globulin', 'total_bilirubin',
        'gamma_glutamyltransferase', 'alanine_aminotransferase',
        'alkaline_phosphatase', 'cholesterol'
    ],
    'cpli': ['spec_cpli'],
    'aus': [
        'size', 'echogenecity_of_pancreatic_parenchyma',
        'echogenecity_of_peripancreatic_mesentery', 'pancreatic_echotexture',
        'free_fluid_effusion', 'conclusions'
    ]
}


def create_extraction_prompt(file_name, content):
    return f"""
    Analyze this veterinary medical document and extract structured data in JSON format.
    Extract the details based on the following categories: {ITEMS_DICT[file_name]}
    Follow this exact structure:
    ```{{
        "signalment_physical": {{
            "age": "<value>",
            "breed": "<value>",
            ...
        }},
        "cbc": {{
            "wbc": "<value>",
            ...
        }},
        ...
    }}```

    Extract ALL available information from this text:
    {content}

    Return ONLY the JSON with extracted values. Use empty strings for missing information.
    """

In [29]:
def process_file(file_path):
    try:
        file_name = os.path.basename(file_path).split('.')[0]
        # with open(file_path, 'r', encoding='utf-8') as f:
        #     content = f.read()
        
        converter = DocumentConverter()
        result = converter.convert(file_path)
        content = result.document.export_to_markdown()
        
        prompt = create_extraction_prompt(file_name, content)
        response = query_llama(prompt)
        
        response = response.strip("` \n")
        if response.startswith('json'):
            response = response[4:]
        return json.loads(response)
    
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return {}

In [53]:
def generate_data(files):
    data = []
    
    for file in files:
        try:
            file_name = os.path.basename(file).split('.')[0]
            extracted_data = process_file(file)
            
            # Build rows for this file
            for category in ITEMS_DICT[file_name]:
                for item in extracted_data.get(category, {}):
                    value = extracted_data[category][item]
                    data.append({
                        'filename': file_name,
                        'items': item,
                        'results': value,
                        'details': ''
                    })
                    
        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            continue

    return data

In [54]:
def generate_csv(data, output_file):
    df = pd.DataFrame(data)
    df = df[['filename', 'items', 'results', 'details']]
    
    df.to_csv(output_file, sep='\t', index=False)
    print(f"Extraction completed. Results saved to {output_file}")

In [55]:
data_directory = 'data'  # Update with your data directory

signalment_physical = os.path.join(data_directory, 'signalment_physical.pdf')
cbc = os.path.join(data_directory, 'cbc.pdf')
chem = os.path.join(data_directory, 'chem.pdf')
cpli = os.path.join(data_directory, 'cpli.pdf')
aus = os.path.join(data_directory, 'aus.pdf')

files = [signalment_physical, cbc, chem, cpli, aus]

output_csv = 'extracted_data.csv'
data = generate_data(files)

generate_csv(data, output_csv)

Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom kernel for multi-scale deformable attention: /home/dheeraj/.cache/torch_extensions/py311_cu124/MultiScaleDeformableAttention/MultiScaleDeformableAttention.so: cannot open shared object file: No such file or directory
Could not load the custom ke

Error processing data/aus.pdf: string indices must be integers, not 'str'
Extraction completed. Results saved to extracted_data.csv


In [36]:
# from docling.document_converter import DocumentConverter

# converter = DocumentConverter()
# result = converter.convert(cpli)
# print(result.document.export_to_markdown()) 