### Reader

In [None]:
import os
import xml.etree.ElementTree as ET
# Directory that is scanned for cached TEI XML files.
cache_path = '/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache'
# Keep only the directory entries whose name ends with 'tei.xml'.
# os.listdir() returns entries in arbitrary order, so the order of this list is not stable.
filenames = [
    entry
    for entry in os.listdir(cache_path)
    if entry.endswith('tei.xml')
]


def get_full_text(element):
    """Return all text contained in ``element`` as a single stripped string.

    The result is the element's own text, followed by the text of every
    descendant and the tail text that follows each descendant, in document
    order. The tail of ``element`` itself is not included.

    ``Element.itertext()`` is used instead of serialising each child with
    ``ET.tostring(..., method='text')``: ``tostring`` already emits the child's
    tail, so appending ``subelement.tail`` afterwards duplicated that text.

    Args:
        element: an ``xml.etree.ElementTree.Element``.

    Returns:
        The concatenated text with leading/trailing whitespace removed
        (an empty string when the element holds no text).
    """
    return ''.join(element.itertext()).strip()

# One-shot iterator over the TEI file names; it is consumed one item at a time with next().
iter_filenames = iter(filenames)

In [None]:
# Pick the next TEI file from the iterator. Every run of this cell advances
# `iter_filenames`; next() raises StopIteration once the iterator is exhausted.
file_path = os.path.join(cache_path, next(iter_filenames)) 
file_path

In [None]:
# Collects the fields extracted from the current TEI file by the cells below.
file_dict = {}
# Parse the selected file and keep its root element for the XPath queries below.
tree = ET.parse(file_path)
root = tree.getroot()

# Prefix -> namespace URI mapping, so queries can address elements as `tei:<tag>`.
namespace = {'tei': 'http://www.tei-c.org/ns/1.0'}

In [None]:
# title
# root.find() returns None when no matching <title> element exists; only store the
# title when one was found so a title-less file does not raise AttributeError.
title = root.find('.//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title', namespaces=namespace)
if title is not None:
    file_dict['title'] = title.text
file_dict

In [None]:
# authors
authors = root.findall('.//tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:analytic/tei:author/tei:persName', namespaces=namespace)
file_dict['authors'] = []
for author in authors:
    # Element.text is None for empty elements; skip those so ' '.join() cannot raise TypeError.
    forenames = [name.text for name in author.findall('tei:forename', namespaces=namespace) if name.text]
    # Entries without any usable forename are skipped.
    if not forenames:
        continue
    surnames = [name.text for name in author.findall('tei:surname', namespaces=namespace) if name.text]
    # Joining the parts in one step avoids a trailing space when there is no surname.
    author_name = ' '.join(forenames + surnames)
    file_dict['authors'].append(author_name)
file_dict

In [None]:
# abstract
# Only the first matching <p> of the abstract is read (find() returns the first match or None).
abstract = root.find('.//tei:teiHeader/tei:profileDesc/tei:abstract/tei:div/tei:p', namespaces=namespace)
if abstract is not None:
    # Element.text stops at the first child element, which truncates paragraphs that
    # contain inline markup; itertext() also yields the text inside and after children.
    file_dict['abstract'] = ''.join(abstract.itertext()).strip()
file_dict

In [None]:
# Body
body = root.findall('.//tei:text/tei:body/tei:div', namespaces=namespace)

for i, child in enumerate(body):
    ps = child.findall('.//tei:p', namespaces=namespace)
    # Divisions without any paragraph contribute nothing.
    if len(ps) == 0:
        continue
    # A <div> can contain paragraphs but no <head> (or an empty one). find() then
    # returns None / .text is None, so fall back to a positional key instead of
    # raising AttributeError or storing the content under the key None.
    head_element = child.find('.//tei:head', namespaces=namespace)
    if head_element is not None and head_element.text:
        head = head_element.text
    else:
        head = f'section_{i}'
    # One line per paragraph, each flattened to plain text.
    content = '\n'.join([get_full_text(p) for p in ps])
    # NOTE: divisions sharing the same heading overwrite each other in file_dict.
    file_dict[head] = content
file_dict

In [None]:
from utils.custom_document_reader import CustomDocumentReader

# Build the project's CustomDocumentReader and load the documents.
# Presumably it reads from `input_dir` and reuses `cache_dir` (remove_cache=False) — verify
# against utils/custom_document_reader.py, which is not visible here.
# NOTE(review): `_load_data` is underscore-prefixed (private by convention); confirm whether a
# public loader exists that should be called instead.
documents = CustomDocumentReader(
    input_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/data",
    cache_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache",
    config_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/utils/config.json",
    remove_cache=False
)._load_data()

### CustomSplitter

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Baseline splitter from llama_index, used in the next cell to turn `documents` into nodes.
# NOTE(review): chunk_size is presumably measured in tokens, and include_prev_next_rel
# presumably links neighbouring nodes — confirm against the SentenceSplitter documentation.
ss1 = SentenceSplitter(
    chunk_size=1024,
    include_metadata=True,
    include_prev_next_rel=True,
)

In [None]:
from llama_index.core.storage.docstore import SimpleDocumentStore
# Split the loaded documents into nodes with `ss1`, add them to a fresh
# SimpleDocumentStore and persist that store to test.json in the cache directory.
nodes = ss1.get_nodes_from_documents(documents)
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
docstore.persist(persist_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/test.json")


In [None]:
from llama_index.core.node_parser import SimpleFileNodeParser

# Parse the same documents with SimpleFileNodeParser; this rebinds `nodes`,
# replacing the SentenceSplitter result from the previous cell.
nodes = SimpleFileNodeParser().get_nodes_from_documents(documents=documents)

In [None]:
# Print every key/value pair of the first node's dict form to inspect its structure.
for k, v in nodes[0].to_dict().items():
    print(f"key: {k}\nvalue: {v}\n\n")

In [None]:
from utils.custom_document_reader import CustomDocumentReader

# Reload the documents with the same reader settings as in the Reader section.
# NOTE(review): `_load_data` is underscore-prefixed (private by convention) — confirm
# whether a public loader should be used instead.
documents = CustomDocumentReader(
    input_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/data",
    cache_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache",
    config_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/utils/config.json",
    remove_cache=False
)._load_data()

from utils.custom_parser import CustomHierarchicalNodeParser
# Build the project's hierarchical parser with its default settings.
hnp = CustomHierarchicalNodeParser.from_defaults()
# The call result is the cell output and is not stored in a variable.
# NOTE(review): `_parse_nodes` is underscore-prefixed; a later cell calls
# `get_nodes_from_documents` on the same parser instead — confirm which one is intended.
hnp._parse_nodes(documents)

### Test custom document reader

In [None]:
from utils.custom_document_reader import CustomDocumentReader

# Load the documents with the project's reader (same settings as the earlier cells).
# NOTE(review): `_load_data` is underscore-prefixed (private by convention) — confirm
# whether a public loader should be used instead.
documents = CustomDocumentReader(
    input_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/data",
    cache_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache",
    config_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/utils/config.json",
    remove_cache=False
)._load_data()

from utils.custom_parser import CustomHierarchicalNodeParser
# Parse the documents with the hierarchical parser; `nodes` is rebound to the result
# and is what the following cells (len(nodes), extractor.extract(nodes)) operate on.
hnp = CustomHierarchicalNodeParser.from_defaults()
nodes = hnp.get_nodes_from_documents(documents)

In [None]:
len(documents)

In [None]:
len(nodes)

In [None]:
from utils.custom_extractor import HuggingfaceBasedExtractor, OllamaBasedExtractor, OpenAIBasedExtractor

# Build the project's Ollama-backed extractor and run it over `nodes`.
# NOTE(review): `only_meta={'level': ['section']}` presumably restricts extraction to nodes
# whose metadata 'level' is 'section' — confirm in utils/custom_extractor.py.
extractor = OllamaBasedExtractor(
    model_name='sammcj/sfr-embedding-mistral:Q4_K_M',
    only_meta={'level': ['section']}
)

# The return value of extract() is the cell output and is not stored.
extractor.extract(nodes)

### Pydantic

In [None]:
from utils.get import get_a_store
# Persisted docstore file to inspect (Windows path).
file_path = 'D:\\Projects(D)\\Fine-Tuned-GPT-2-with-articles-ground-truth\\code\\llamaIndex\\.cache\\test-gpt-4o-batch_1_parser_CustomHierarchicalNodeParser-vicuna_13b.json'
# Resolve the store class by name, then rebuild the store from the persisted file.
store_cls = get_a_store('SimpleDocumentStore')
docstore = store_cls.from_persist_path(persist_path=file_path)
# Keep only the first four stored nodes for this experiment.
nodes = list(docstore.docs.values())[:4]

In [None]:
from custom.custom_extractor import OpenAIBasedQARExtractor
from custom.schema import QAR
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before building
# the extractor (presumably this is where the OpenAI credentials come from — verify).
load_dotenv()
# NOTE(review): the meaning of `mode`, `embedding_only` and `only_meta` is defined in
# custom/custom_extractor.py, which is not visible here.
extractor = OpenAIBasedQARExtractor(
    model_name='gpt-4o',
    cache_dir='D:\\Projects(D)\\Fine-Tuned-GPT-2-with-articles-ground-truth\\code\\llamaIndex\\.cache',
    mode='immediately',
    embedding_only=True,
    only_meta=None
)

In [None]:
# Run the extractor over the selected nodes and keep the result in `outputs`.
# NOTE(review): the meaning of the three positional arguments after `nodes` is defined by
# OpenAIBasedQARExtractor.extract, which is not visible here — confirm before reuse.
outputs = extractor.extract(nodes, 'test', 'test_action', 'D:\\Projects(D)\\Fine-Tuned-GPT-2-with-articles-ground-truth\\code\\llamaIndex\\.cache')

In [None]:
nodes[0].metadata['questions_this_excerpt_can_answer_and_corresponding_answers_and_reasons'].split('\n')

In [None]:
nodes[0].metadata['questions_this_excerpt_can_answer_and_corresponding_answers_and_reasons'].split('\n')

In [None]:
import re
def extract_json_str(text: str) -> list:
    """Extract the JSON object strings embedded in ``text``.

    The text is scanned left to right; at every ``{`` a JSON decoder tries to
    parse one complete object. Each object that parses is returned as the exact
    substring it occupies, and scanning resumes after it. Unlike a non-greedy
    ``\\{.*?\\}`` match, this keeps nested objects and ``}`` characters inside
    string values intact.

    If no well-formed object is found, the original best-effort regex match is
    used as a fallback, so slightly malformed output is still returned.

    Args:
        text: arbitrary text, e.g. a model response wrapping JSON in a code fence.

    Returns:
        A non-empty list of substrings of ``text``, in order of appearance.

    Raises:
        ValueError: if neither strategy finds anything that looks like an object.
    """
    import json  # local import keeps the function self-contained within its cell

    decoder = json.JSONDecoder()
    matches = []
    start = text.find('{')
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next opening brace.
            start = text.find('{', start + 1)
            continue
        matches.append(text[start:end])
        start = text.find('{', end)

    if not matches:
        # Best-effort fallback: shortest brace-delimited spans, as before.
        matches = re.findall(r'\{.*?\}', text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL)
    if not matches:
        raise ValueError(f"Could not extract json strings from output: {text}")
    return matches

text = """```json
[
    {
        "Question": "What are the main pathological features of Parkinson's disease as described in the document?",
        "Answer": "The main pathological feature of Parkinson's disease is the progressive destruction of dopamine-producing cells in the substantia nigra region of the brain stem.",
        "Reason": "This specific detail about the pathological feature is highlighted in the context and is not general knowledge."
    },
    {
        "Question": "How does the document describe the role of neuromelanin in Parkinson's disease?",
        "Answer": "The document suggests that neuromelanin may serve as an indication of catecholamine synthesis in neuronal systems containing dopamine or norepinephrine and may influence neuronal metabolism due to the heavy metals it contains.",
        "Reason": "This insight into neuromelanin's role and its potential impact on neuronal metabolism is unique to the context provided."
    },
    {
        "Question": "What evidence does the document provide to support the hypothesis that oxidative stress is involved in Parkinson's disease?",
        "Answer": "The document provides evidence such as increased membrane peroxidation in the substantia nigra, elevated TBA-reactive substance levels, increased 8-hydroxy-2′deoxyguanosine levels, and low phospholipid turnover in the substantia nigra.",
        "Reason": "These specific pieces of evidence are detailed in the context and are not commonly known facts."
    },
    {
        "Question": "What are the potential sources of increased oxidative stress in Parkinson's disease according to the document?",
        "Answer": "Potential sources include dopamine metabolism, mitochondrial dysfunction, increased free iron levels, accumulation of AGEs, reduced activity of free radical defense systems, and the deleterious role of glial cells.",
        "Reason": "The document lists these specific sources, providing a comprehensive view that is unique to the context."
    },
    {
        "Question": "How does the document describe the involvement of mitochondrial dysfunction in Parkinson's disease?",
        "Answer": "The document describes mitochondrial dysfunction as involving abnormalities of the respiratory chain, particularly a 30-40% decrease in complex I activity in the substantia nigra, which leads to reductions in ATP production and elevated ROS generation.",
        "Reason": "This detailed description of mitochondrial dysfunction and its specific impact on Parkinson's disease is unique to the context provided."
    }
]
"""

# Show every extracted JSON object string on its own line.
print(*extract_json_str(text), sep='\n')

### Custom IO nodes

In [None]:
from llama_index.core.storage.docstore import SimpleDocumentStore
# Rebuild the document store from its persisted JSON file and take all stored nodes.
docstore = SimpleDocumentStore.from_persist_path(
    persist_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/test_2_parser_CustomHierarchicalNodeParser-ollama_vicuna_13b.json"
)
nodes = list(docstore.docs.values())

In [None]:
from llama_index.core.schema import TextNode
TextNode.from_dict(nodes[0].to_dict())

In [None]:
len(nodes)

In [None]:
nodes[0].get_content()

In [None]:
import os, sys
# Resolve the directory two levels above the current working directory and put it
# at the front of sys.path so modules located there take precedence on import.
# NOTE(review): cells earlier in the notebook already import `utils.*` / `custom.*`;
# if those imports rely on this path, this cell has to run before them.
root_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..'))
sys.path.insert(0, root_dir)

In [None]:
import json
from typing import List
from tqdm import tqdm
from llama_index.core.schema import BaseNode, TextNode

def save_nodes_jsonl(file_path: str, nodes: List[BaseNode]):
    """Write ``nodes`` to ``file_path`` in JSON Lines format.

    Each node is serialised with ``node.to_dict()`` and written as one JSON
    document per line; an existing file is overwritten. Progress is shown with
    a tqdm bar.

    Any exception raised while opening or writing is caught and reported with
    ``print``; nothing is re-raised and the function returns ``None``.
    """
    try:
        with open(file_path, 'w') as handle:
            progress = tqdm(nodes, desc='Saving nodes...')
            for item in progress:
                json.dump(item.to_dict(), handle)
                handle.write('\n')
    except Exception as exc:
        print(f"An error occurred while saving nodes: {exc}")

def load_nodes_jsonl(file_path: str) -> List[TextNode]:
    """Load nodes from a JSON Lines file, one ``TextNode`` per non-blank line.

    The file is read in binary mode so that the progress bar, whose total is
    the file size in bytes, is advanced by the number of bytes actually read
    (text mode counts decoded characters and may translate newlines), and so
    that decoding does not depend on the platform's default encoding
    (``json.loads`` decodes the bytes itself).

    Blank lines are skipped instead of aborting the load.

    Args:
        file_path: path of the ``.jsonl`` file to read.

    Returns:
        The nodes loaded so far. If an exception occurs (missing file, invalid
        JSON, ...), it is reported with ``print`` and the nodes read before the
        failure are returned, so the list may be partial or empty.
    """
    nodes = []

    try:
        # Total size in bytes drives the progress bar.
        file_size = os.path.getsize(file_path)

        with open(file_path, 'rb') as file:
            with tqdm(total=file_size, desc='Loading nodes...', unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                for raw_line in file:
                    # Advance by bytes read, matching the unit of `total`.
                    pbar.update(len(raw_line))
                    if not raw_line.strip():
                        continue  # tolerate blank lines (e.g. a trailing newline)
                    node_data = json.loads(raw_line)
                    nodes.append(TextNode.from_dict(node_data))
    except Exception as e:
        print(f"An error occurred while loading nodes: {e}")

    return nodes


In [None]:
# Rebind `file_path` to the JSONL test file and write the current `nodes` to it.
file_path = '/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/test.jsonl'
save_nodes_jsonl(file_path, nodes)

In [None]:
# Read the nodes back from the file written above and report how many were loaded.
# `nodes` is rebound to the reloaded list.
nodes = load_nodes_jsonl(file_path)
print(len(nodes))

### Get processing node id

In [None]:
import os
# Load the nodes of the "processing" JSONL file; `file_path` and `nodes` are rebound here.
file_path = '/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/gpt-4o-batch-all-p_2_parser_ManuallyHierarchicalNodeParser_7879_processing.jsonl'
nodes = load_nodes_jsonl(file_path)

In [None]:
# State used by the cells below when replaying a per-process cache file.
_cache_process_path = '/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/pid-20.jsonl'
# Highest level index seen so far; -1 means no node has been loaded yet.
latest_level = -1
# Level index -> list of nodes loaded for that level.
_level2nodes = {}
# Level name -> its position in this list (document=0, section=1, paragraph=2, multi-sentences=3).
level2int = {level: i for i, level in enumerate(["document", "section", "paragraph", "multi-sentences"])}

In [None]:
# Check that the cache file can be opened for reading (raises if it cannot).
# The handle is closed immediately: the loading cell below opens the file itself,
# so keeping this handle open would only leak it.
with open(_cache_process_path, 'r') as cache_file:
    pass

In [None]:
# Replay the cache file: rebuild every node and group the nodes by level index.
if os.path.exists(_cache_process_path):
    file_size = os.path.getsize(_cache_process_path)
    # Binary mode so the progress bar advances in bytes, the unit of `file_size`;
    # json.loads() decodes the bytes itself.
    with open(_cache_process_path, 'rb') as cache_file:
        with tqdm(total=file_size, desc='Loading cache...', unit='B', unit_scale=True, unit_divisor=1024) as pbar:
            for line in cache_file:
                try:
                    node_dict = json.loads(line)
                    node = TextNode.from_dict(node_dict)
                    node_level = level2int[node.metadata['level']]
                    # Track the highest level seen so far.
                    latest_level = max(latest_level, node_level)
                    # setdefault() also accepts a level below `latest_level` that has not
                    # been seen before, instead of raising KeyError and dropping the node.
                    _level2nodes.setdefault(node_level, []).append(node)
                except Exception as e:
                    # Best effort: report the unreadable line and keep going.
                    print(e)
                # Update progress bar based on bytes read
                pbar.update(len(line))

In [41]:
_level2nodes[0][0].metadata

{'title': 'Selecting phototrophic species of native biocrusts in arid and semi-arid regions',
 'authors': ['Leila Kashi Zenouzi',
  'Hasan Kaboli',
  'Kazem Khavazi',
  'Mohammad Sohrabi',
  'Mohammad Khosroshahi',
  'Ulf Karsten',
  'Seyed Hasan Kaboli'],
 'file_name': 'Selecting_phototrophic_species_of_native.pdf',
 'level': 'document',
 'original_content': 'Title: Selecting phototrophic species of native biocrusts in arid and semi-arid regions\n\n[1. abstract]\nBackground: Biological soil crusts (BSCs) that are able to produce sticky extracellular polymeric substances (EPS) play an important role in the formation of soil aggregates, thereby, reducing soil erosion. In this study, experiments were undertaken to identify biocrust species that produce EPS, in order to combat desertification in the Sejzi desert of Iran. Methods: A biocrust distribution map of Sejzi plain was prepared using Landsat 8 OLI images, then, various sampling points were selected. Some physicochemical parameters 

In [None]:
len(_level2nodes)

In [None]:
print(_level2nodes)

In [None]:
print(len(_level2nodes[0]), len(_level2nodes[1]))

In [None]:
# Level whose cached nodes mark which inputs have already been handled.
level = 0
# `ref_doc_id` of every cached node at that level ...
processed_prev_level_nodes_id = {cached.ref_doc_id for cached in _level2nodes[level]}
# ... compared against the `id_` of each loaded node: keep the ones not seen yet.
prev_level_nodes = [
    candidate
    for candidate in nodes
    if candidate.id_ not in processed_prev_level_nodes_id
]

In [None]:
print(len(processed_prev_level_nodes_id))

In [None]:


def _init_get_nodes_from_nodes(nodes):
        latest_level = 0
        prev_level_nodes = nodes
        _cache_process_path = '/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/pid-0.jsonl'

        if _cache_process_path is not None:
            # init attributions
            latest_level = -1
            _level2nodes = {}
            level2int = {level: i for i, level in enumerate(["document", "section", "paragraph", "multi-sentences"])}
            
            # loading nodes
            if os.path.exists(_cache_process_path):
                file_size = os.path.getsize(_cache_process_path)
                with open(_cache_process_path, 'r') as cache_file:
                    with tqdm(total=file_size, desc='Loading cache...', unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                        for line in cache_file:
                            try:
                                node_dict = json.loads(line)
                                node = TextNode.from_dict(node_dict)
                                if level2int[node.metadata['level']] > latest_level:
                                    latest_level = level2int[node.metadata['level']]
                                    _level2nodes[latest_level] = []
                                    _level2nodes[latest_level].append(node)
                            except Exception as e:
                                print(e)
                            # Update progress bar based on bytes read
                            pbar.update(len(line))
            _cache_process_file = open(_cache_process_path, 'a+')

            latest_level = max(latest_level, 0)

            # get prev_level_nodes
            if latest_level == 1:
                latest_level = 0
                processed_prev_level_nodes_id = set()
                for node in _level2nodes[latest_level]:
                    processed_prev_level_nodes_id.add(node.ref_doc_id)
                prev_level_nodes = []
                for node in nodes:
                    if node.id_ not in processed_prev_level_nodes_id:
                        prev_level_nodes.append(node)
                
            if latest_level > 1:
                processed_prev_level_nodes_id = set()
                for node in _level2nodes[latest_level]:
                    parent_node_id = node.relationships[NodeRelationship.PARENT].node_id
                    processed_prev_level_nodes_id.add(parent_node_id)
                prev_level_nodes = []
                for node in _level2nodes[latest_level-1]:
                    if node.id_ not in processed_prev_level_nodes_id:
                        prev_level_nodes.append(node)

        return latest_level, prev_level_nodes

In [None]:
from llama_index.core.schema import BaseNode, Document, NodeRelationship

# Inspect the PARENT relationship entry of the last node in `nodes`.
# NOTE(review): this presumably fails with KeyError when that node has no parent — confirm.
print(nodes[-1].relationships[NodeRelationship.PARENT])

In [None]:
len(processed_section_nodes_id)

In [None]:
node_dict.keys()

In [None]:
len(node_dict['paragraph'])

In [None]:
len(node_dict['section'])