### Reader

In [1]:
import os
import xml.etree.ElementTree as ET
# NOTE(review): absolute, machine-specific path — consider making it configurable.
cache_path = '/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache'
# os.listdir returns entries in arbitrary order; sort the matches so that the
# file selected later via next(iter_filenames) is the same on every run.
filenames = sorted(filename for filename in os.listdir(cache_path) if filename.endswith('tei.xml'))


def get_full_text(element):
    """Return all text contained in ``element``, with inline markup flattened.

    The result is the element's own leading text, followed by the text of
    every descendant and the tail text that follows each descendant, in
    document order. The tail of ``element`` itself is not included.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        The concatenated text with leading and trailing whitespace stripped;
        an empty string when the element holds no text.
    """
    # itertext() yields each descendant's tail exactly once. The previous
    # approach appended `subelement.tail` after ET.tostring(..., method='text'),
    # which already serialises that tail, so trailing text was duplicated.
    return ''.join(element.itertext()).strip()

# Single shared iterator: each next(iter_filenames) call advances to another file.
iter_filenames = iter(filenames)

In [5]:
# Pull the next file name from the shared iterator and resolve it inside the cache directory.
next_filename = next(iter_filenames)
file_path = os.path.join(cache_path, next_filename)
file_path

'/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/1_H_NMR_studies_of_molecular_interaction.grobid.tei.xml'

In [6]:
# Prefix map passed as `namespaces=` to the find/findall queries on this document.
namespace = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Start an empty record for the selected file and parse its XML tree.
file_dict = dict()
tree = ET.parse(file_path)
root = tree.getroot()

In [7]:
# title
title = root.find('.//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title', namespaces=namespace)
# root.find returns None when no matching <title> exists; guard it the same way
# the abstract lookup does instead of raising AttributeError on `.text`.
if title is not None:
    file_dict['title'] = title.text
file_dict

{'title': 'H NMR studies of molecular interaction of D-glucosamine and N-acetyl-Dglucosamine with capsaicin in aqueous and non-aqueous media'}

In [8]:
# authors
authors = root.findall(
    './/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:analytic/tei:author/tei:persName',
    namespaces=namespace,
)
file_dict['authors'] = []
for author in authors:
    given_names = [node.text for node in author.findall('tei:forename', namespaces=namespace)]
    if not given_names:
        # persName entries without any forename are skipped entirely.
        continue
    family_names = [node.text for node in author.findall('tei:surname', namespaces=namespace)]
    # "<forenames> <surnames>", each group joined by single spaces.
    file_dict['authors'].append(f"{' '.join(given_names)} {' '.join(family_names)}")
file_dict

{'title': 'H NMR studies of molecular interaction of D-glucosamine and N-acetyl-Dglucosamine with capsaicin in aqueous and non-aqueous media',
 'authors': ['Inocencio Higuera-Ciapara',
  'Claudia Virués',
  'Marcela Jiménez-Chávez',
  'Evelin Martínez-Benavidez',
  'Javier Hernández',
  'Zaira Domínguez',
  'Roberto López-Rendón',
  'Enrique F Velázquez',
  'Motomichi Inoue',
  'Evelin Martínez- Benavidez']}

In [10]:
# abstract
abstract_xpath = './/tei:teiHeader/tei:profileDesc/tei:abstract/tei:div/tei:p'
abstract = root.find(abstract_xpath, namespaces=namespace)
print(abstract)
# Only the first matching <p> is looked up, and only its leading text is stored.
if abstract is not None:
    file_dict.update(abstract=abstract.text)
file_dict

<Element '{http://www.tei-c.org/ns/1.0}p' at 0x7f47956d8ae0>


{'title': 'H NMR studies of molecular interaction of D-glucosamine and N-acetyl-Dglucosamine with capsaicin in aqueous and non-aqueous media',
 'authors': ['Inocencio Higuera-Ciapara',
  'Claudia Virués',
  'Marcela Jiménez-Chávez',
  'Evelin Martínez-Benavidez',
  'Javier Hernández',
  'Zaira Domínguez',
  'Roberto López-Rendón',
  'Enrique F Velázquez',
  'Motomichi Inoue',
  'Evelin Martínez- Benavidez'],
 'abstract': '►Interaction of glucosamine with capsaicin is sensitive to pH and solvent.'}

In [11]:
# Body
body = root.findall('.//tei:text/tei:body/tei:div', namespaces=namespace)

for i, child in enumerate(body):
    ps = child.findall('.//tei:p', namespaces=namespace)
    if len(ps) == 0:
        # Divisions without paragraphs contribute nothing.
        continue
    # A division may have no <head>, or a <head> without direct text; fall back
    # to a positional key instead of raising AttributeError or storing the
    # content under a None key.
    head_element = child.find('.//tei:head', namespaces=namespace)
    if head_element is not None and head_element.text:
        head = head_element.text
    else:
        head = f'untitled_section_{i}'
    content = '\n'.join(get_full_text(p) for p in ps)
    # NOTE(review): a repeated heading overwrites the earlier section stored under the same key.
    file_dict[head] = content
file_dict

{'title': 'H NMR studies of molecular interaction of D-glucosamine and N-acetyl-Dglucosamine with capsaicin in aqueous and non-aqueous media',
 'authors': ['Inocencio Higuera-Ciapara',
  'Claudia Virués',
  'Marcela Jiménez-Chávez',
  'Evelin Martínez-Benavidez',
  'Javier Hernández',
  'Zaira Domínguez',
  'Roberto López-Rendón',
  'Enrique F Velázquez',
  'Motomichi Inoue',
  'Evelin Martínez- Benavidez'],
 'abstract': '►Interaction of glucosamine with capsaicin is sensitive to pH and solvent.',
 'Introduction': 'Capsaicin, N-(4-hydroxy-3-methoxybenzyl)-8-methyl-trans-6-nonenamide, is the major capsaicinoid of chili peppers (Capsicum spp.). The molecule (shown in Scheme 1) consists of a hydrophilic unit (region A composed of a vanillyl ring) and a hydrophobic unit (region C composed of a fatty acid), which are linked by an amide linkage (region B), conferring amphipathic characters on the compound; every region is expected to contribute to the biological activities [1,2]. The well-kn

In [None]:
from utils.custom_document_reader import CustomDocumentReader

# Configure the reader first, then load the documents as a separate step.
# NOTE(review): `_load_data` is underscore-prefixed — confirm whether a public loader should be used.
reader = CustomDocumentReader(
    input_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/data",
    cache_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache",
    config_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/utils/config.json",
    remove_cache=False,
)
documents = reader._load_data()

### CustomSplitter

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter settings gathered in one place, then unpacked into the constructor.
splitter_options = {
    'chunk_size': 1024,
    'include_metadata': True,
    'include_prev_next_rel': True,
}
ss1 = SentenceSplitter(**splitter_options)

In [None]:
from llama_index.core.storage.docstore import SimpleDocumentStore
# Where the document store is written once it has been filled.
persist_path = "/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache/test.json"

docstore = SimpleDocumentStore()
# Split the loaded documents with ss1, add the resulting nodes, and persist the store.
nodes = ss1.get_nodes_from_documents(documents)
docstore.add_documents(nodes)
docstore.persist(persist_path=persist_path)


In [None]:
from llama_index.core.node_parser import SimpleFileNodeParser

# Rebinds `nodes` to the output of the file-based parser.
file_node_parser = SimpleFileNodeParser()
nodes = file_node_parser.get_nodes_from_documents(documents=documents)

In [None]:
# Dump every field of the first node, one key/value pair per block.
first_node_fields = nodes[0].to_dict()
for field_name, field_value in first_node_fields.items():
    print(f"key: {field_name}\nvalue: {field_value}\n\n")

In [None]:
from utils.custom_document_reader import CustomDocumentReader

# Configure the reader first, then load the documents as a separate step.
reader = CustomDocumentReader(
    input_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/data",
    cache_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache",
    config_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/utils/config.json",
    remove_cache=False,
)
documents = reader._load_data()

from utils.custom_parser import CustomHierarchicalNodeParser
# NOTE(review): this cell calls the underscore-prefixed `_parse_nodes` directly;
# its return value is the cell's displayed output.
hnp = CustomHierarchicalNodeParser.from_defaults()
hnp._parse_nodes(documents)

### Test custom document reader

In [None]:
from utils.custom_document_reader import CustomDocumentReader

# Configure the reader first, then load the documents as a separate step.
reader = CustomDocumentReader(
    input_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/data",
    cache_dir="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/.cache",
    config_path="/home/zhengzheng/scratch0/projects/Fine-Tuned-GPT-2-with-articles-ground-truth/code/llamaIndex/utils/config.json",
    remove_cache=False,
)
documents = reader._load_data()

from utils.custom_parser import CustomHierarchicalNodeParser
# Run the hierarchical parser over the loaded documents and keep the result in `nodes`.
hnp = CustomHierarchicalNodeParser.from_defaults()
nodes = hnp.get_nodes_from_documents(documents)

In [None]:
# Display how many items the reader returned in `documents`.
len(documents)

In [None]:
# Display how many items are currently bound to `nodes`.
len(nodes)

In [None]:
from utils.custom_extractor import HuggingfaceBasedExtractor, OllamaBasedExtractor, OpenAIBasedExtractor

# Value handed to the extractor's `only_meta` argument.
only_meta = {'level': ['section']}
extractor = OllamaBasedExtractor(
    model_name='sammcj/sfr-embedding-mistral:Q4_K_M',
    only_meta=only_meta,
)

# The return value of extract() is the cell's displayed output.
extractor.extract(nodes)

### Pydantic

In [None]:
from pydantic import BaseModel
from typing import List

from llama_index.core.program import LLMTextCompletionProgram


class DocumentSummary(BaseModel):
    """Data model for a summary of a document."""

    # The model's only field: the summary, as a plain string.
    summary: str


# NOTE(review): this prompt asks for an album inspired by a movie, while the
# program's output_cls is DocumentSummary (a single `summary` field) — confirm
# the intended prompt and template variable before running.
prompt_template_str = (
    "Generate an example album, with an artist and a list of songs. "
    "Using the movie {movie_name} as inspiration."
)

# Text-completion program whose output class is DocumentSummary.
program = LLMTextCompletionProgram.from_defaults(
    prompt_template_str=prompt_template_str,
    output_cls=DocumentSummary,
    verbose=True,
)