Run the grobid server locally and convert the pdf to xml files using grobid client.


In [33]:
#!grobid_client --input "/content/drive/MyDrive/dataset" --output "/content/drive/MyDrive/datasettei"

Preprocess the Xml Files

In [34]:
!pip install grobid_tei_xml
!pip install dateparser
!pip install tiktoken



In [35]:
import re
import os
from collections import OrderedDict
from bs4 import BeautifulSoup
import dateparser
import grobid_tei_xml
from pathlib import Path

import tiktoken
import copy
import json

In [36]:
# Define the path to the dataset folder
dataset_folder = "/content/drive/MyDrive/datasettei/"

# Define the output parent folder
output_parent_folder = "/content/drive/MyDrive/llama-2/"

In [37]:
def parse_grobid_xml(text):
    output_data = OrderedDict()

    doc_biblio = grobid_tei_xml.parse_document_xml(text)
    biblio = {
        "doi": doc_biblio.header.doi if doc_biblio.header.doi is not None else "",
        "authors": ", ".join([author.full_name for author in doc_biblio.header.authors]),
        "title": doc_biblio.header.title,
        "hash": doc_biblio.pdf_md5
    }
    try:
        year = dateparser.parse(doc_biblio.header.date).year
        biblio["publication_year"] = year
    except:
        pass

    output_data['biblio'] = biblio
    passages = []
    output_data['passages'] = passages
    passage_type = "paragraph"

    soup = BeautifulSoup(text, 'xml')
    blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)

    """passages.append({
        "text": f"authors: {biblio['authors']}",
       "type": passage_type,
        "section": "<header>",
        "subSection": "<title>",
        "passage_id": "htitle",
    })

    passages.append({
        "text": post_process(" ".join([node.text for node in blocks_header['title']])),
        "type": passage_type,
        "section": "<header>",
        "subSection": "<title>",
        "passage_id": "htitle",
    })"""

    passages.append({
        "text": post_process(
            ''.join(node.text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
                    text.parent.name != "ref" or (
                            text.parent.name == "ref" and text.parent.attrs[
                        'type'] != 'bibr'))),
        "type": passage_type,
        "section": "<header>",
        "subSection": "<abstract>",
        "passage_id": "habstract",
    })

    text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)

    use_paragraphs = True
    if not use_paragraphs:
        passages.extend([
            {
                "text": post_process(''.join(text for text in sentence.find_all(text=True) if
                                                  text.parent.name != "ref" or (
                                                          text.parent.name == "ref" and text.parent.attrs[
                                                      'type'] != 'bibr'))),
                "type": passage_type,
                "section": "<body>",
                "subSection": "<paragraph>",
                "passage_id": str(paragraph_id),
            }
            for paragraph_id, paragraph in enumerate(text_blocks_body) for
            sentence_id, sentence in enumerate(paragraph)
        ])
    else:
        passages.extend([
            {
                "text": post_process(''.join(text for text in paragraph.find_all(text=True) if
                                                  text.parent.name != "ref" or (
                                                          text.parent.name == "ref" and text.parent.attrs[
                                                      'type'] != 'bibr'))),
                "type": passage_type,
                "section": "<body>",
                "subSection": "<paragraph>",
                "passage_id": str(paragraph_id),
            }
            for paragraph_id, paragraph in enumerate(text_blocks_body)
        ])

    text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)

    if not use_paragraphs:
        passages.extend([
            {
                "text": post_process(''.join(text for text in sentence.find_all(text=True) if
                                                  text.parent.name != "ref" or (
                                                          text.parent.name == "ref" and text.parent.attrs[
                                                      'type'] != 'bibr'))),
                "type": passage_type,
                "section": "<body>",
                "subSection": "<figure>",
                "passage_id": str(paragraph_id) + str(sentence_id),
            }
            for paragraph_id, paragraph in enumerate(text_blocks_figures) for
            sentence_id, sentence in enumerate(paragraph)
        ])
    else:
        passages.extend([
            {
                "text": post_process(''.join(text for text in paragraph.find_all(text=True) if
                                                  text.parent.name != "ref" or (
                                                          text.parent.name == "ref" and text.parent.attrs[
                                                      'type'] != 'bibr'))),
                "type": passage_type,
                "section": "<body>",
                "subSection": "<figure>",
                "passage_id": str(paragraph_id),
            }
            for paragraph_id, paragraph in enumerate(text_blocks_figures)
        ])

    return output_data


In [38]:

def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> list:
    sub_tag = "p" if use_paragraphs else "s"

    header_elements = {
        "authors": [persNameNode for persNameNode in soup.teiHeader.find_all("persName")],
        "abstract": [p_in_abstract for abstractNodes in soup.teiHeader.find_all("abstract") for p_in_abstract in
                     abstractNodes.find_all(sub_tag)],
        "title": [soup.teiHeader.fileDesc.title]
    }

    return header_elements


def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
    nodes = []
    tag_name = "p" if use_paragraphs else "s"
    for child in soup.TEI.children:
        if child.name == 'text':
            # nodes.extend([subchild.find_all(tag_name) for subchild in child.find_all("body")])
            nodes.extend(
                [subsubchild for subchild in child.find_all("body") for subsubchild in subchild.find_all(tag_name)])

    if verbose:
        print(str(nodes))

    return nodes


def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
    children = []
    for child in soup.TEI.children:
        if child.name == 'text':
            children.extend(
                [subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])

    if verbose:
        print(str(children))

    return children

def post_process( text):
        output = text.replace('À', '-')
        output = output.replace('¼', '=')
        output = output.replace('þ', '+')
        output = output.replace('Â', 'x')
        output = output.replace('$', '~')
        output = output.replace('−', '-')
        output = output.replace('–', '-')
        patterns = [
        r'\d+e\d+'
        ]

        for pattern in patterns:
            output = re.sub(pattern, lambda match: match.group().replace('e', '-'), output)

        return output


In [39]:
enc = tiktoken.get_encoding("gpt2")

def encode( text, allowed_special=set(), disallowed_special="all"):
  return enc.encode(
      text,
      allowed_special=allowed_special,
      disallowed_special=disallowed_special,
    )



In [40]:
def merge_passages(passages, chunk_size, tolerance=0.2):
    new_passages = []
    current_texts = []

    for idx, passage in enumerate(passages):
        text = passage['text']
        current_texts.append(text)

        accumulated_text = " ".join(current_texts)
        encoded_accumulated_text = encode(accumulated_text)

        if len(encoded_accumulated_text) > chunk_size + chunk_size * tolerance:
            if len(current_texts) > 1:
                new_passages.append(current_texts[:-1])
                current_texts = [current_texts[-1]]
            else:
                new_passages.append(current_texts)
                current_texts = []

        elif chunk_size <= len(encoded_accumulated_text) < chunk_size + chunk_size * tolerance:
            new_passages.append(current_texts)
            current_texts = []

    if len(current_texts) > 0:
        new_passages.append(current_texts)

    new_passages_struct = []

    for i, passages in enumerate(new_passages):
        text = " ".join(passages)

        new_passages_struct.append(
            {
                "text": text,
                "type": "aggregated chunks",
                "section": "mixed",
                "subSection": "mixed"
            }
        )

    return new_passages_struct


In [41]:


# Create output parent folder if it doesn't exist
os.makedirs(output_parent_folder, exist_ok=True)

# Iterate through each file in the dataset folder
for file_path in Path(dataset_folder).glob("*.tei.xml"):
    text = ""
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Parse the XML and perform other operations
    output = parse_grobid_xml(text)

    # Extract relevant information
    output['filename'] = file_path
    biblio = output['biblio']
    filename = file_path.stem
    biblio['filename'] = filename.replace(" ", "_")

    # Create a separate folder for each file
    output_folder = os.path.join(output_parent_folder, f"{filename}_output")
    os.makedirs(output_folder, exist_ok=True)

    texts = []
    metadatas = []
    ids = []

    new_passages = merge_passages(output['passages'], chunk_size=250)

    # Iterate through each passage in the new_passages
    for passage_id, passage in enumerate(new_passages):
        biblio_copy = copy.copy(biblio)

        if len(str.strip(passage['text'])) > 0:
            texts.append(passage['text'])

            biblio_copy['type'] = passage['type']
            biblio_copy['section'] = passage['section']
            biblio_copy['subSection'] = passage['subSection']
            metadatas.append(biblio_copy)

            ids.append(passage_id)

    # Store the results in separate files within the output folder
    output_text_path = os.path.join(output_folder, f"{filename}_text.txt")
    output_metadata_path = os.path.join(output_folder, f"{filename}_metadata.json")

    with open(output_text_path, 'w', encoding='utf-8') as text_file:
        text_file.write("\n".join(texts))

    with open(output_metadata_path, 'w', encoding='utf-8') as metadata_file:
        json.dump(metadatas, metadata_file, indent=2)



  ''.join(node.text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
  "text": post_process(''.join(text for text in paragraph.find_all(text=True) if
  "text": post_process(''.join(text for text in paragraph.find_all(text=True) if
