In [None]:
# Mount Google Drive at /content/drive; the book path and the output folders
# used further down in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [None]:
import json
import requests
import re
import pandas as pd
import os

# Functions

This notebook tags Portuguese books with LX-Tagger. You need a working API key; it is easy to get one by following the steps at https://portulanclarin.net/workbench/lx-tagger/. Enter your API key and indicate the .txt file of the book you want tagged. Depending on the size of the book, some extra steps may be needed (tagging in several runs and then joining the resulting files).

In [None]:
class WSException(Exception):
    '''
    Webservice Exception.

    Wraps the JSON-RPC "error" object returned by the webservice and exposes a
    human-readable description through `message` and `str()`.
    '''

    def __init__(self, errordata):
        '''
        Arguments
            errordata: a dict returned by the webservice with details about the
                 error; it must contain "message" and normally also carries
                 "code" and, for server errors, "data"
        '''
        assert isinstance(errordata, dict)
        message = errordata["message"]
        # see https://json-rpc.readthedocs.io/en/latest/exceptions.html for more info
        # about JSON-RPC error codes
        code = errordata.get("code")
        if isinstance(code, int) and -32099 <= code <= -32000:  # Server Error
            # "data" is optional in a JSON-RPC error object, so never index it
            # blindly: a failure here would hide the real webservice error.
            data = errordata.get("data")
            if (
                isinstance(data, dict)
                and data.get("type") == "WebServiceException"
                and "message" in data
            ):
                message += f": {data['message']}"
            elif data is not None:
                message += f": {data!r}"
        # Hand the final text to Exception so that args holds the message
        # rather than a reference to the exception itself.
        super().__init__(message)
        self.message = message

    def __str__(self):
        return self.message

In [None]:
def tag(text, format, timeout=120):
    '''
    Send one piece of text to the LX-Tagger webservice and return its output.

    Arguments
        text: a string with a maximum of 4000 characters, Portuguese text, with
             the input to be processed
        format: either 'CINTIL', 'CONLL' or 'JSON'
             (the name shadows the builtin but is kept so that existing
             keyword calls such as tag(chunk, format="JSON") keep working)
        timeout: seconds to wait for the webservice before giving up, so that a
             stalled connection cannot block the notebook forever

    Returns the "result" member of the JSON-RPC response, formatted according
       to the specification in
       https://portulanclarin.net/workbench/lx-tagger/

    Raises a WSException if the webservice reports an error. Network problems,
    timeouts and HTTP error pages without a JSON body surface as the
    corresponding `requests` exceptions.
    '''

    request_data = {
        'method': 'tag',
        'jsonrpc': '2.0',
        'id': 0,
        'params': {
            'text': text,
            'format': format,
            'key': LXTAGGER_WS_API_KEY,
        },
    }
    response = requests.post(LXTAGGER_WS_API_URL, json=request_data, timeout=timeout)
    try:
        response_data = response.json()
    except ValueError:
        # The body is not JSON (e.g. a gateway error page): report the HTTP
        # failure when there is one, otherwise re-raise the decoding error.
        response.raise_for_status()
        raise
    if "error" in response_data:
        raise WSException(response_data["error"])
    return response_data["result"]

In [None]:
def slice_into_chunks(lines, max_chunk_size=4000):
    '''
    Group consecutive lines into newline-joined chunks of bounded length.

    Arguments
        lines: an iterable of strings (lines without their trailing newline)
        max_chunk_size: maximum number of characters allowed in one chunk,
             counting the newlines that join the lines

    Yields each chunk as a single string. A line that is longer than
    max_chunk_size on its own is truncated to max_chunk_size characters and a
    notice is printed. No empty chunk is produced when the over-long line is
    the first one.
    '''
    chunk, chunk_size = [], 0
    for lnum, line in enumerate(lines, start=1):
        if len(line) > max_chunk_size:
            print(f"line {lnum} is longer than {max_chunk_size} characters; truncating")
            line = line[:max_chunk_size]
        # chunk_size already counts one newline per stored line, so
        # chunk_size + len(line) is the length of the chunk after joining.
        if chunk and chunk_size + len(line) > max_chunk_size:
            yield "\n".join(chunk)
            chunk, chunk_size = [], 0
        chunk.append(line)
        chunk_size += len(line) + 1
        # the + 1 above is for the newline character terminating each line
    if chunk:
        yield "\n".join(chunk)

Enter your API key here

In [None]:
# The API key is a personal credential, so it is read from the environment
# instead of being stored in the notebook. Provide it before running this cell,
# e.g. in a private cell:  os.environ["LXTAGGER_WS_API_KEY"] = "<your key>"
# NOTE(review): a key used to be written literally in this cell; treat that key
# as exposed and request a new one.
LXTAGGER_WS_API_KEY = os.environ.get("LXTAGGER_WS_API_KEY", "")
if not LXTAGGER_WS_API_KEY:
    print("LXTAGGER_WS_API_KEY is not set; set it to your LX-Tagger API key before tagging.")

# Endpoint of the LX-Tagger JSON-RPC webservice.
LXTAGGER_WS_API_URL = 'https://portulanclarin.net/workbench/lx-tagger/api/'

Set the path to the .txt file you want to tag

In [None]:
# Plain-text (.txt) book that will be read and sent to the tagger.
path = (
    '/content/drive/MyDrive/SNA Pipeline (Tese)/Livros (1)/'
    'O Crime do Padre Amaro.txt'
)

## Tag in chunks

The output location (`excel_path`, inside `Import_to_Tag`) must point to the folder where the tagged Excel file should be saved.

In [None]:

def Import_to_Tag(path, start_chunk=1,
                  output_dir='/content/drive/MyDrive/SNA Pipeline (Tese)/POS Books (1)'):
    '''
    Tag a book chunk by chunk and save the annotations to an Excel file.

    Arguments
        path: path to the UTF-8 .txt file of the book
        start_chunk: 1-based number of the first chunk to tag; use the chunk
             after the last one tagged by a previous run to resume
        output_dir: folder that receives the Excel file; it is created if it
             does not exist yet

    The book is read, blank lines are dropped, and the remaining lines are
    grouped with slice_into_chunks. Chunks are sent to tag() one at a time;
    tagging stops at the first chunk that fails so that everything tagged up to
    that point can still be saved. The file is named
    "POS <book name> (<first chunk>-<last tagged chunk>).xlsx".

    Nothing is written when no chunk could be tagged, so an empty file never
    claims a chunk range that was not actually processed.

    Raises a ValueError if start_chunk is smaller than 1.
    '''
    if start_chunk < 1:
        # chunks[start_chunk-1:] would otherwise slice from the end of the list
        raise ValueError("start_chunk must be 1 or greater")

    with open(path, 'r', encoding='utf-8') as file:
        sample_text_lines = [line.strip() for line in file if line.strip()]

    chunks = list(slice_into_chunks(sample_text_lines))
    annotated_text = []

    final_chunk_processed = None  # stays None until a chunk is tagged successfully

    for cnum, chunk in enumerate(chunks[start_chunk-1:], start=start_chunk):
        try:
            annotated_text.extend(tag(chunk, format="JSON"))
            final_chunk_processed = cnum  # Update after each successful chunk
            print(".", end="", flush=True)  # Progress feedback
        except Exception as exc:
            # Deliberately broad: whatever went wrong, stop here and keep the
            # annotations collected so far.
            words = chunk.split()
            chunk_preview = ' '.join(words[:10]) + "..." if len(words) > 10 else chunk
            print(f"\nError: annotation of chunk {cnum} failed ({exc}); chunk contents:\n{chunk_preview}\n")
            break

    if final_chunk_processed is None:
        print(f"\nNo chunk was tagged (the book has {len(chunks)} chunks, "
              f"requested start was {start_chunk}); nothing was saved.")
        return

    # Extracting book name from the path for the Excel filename
    stem, extension = os.path.splitext(os.path.basename(path))
    book_name = stem if extension == ".txt" and stem else "output"

    # Adding chunk range to the book name
    book_name_with_chunks = f"{book_name} ({start_chunk}-{final_chunk_processed})"

    # Convert the annotated text to a DataFrame and save it to an Excel file
    df = pd.DataFrame(annotated_text)
    os.makedirs(output_dir, exist_ok=True)
    excel_path = os.path.join(output_dir, f'POS {book_name_with_chunks}.xlsx')
    df.to_excel(excel_path, index=False)
    print(f'\nYour book "{book_name_with_chunks}" was tagged :))')

    if final_chunk_processed < len(chunks):
        print(f"Chunks {final_chunk_processed + 1}-{len(chunks)} are still untagged; "
              f"run again with start_chunk={final_chunk_processed + 1}.")



Start with chunk 1. If a run stops before the end of the book, run the cell again with the next untagged chunk number as the starting chunk.

In [None]:
# Resume tagging of the book at `path` from chunk 200 onwards.
Import_to_Tag(path, start_chunk=200)

.....................
Your book "O Crime do Padre Amaro (200-220)" was tagged :))


## Join the Chunks

If the book was too large to be tagged in a single run, run this code to join all the annotated chunk files into one.
The folder paths inside the function need to be set to your own folders.

In [None]:
def consolidate_book_chunks(path,
                            dir_path="/content/drive/MyDrive/SNA Pipeline (Tese)/POS Books (1)",
                            output_dir=None):
    '''
    Join the per-run "POS <book name> (a-b).xlsx" files of a book into one file.

    Arguments
        path: path of the book's .txt file; only the file name is used, to
             derive the book name
        dir_path: folder that contains the chunk files
        output_dir: folder for the consolidated file; defaults to the
             "complete (1)" folder inside dir_path and is created if missing

    The chunk files are ordered by the first number of their range,
    concatenated, and saved as
    "POS <book name> (<first start>-<last end>).xlsx".

    Returns None. Prints a message and stops early when the path is not a .txt
    file, when dir_path does not exist, or when no chunk file matches.
    '''
    if output_dir is None:
        output_dir = os.path.join(dir_path, "complete (1)")

    # Extract the base book name from the path, without the .txt extension
    book_name, extension = os.path.splitext(os.path.basename(path))
    if extension != ".txt" or not book_name:
        print("Invalid book path.")
        return

    # Pattern to match files starting with "POS", followed by the book name, and a chunk range in parentheses
    pattern = re.compile(rf'^POS {re.escape(book_name)} \((\d+)-(\d+)\)\.xlsx$')

    try:
        files = os.listdir(dir_path)
    except FileNotFoundError:
        print(f"Directory not found: {dir_path}")
        return

    # Collect (start, end, file name) for every chunk file, ordered by range
    chunk_files = []
    for name in files:
        found = pattern.match(name)
        if found:
            chunk_files.append((int(found.group(1)), int(found.group(2)), name))
    chunk_files.sort()

    if not chunk_files:
        print(f"No chunked files found for the book: {book_name}")
        return
    print([name for _, _, name in chunk_files])

    # The consolidated name claims the whole first-last range, so point out
    # any file that does not start right after the previous one ends.
    for (_, prev_end, prev_name), (next_start, _, next_name) in zip(chunk_files, chunk_files[1:]):
        if next_start != prev_end + 1:
            print(f'Warning: "{next_name}" does not continue right after "{prev_name}"; '
                  'the consolidated file may have missing or repeated chunks.')

    # Read every file first and concatenate once (avoids re-copying the
    # growing frame on every iteration)
    frames = [pd.read_excel(os.path.join(dir_path, name)) for _, _, name in chunk_files]
    master_df = pd.concat(frames, ignore_index=True)

    # Get the full range from the first and last file
    first_chunk = chunk_files[0][0]
    last_chunk = chunk_files[-1][1]
    consolidated_filename = f"POS {book_name} ({first_chunk}-{last_chunk}).xlsx"
    os.makedirs(output_dir, exist_ok=True)
    consolidated_path = os.path.join(output_dir, consolidated_filename)

    # Save the consolidated DataFrame
    master_df.to_excel(consolidated_path, index=False)
    print(f'Consolidated book saved as "{consolidated_filename}"')





In [None]:

def consolidate_book_chunks(path,
                            dir_path="/content/drive/MyDrive/SNA Pipeline/POS Books",
                            output_dir=None):
    '''
    Join the per-run "POS <book name> (a-b).xlsx" files of a book into one file.

    NOTE(review): this notebook defines consolidate_book_chunks twice. This
    later definition is the one in effect once all cells have run, and its
    default dir_path differs from the earlier definition's; keep only one.

    Arguments
        path: path of the book's .txt file; only the file name is used, to
             derive the book name
        dir_path: folder that contains the chunk files
        output_dir: folder for the consolidated file; defaults to the
             "complete" folder inside dir_path and is created if missing

    The chunk files are ordered by the first number of their range,
    concatenated, and saved as
    "POS <book name> (<first start>-<last end>).xlsx".

    Returns None. Prints a message and stops early when the path is not a .txt
    file, when dir_path does not exist, or when no chunk file matches.
    '''
    if output_dir is None:
        output_dir = os.path.join(dir_path, "complete")

    # Extract the base book name from the path, without the .txt extension
    book_name, extension = os.path.splitext(os.path.basename(path))
    if extension != ".txt" or not book_name:
        print("Invalid book path.")
        return
    print("Book name extracted:", book_name)

    # The parentheses around the chunk range are optional here, so both
    # "POS <book> (1-199).xlsx" and "POS <book> 1-199.xlsx" are accepted.
    # No other text is allowed between the book name and the range.
    pattern = re.compile(rf'^POS {re.escape(book_name)} \(?(\d+)-(\d+)\)?\.xlsx$')

    try:
        files = os.listdir(dir_path)
    except FileNotFoundError:
        print(f"Directory not found: {dir_path}")
        return

    # Collect (start, end, file name) for every chunk file, ordered by range
    chunk_files = []
    for name in files:
        found = pattern.match(name)
        if found:
            chunk_files.append((int(found.group(1)), int(found.group(2)), name))
    chunk_files.sort()

    if not chunk_files:
        print(f"No chunked files found for the book: {book_name}")
        return
    print("Files found:", [name for _, _, name in chunk_files])

    # The consolidated name claims the whole first-last range, so point out
    # any file that does not start right after the previous one ends.
    for (_, prev_end, prev_name), (next_start, _, next_name) in zip(chunk_files, chunk_files[1:]):
        if next_start != prev_end + 1:
            print(f'Warning: "{next_name}" does not continue right after "{prev_name}"; '
                  'the consolidated file may have missing or repeated chunks.')

    # Read every file first and concatenate once (avoids re-copying the
    # growing frame on every iteration)
    frames = [pd.read_excel(os.path.join(dir_path, name)) for _, _, name in chunk_files]
    master_df = pd.concat(frames, ignore_index=True)

    # Derive consolidated filename from the range of chunks
    first_chunk = chunk_files[0][0]
    last_chunk = chunk_files[-1][1]
    consolidated_filename = f"POS {book_name} ({first_chunk}-{last_chunk}).xlsx"
    os.makedirs(output_dir, exist_ok=True)
    consolidated_path = os.path.join(output_dir, consolidated_filename)

    # Save the consolidated DataFrame
    master_df.to_excel(consolidated_path, index=False)
    print(f'Consolidated book saved as "{consolidated_filename}"')


In [None]:
# Join all tagged chunk files of the book at `path` into one consolidated Excel file.
consolidate_book_chunks(path)

O Crime do Padre Amaro
re.compile('^POS O\\ Crime\\ do\\ Padre\\ Amaro \\((\\d+)-(\\d+)\\)\\.xlsx$')
['complete (1)', 'full names (1)', 'pos antigos', 'complete antigo', 'POS O Crime do Padre Amaro (1-199).xlsx', 'POS O Crime do Padre Amaro (200-220).xlsx']
['POS O Crime do Padre Amaro (1-199).xlsx', 'POS O Crime do Padre Amaro (200-220).xlsx']
Consolidated book saved as "POS O Crime do Padre Amaro (1-220).xlsx"
