# Introduction

@Author: Yingding Wang\
@CreatedOn: 20.11.2023

This notebook shows an example of loading PDF documents from an S3 bucket and translating their text from German into English.

In [1]:
#list=!nvidia-smi -L
#for i in range(len(list)):
#    print(list[i])

In [2]:
import os, time
from platform import python_version

# Show the Python version of the running kernel.
print(python_version())

3.8.10


## Init the GPU environments

In [3]:
from util.accelerator_utils import AcceleratorStatus, AcceleratorHelper
# Status reporter; its gpu_usage() is called below and inside the translate closure.
gpu_status = AcceleratorStatus()

In [4]:
# Report the GPU state before any model has been loaded in this notebook.
gpu_status.gpu_usage()

num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Device idx       : 0 
No. of processors: 28
Physical  memory : 19.500000 GB
Reserved  memory : 0.000000 GB
Allocated memory : 0.000000 GB
Free      memory : 0.000000 GB
--------------------


In [5]:
gpu_helper = AcceleratorHelper()
# Query the NVIDIA device UUIDs with the is_mig=True filter and logging disabled.
# NOTE(review): presumably this returns only the MIG slices — confirm in util.accelerator_utils.
UUIDs = gpu_helper.nvidia_device_uuids_filtered_by(is_mig=True, log_output=False)
# print(UUIDs)

In [6]:
# set the model download cache directory
# NOTE(review): display_architecture is not referenced anywhere else in this notebook.
display_architecture=True
DATA_ROOT="/home/jovyan/llm-models"

# NOTE(review): judging by the values printed at the end of this cell, this call
# presumably sets CUDA_VISIBLE_DEVICES and XDG_CACHE_HOME — confirm in util.accelerator_utils.
gpu_helper.init_cuda_torch(UUIDs, f"{DATA_ROOT}/core-kind/yinwang")

# Short alias -> Hugging Face model id; the trailing comments are the sizes noted by the author.
model_map = {
   "small": "google/mt5-small", # 1.2 GB
   "base" : "google/mt5-base", # 2.33 GB
   "large" : "google/mt5-large", # 4.9 GB,
   "xl" : "google/mt5-xl", # 15 GB
   "xxl" : "google/mt5-xxl", # 51.7 GB,
   "custom": "Helsinki-NLP/opus-mt-de-en", 
}

# Indexing os.environ (instead of .get) raises KeyError if either variable is unset.
print(os.environ["CUDA_VISIBLE_DEVICES"])
print(os.environ["XDG_CACHE_HOME"])

MIG-0efc9f06-6dca-5886-98af-0273ca7fde51
/home/jovyan/llm-models/core-kind/yinwang/models


## Setting up translation model

In [7]:
# Alias of the model to load; must be one of the keys of model_map.
model_type = "custom"
# Fall back to the model id stored under "small". The previous default was the
# bare key "small", which is an alias and not a valid Hugging Face model id.
model_name = model_map.get(model_type, model_map["small"])

print(model_name)

Helsinki-NLP/opus-mt-de-en


In [8]:
from transformers import pipeline
import transformers

In [9]:
# device_map="auto" doesn't work with the "Helsinki-NLP/opus-mt-de-en" translator
# model, so the pipeline is placed explicitly on GPU device id 0 via device=0.
generator = pipeline("translation", model=model_name, device=0)

In [10]:
#type(generator)

In [11]:
# task_prefix = "translate English to German: "
# task_prefix = "translate German to English: "
# task_prefix = "übersetze Deutsch zum Englisch: "
# Reference: https://huggingface.co/docs/transformers/model_doc/marian
def translate_gen(
    generator: transformers.pipelines.text2text_generation.TranslationPipeline,
    info: AcceleratorStatus,
):
    """Build a translation function bound to a pipeline and a GPU status reporter.

    Args:
      generator: translation pipeline; it is called as
        ``generator(sentences, max_length=...)``
      info: object whose ``gpu_usage()`` is called after each verbose translation

    Returns:
      a function ``local(sentences, max_length=400, verbose=True)`` that runs
      the pipeline and returns its result unchanged
    """

    def local(sentences: list, max_length=400, verbose: bool = True) -> list:
        """Translate ``sentences`` and optionally report wall time and GPU usage.

        Args:
          sentences: input text(s), forwarded as-is to the pipeline
          max_length: forwarded to the pipeline as ``max_length``
          verbose: when True, print the elapsed wall time and call
            ``info.gpu_usage()``

        Returns:
          whatever the pipeline returns for ``sentences``
        """
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments the way a time.time() difference can.
        start = time.perf_counter()
        result = generator(sentences, max_length=max_length)
        duration = time.perf_counter() - start

        if verbose:
            print("-"*20)
            print(f"walltime: {duration} in secs.")
            info.gpu_usage()

        return result

    return local

# Bind the pipeline and the GPU status reporter into the notebook-wide translate() function.
translate = translate_gen(generator, gpu_status)

In [12]:
# NOTE(review): the name `input` shadows the Python builtin input() for the rest of the kernel session.
input=["Das Haus ist wunderbar."]
# input="Das Haus ist wunderbar."

In [13]:
%timeit
translate(input, max_length=1000)

--------------------
walltime: 0.7152624130249023 in secs.
num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Device idx       : 0 
No. of processors: 28
Physical  memory : 19.500000 GB
Reserved  memory : 0.310547 GB
Allocated memory : 0.285861 GB
Free      memory : 0.024686 GB
--------------------


[{'translation_text': 'The house is wonderful.'}]

## Loading pdf content from s3 buckets

In [14]:
import os
from pypdf import PdfReader

In [15]:
from util.objectstore_utils import S3BucketHelper, S3AccessConf

# Credentials and endpoint are read from the environment. os.environ.get returns
# None for a variable that is not set, so a missing variable is not detected here.
s3_conf = S3AccessConf(
    bucket_name = "scivias-medreports",
    access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
    secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY'),
    endpoint = os.environ.get('S3_ENDPOINT'),
    verify_host = True
)


# NOTE(review): file_prefix presumably restricts the helper to object keys starting
# with "KK-SCIVIAS" — confirm in util.objectstore_utils.
bucket_helper = S3BucketHelper(conf=s3_conf, file_prefix="KK-SCIVIAS")

In [16]:
# Upper bound passed to get_object_keys; kept small here for a quick demo run.
# NOTE(review): -1 presumably means "no limit" — confirm in S3BucketHelper.
# items_max_limit = -1
items_max_limit = 3
item_map = bucket_helper.get_object_keys(items_max_limit)

In [17]:
# item_list = list(item_map)
# len(item_list)

In [18]:
def create_pdf_reader(id_bytes: dict):
    """Pair an object's name with a PdfReader opened on its in-memory bytes.

    Args:
      id_bytes: dict read via the keys 'name' and 'bytesio'; the 'bytesio'
        value is handed to PdfReader

    Returns:
      a dict with the keys "name" and "reader"
    """
    doc_name = id_bytes.get('name')
    pdf_reader = PdfReader(id_bytes.get('bytesio'))
    return {"name": doc_name, "reader": pdf_reader}

In [19]:
# NOTE(review): assumes transform_objects calls create_pdf_reader once per listed object
# with a dict carrying 'name' and 'bytesio' — confirm in S3BucketHelper.
reader_map = bucket_helper.transform_objects(item_map, create_pdf_reader)

In [20]:
def read_pdf_pages(id_reader: dict) -> dict:
    '''
    Extract the text of every page of one pdf and concatenate it into a single string.

    Author's note: one pdf page can hold up to about 2.5K tokens, which is why
    the page texts are joined here and split into smaller pieces afterwards.

    Args:
      id_reader: dict with a 'name' and a 'reader' whose ``pages`` are iterated

    Returns:
      a dict with "name" and "content" (page texts joined without separator)
    '''
    doc_name = id_reader.get('name')
    page_texts = []
    for page in id_reader.get('reader').pages:
        page_texts.append(page.extract_text())
    return {"name": doc_name, "content": "".join(page_texts)}

In [21]:
# map() is lazy and single-use: no text is extracted until this iterator is consumed,
# and it can only be consumed once.
raw_pdf_map = map(read_pdf_pages, reader_map)

In [22]:
# https://stackoverflow.com/questions/13673060/split-string-into-strings-by-length
def wrap(name, s, w):
    """
    Split a string into consecutive chunks of at most w characters.

    Args:
      name: identifier passed through unchanged under the "name" key
      s: original str
      w: width (in characters) of each chunk; must be a positive integer

    Return:
      a dict with "name" and "pages"; "pages" is the list of chunks of s in
      order. Every chunk has length w except possibly the last one, which may
      be shorter. An empty s yields an empty list.

    Raises:
      ValueError: if w is not positive
    """
    # A negative width makes range() yield nothing, which would silently drop
    # the whole text; zero already failed inside range(). Reject both up front.
    if w <= 0:
        raise ValueError(f"w must be a positive integer, got {w!r}")
    return {
        "name": name,
        "pages": [s[i:i + w] for i in range(0, len(s), w)]
    }

In [23]:
# Lazily split each document's text into chunks of 350 characters
# (wrap slices the string by characters, not by model tokens).
pdf_pages_map = map(lambda x: wrap(x.get('name'), x.get('content'), 350), raw_pdf_map)

In [24]:
# list of pdf documents; each item is a dict holding the document name and its 350-character splits
# doc_list = list(pdf_pages_map)

In [25]:
# split_0_doc_0 = doc_list[0][0]

In [26]:
# len(split_0_doc_0)

## Translate the documents split to the maximum chunk length

In [27]:
def translate_document(splitted_doc: dict) -> dict:
    """Translate every chunk of one document and join the results.

    Args:
      splitted_doc: dict with a 'name' and a 'pages' list of text chunks

    Returns:
      a dict with the same 'name' and, under 'pages', one string: the stripped
      translations of all chunks concatenated in order
    """
    translated_chunks = []
    # The loop variable is named `chunk` so the builtin input() is not shadowed.
    for chunk in splitted_doc.get('pages'):
        # translate() returns a list of result dicts; one text goes in, so take element 0.
        first_result = translate(chunk, verbose=False)[0]
        translated_chunks.append(first_result.get('translation_text', '').strip())
    # NOTE(review): chunks are stripped and joined without a separator, so the
    # last word of one chunk is glued to the first word of the next.
    return {
        "name" : splitted_doc.get('name'),
        "pages" : ''.join(translated_chunks)
    }

In [28]:
# Lazy: nothing is translated until translated_doc_map is consumed.
translated_doc_map = map(translate_document, pdf_pages_map)

In [29]:
# translated_doc_map sits on top of the lazy raw_pdf_map and pdf_pages_map iterators,
# so consuming it here also runs the pdf text extraction and the chunking; the
# wall time printed below covers those steps as well as the translation.
start = time.time()
en_doc_dict_list = list(translated_doc_map)
end = time.time()
duration = end - start
print("-"*20)
print(f"walltime: {duration} in secs.")



--------------------
walltime: 44.90513825416565 in secs.


In [38]:
# 3 docs in one min
# total_mins = int(250 / 3)
# total_hours = int(total_mins/60) + 1
# print(total_mins)
# print(total_hours)

In [31]:
# print the names of the translated documents
#for en_doc in en_doc_dict_list:
#    print(en_doc.get('name'))

In [32]:
# NOTE: 'pages' holds one joined string per document, so len() counts characters,
# not model tokens, despite the label in the printed message.
for en_doc in en_doc_dict_list:
    print(f"no. of token: {len(en_doc.get('pages'))}")

no. of token: 6973
no. of token: 14647
no. of token: 6720


In [33]:
#print(en_doc_dict_list[2].get('name'))
#print(en_doc_dict_list[2].get('pages'))

## Persist the translated docs

In [34]:
# print(f"the translated text has tokens: {len(en_content)}")

In [35]:
# def store_txt(content, path):
#    with open (path, "w") as text_file:
#        text_file.write(content)

In [36]:
# en_txt_path = loader.file_path_list[file_idx].replace("pdf", "txt")

In [37]:
# store_txt(en_content, en_txt_path)