# Introduction

@Author: Yingding Wang\
@CreatedOn: 20.11.2023\
@UpdatedOn: 23.11.2023

This notebook shows an example of reading PDF data from an S3 bucket and translating it from German into English.

In [2]:
# import sys
# !{sys.executable} -m pip list

In [1]:
import boto3
boto3.__version__

'1.34.14'

In [2]:
#list=!nvidia-smi -L
#for i in range(len(list)):
#    print(list[i])

In [3]:
import os, time
from platform import python_version

print(python_version())

3.8.10


## Init the GPU environments

In [4]:
from util.accelerator_utils import AcceleratorStatus, AcceleratorHelper
gpu_status = AcceleratorStatus()

In [5]:
gpu_status.gpu_usage()

num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Device idx       : 0 
No. of processors: 28
Physical  memory : 19.500000 GB
Reserved  memory : 0.000000 GB
Allocated memory : 0.000000 GB
Free      memory : 0.000000 GB
--------------------


In [6]:
gpu_helper = AcceleratorHelper()
UUIDs = gpu_helper.nvidia_device_uuids_filtered_by(is_mig=True, log_output=False)
# print(UUIDs)

In [7]:
# set the model download cache directory
display_architecture = True
# Root folder of the model cache. The original absolute path stays the
# default, but it can be overridden with the LLM_DATA_ROOT environment
# variable so the notebook does not have to be edited on another machine.
DATA_ROOT = os.environ.get("LLM_DATA_ROOT", "/home/jovyan/llm-models")

gpu_helper.init_cuda_torch(UUIDs, f"{DATA_ROOT}/core-kind/yinwang")

# Short alias -> Hugging Face model id (approximate download size in the
# trailing comment).
model_map = {
    "small": "google/mt5-small",  # 1.2 GB
    "base": "google/mt5-base",  # 2.33 GB
    "large": "google/mt5-large",  # 4.9 GB
    "xl": "google/mt5-xl",  # 15 GB
    "xxl": "google/mt5-xxl",  # 51.7 GB
    "custom": "Helsinki-NLP/opus-mt-de-en",
}

# Show the resulting device / cache environment.
# NOTE(review): both variables are presumably exported by init_cuda_torch -
# confirm against util.accelerator_utils.
print(os.environ["CUDA_VISIBLE_DEVICES"])
print(os.environ["XDG_CACHE_HOME"])

MIG-0efc9f06-6dca-5886-98af-0273ca7fde51
/home/jovyan/llm-models/core-kind/yinwang/models


## Setting up translation model

In [8]:
model_type = "custom"
# Resolve the alias to a Hugging Face model id. The fallback must itself be a
# model id: the previous default was the literal key "small", which is not a
# loadable model name.
model_name = model_map.get(model_type, model_map["small"])

print(model_name)

Helsinki-NLP/opus-mt-de-en


In [9]:
from transformers import pipeline
import transformers

In [10]:
# device_map="auto" doesn't work with the "Helsinki-NLP/opus-mt-de-en"
# translator model, so the pipeline is pinned to the GPU explicitly
# with device id 0 (device=0).
generator = pipeline("translation", model=model_name, device=0)

In [11]:
# task_prefix = "translate English to German: "
# task_prefix = "translate German to English: "
# task_prefix = "übersetze Deutsch zum Englisch: "
# Reference: https://huggingface.co/docs/transformers/model_doc/marian
def translate_gen(
    generator: "transformers.pipelines.text2text_generation.TranslationPipeline",
    info: "AcceleratorStatus",
):
    """Build a translation function bound to a pipeline and a status reporter.

    The annotations are written as strings so that defining this function
    does not require `transformers` or `AcceleratorStatus` to be in scope.

    Args:
      generator: callable translation pipeline; it is invoked as
        ``generator(sentences, max_length=...)``.
      info: object whose ``gpu_usage()`` method is called after every
        verbose translation.

    Returns:
      The closure ``local(sentences, max_length=400, verbose=True)``.
    """

    def local(sentences: list, max_length=400, verbose: bool = True) -> list:
        """Translate a list of sentences with a single pipeline call.

        Args:
          sentences: list of source-language strings handed to the pipeline.
          max_length: forwarded unchanged as the pipeline's ``max_length``.
          verbose: when True, print the elapsed walltime and call
            ``info.gpu_usage()``.

        Returns:
          Whatever the pipeline returns for ``sentences``.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        started = time.perf_counter()
        result = generator(sentences, max_length=max_length)
        duration = time.perf_counter() - started

        if verbose:
            print("-" * 20)
            print(f"walltime: {duration} in secs.")
            info.gpu_usage()

        return result

    return local

translate = translate_gen(generator, gpu_status)

In [12]:
# One-sentence German sample, wrapped in a list, used to smoke-test the translator.
# NOTE(review): the name `input` shadows Python's builtin input(); renaming it
# also requires updating the cell that calls translate(input, ...).
input=["Das Haus ist wunderbar."]
# input="Das Haus ist wunderbar."

In [13]:
%timeit
translate(input, max_length=1000)

--------------------
walltime: 0.7268238067626953 in secs.
num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Device idx       : 0 
No. of processors: 28
Physical  memory : 19.500000 GB
Reserved  memory : 0.310547 GB
Allocated memory : 0.285861 GB
Free      memory : 0.024686 GB
--------------------


[{'translation_text': 'The house is wonderful.'}]

## Loading pdf content from s3 buckets

In [4]:
import os

In [17]:
from util.objectstore_utils import S3PdfObjHelper, S3AccessConf

# Maximum number of object keys to fetch per listing; -1 means unlimited.
item_max_cap = -1
# item_max_cap = 2

# Credentials and endpoint are read from the environment, never hard-coded.
s3_conf = S3AccessConf(
    bucket_name="scivias-medreports",
    access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    endpoint=os.getenv('S3_ENDPOINT'),
    verify_host=True,
)

# Helper restricted to objects whose key starts with the given prefix.
bucket_helper = S3PdfObjHelper(file_prefix="KK-SCIVIAS", conf=s3_conf)

In [18]:
# retrieval object from s3 bucket
item_map = bucket_helper.get_object_keys(item_max_cap)

In [19]:
# Count the source objects from a fresh listing instead of list(item_map):
# if get_object_keys returns a one-shot iterator (TODO confirm), draining
# item_map here would leave the later transform_objects(item_map, ...) call
# with nothing to process.
item_list = list(bucket_helper.get_object_keys(item_max_cap))
len(item_list)

800

In [36]:
translated_bucket_helper = S3PdfObjHelper(conf=s3_conf, file_prefix="trans2en/KK-SCIVIAS")

In [37]:
# retrieval object from translated s3 bucket prefix (virtual folder)
trans_item_map = translated_bucket_helper.get_object_keys(item_max_cap)

In [38]:
trans_item_list = list(trans_item_map)
len(trans_item_list)

246

In [18]:
# passing pdf objects from s3 to PdfReader
reader_map = bucket_helper.transform_objects(item_map, bucket_helper.pdf_reader_transformer)

In [19]:
# read pages from PdfReaders
raw_pdf_map = map(bucket_helper.read_pages_transformer, reader_map)

In [20]:
# split raw file string to list of 350 token segments
pdf_pages_map = map(bucket_helper.segment_pages_transformer, raw_pdf_map)

In [21]:
# list of pdf documents, each item is a list representing the 350 token split of a document
# doc_list = list(pdf_pages_map)

In [22]:
# doc_0_dict = doc_list[0]
# print(doc_0_dict['name'])
# print(doc_0_dict['content'][0])

In [23]:
# len(doc_0_dict['content'])

## Translate the max-token split documents

In [24]:
# def translate_document(splitted_doc: dict) -> str:
#     output = []
#     for input in splitted_doc.get('pages'):
#         output.append(translate(input, verbose=False)[0].get('translation_text', '').strip())
#     return {
#         "name" : splitted_doc.get('name'),
#         "content" : ''.join(output)
#     }

In [25]:
# def translate_func(segment: str):
#     return translate([segment], verbose=False)[0].get('translation_text', '').strip()

# segment_en_translater = bucket_helper.custom_pages_transformer_factory(translate_func)

In [26]:
from functools import partial

@bucket_helper.custom_pages_transformer_factory
def segment_en_translater(segment: str) -> str:
    """Translate a single text segment and return the stripped translation.

    The decorator hands this function to
    ``bucket_helper.custom_pages_transformer_factory``; the module-level name
    ``segment_en_translater`` is bound to whatever that factory returns.
    (``partial(factory)`` with no bound arguments behaves exactly like
    ``factory``, so the factory is applied directly.)

    Args:
      segment: one text segment to translate.

    Returns:
      The ``translation_text`` of the first result with surrounding
      whitespace removed, or an empty string if that key is missing.
    """
    # translate() expects a list, so the segment is wrapped in a batch of one.
    translations = translate([segment], verbose=False)
    return translations[0].get('translation_text', '').strip()

In [27]:
# use the current translator to translate from DE to EN for all document segments and for a sequence of documents
# map() is lazy: nothing is translated until translated_doc_map is iterated.
translated_doc_map = map(segment_en_translater, pdf_pages_map)

In [28]:
# start = time.time()
# en_doc_dict_list = list(translated_doc_map)
# end = time.time()
# duration = end - start
# print("-"*20)
# print(f"walltime: {duration} in secs.")

In [29]:
# 3 docs in one min
# total_mins = int(250 / 3)
# total_hours = int(total_mins/60) + 1
# print(total_mins)
# print(total_hours)

In [30]:
# # print the tranlated document token size
# for en_doc in en_doc_dict_list:
#     print(en_doc.get('name'))

In [31]:
# for en_doc in en_doc_dict_list:
#     print(f"no. of token: {len(en_doc.get('content'))}")

In [32]:
# print(en_doc_dict_list[0].get('name'))
# print(en_doc_dict_list[0].get('content'))

## Persist the translated docs

In [33]:
# def create_s3_name(old_key: str) -> str:
#     return f"trans2en/{old_key.replace('pdf', 'txt')}"

In [34]:
# key = en_doc_dict_list[0].get('name')
# print(key)
# print(bucket_helper.s3_key_mutater(key))

In [35]:
# upload the sequence of translated documents back to s3 storage.
upload_action_map = bucket_helper.upload_objects(translated_doc_map, bucket_helper.s3_key_mutater)

In [36]:
# The map-based pipeline only runs when it is consumed, so materialising
# it into a list is what triggers the work; the timer wraps exactly that step.
start = time.time()
upload_action_list = [*upload_action_map]
end = time.time()
duration = end - start

separator = "-" * 20
print(separator)
print(f"walltime: {duration} in secs.")

# One line per uploaded object.
for e in upload_action_list:
    print(e)



--------------------
walltime: 34.74611210823059 in secs.
s3.Object(bucket_name='scivias-medreports', key='trans2en/KK-SCIVIAS-00003^0053360847^2018-09-28^KIIGAS.txt')
s3.Object(bucket_name='scivias-medreports', key='trans2en/KK-SCIVIAS-00004^0051726752^2015-12-17^KIIS1.txt')
