# Introduction

@Author: Yingding Wang\
@CreatedOn: 20.11.2023

This notebook shows an example of translating PDF content from German into English, using PDF data from an S3 bucket source.

In [None]:
#list=!nvidia-smi -L
#for i in range(len(list)):
#    print(list[i])

In [23]:
import os
from platform import python_version

print(python_version())

3.8.10


## Init the GPU environments

In [24]:
from util.accelerator_utils import AcceleratorStatus, AcceleratorHelper
gpu_status = AcceleratorStatus()

In [25]:
gpu_status.gpu_usage()

num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Device idx       : 0 
No. of processors: 28
Physical  memory : 19.500000 GB
Reserved  memory : 0.000000 GB
Allocated memory : 0.000000 GB
Free      memory : 0.000000 GB
--------------------


In [26]:
gpu_helper = AcceleratorHelper()
# NOTE(review): presumably returns the UUIDs of the MIG GPU slices
# (is_mig=True) without logging them (log_output=False) — confirm in
# util.accelerator_utils.
UUIDs = gpu_helper.nvidia_device_uuids_filtered_by(is_mig=True, log_output=False)
# print(UUIDs)

In [27]:
# set the model download cache directory
display_architecture=True
DATA_ROOT="/home/jovyan/llm-models"

gpu_helper.init_cuda_torch(UUIDs, f"{DATA_ROOT}/core-kind/yinwang")

model_map = {
   "small": "google/mt5-small", # 1.2 GB
   "base" : "google/mt5-base", # 2.33 GB
   "large" : "google/mt5-large", # 4.9 GB,
   "xl" : "google/mt5-xl", # 15 GB
   "xxl" : "google/mt5-xxl", # 51.7 GB,
   "custom": "Helsinki-NLP/opus-mt-de-en", 
}

print(os.environ["CUDA_VISIBLE_DEVICES"])
print(os.environ["XDG_CACHE_HOME"])

MIG-0efc9f06-6dca-5886-98af-0273ca7fde51
/home/jovyan/llm-models/core-kind/yinwang/models


## Setting up translation model

In [28]:
model_type = "custom"
# Fall back to the model id stored under the "small" key. A bare "small"
# default would be handed to the pipeline as if it were a model id.
model_name = model_map.get(model_type, model_map["small"])

print(model_name)

Helsinki-NLP/opus-mt-de-en


In [29]:
from transformers import pipeline
import transformers

In [30]:
# Build the translation pipeline for the model selected in `model_name`.
'''
device_map="auto" doesn't work with "Helsinki-NLP/opus-mt-de-en" translator model
use explicit gpu device id 0 with device=0
'''
generator = pipeline(
    "translation", 
    model=model_name,
    # device_map="auto",
    device=0,
)

In [39]:
#type(generator)

In [33]:
# task_prefix = "translate English to German: "
# task_prefix = "translate German to English: "
# task_prefix = "übersetze Deutsch zum Englisch: "
# Reference: https://huggingface.co/docs/transformers/model_doc/marian
def translate_gen(
    # Annotations are written as strings so that defining this function does
    # not require `transformers` / `AcceleratorStatus` to be resolvable at
    # definition time.
    generator: "transformers.pipelines.text2text_generation.TranslationPipeline",
    info: "AcceleratorStatus",
):
    """Build a translation callable bound to a pipeline and a GPU status reporter.

    Args:
      generator: translation pipeline; invoked as
        ``generator(sentences, max_length=...)``.
      info: object whose ``gpu_usage()`` is called after every translation.

    Returns:
      A function ``local(sentences, max_length=400)`` that forwards to
      ``generator`` and returns its result unchanged.
    """
    # Imported here because no cell of the notebook imports `time`; without
    # it the returned function raises NameError on a fresh kernel.
    import time

    def local(sentences, max_length=400) -> list:
        """Translate ``sentences``, then print the wall time and GPU usage.

        Args:
          sentences: input forwarded unchanged to the pipeline (the notebook
            passes a single string per call).
          max_length: forwarded to the pipeline as ``max_length``.

        Returns:
          The pipeline's result, unmodified (in this notebook a list of
          ``{'translation_text': ...}`` dicts).
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start = time.perf_counter()

        result = generator(
            sentences,
            max_length=max_length,
        )

        duration = time.perf_counter() - start
        print("-"*20)
        print(f"walltime: {duration} in secs.")
        info.gpu_usage()

        return result
    return local

# Bind the pipeline and the GPU status reporter once; `translate` is the
# callable used for every translation in the cells below.
translate = translate_gen(generator, gpu_status)

In [36]:
input="Das Haus ist wunderbar."

In [37]:
%timeit
translate(input, max_length=1000)

--------------------
walltime: 0.060678958892822266 in secs.
num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Device idx       : 0 
No. of processors: 28
Physical  memory : 19.500000 GB
Reserved  memory : 0.310547 GB
Allocated memory : 0.285861 GB
Free      memory : 0.024686 GB
--------------------


[{'translation_text': 'The house is wonderful.'}]

## Loading pdf content from s3 buckets

In [None]:
# WIP


In [None]:
from util.pdf_text_loader import PDFHelper
# DATA_ROOT is defined in the configuration cell further up:
# DATA_ROOT="/home/jovyan/llm-models"
DATA_SUBDIR="core-kind/yinwang/data/medreports"
print(f"{DATA_ROOT}/{DATA_SUBDIR}")
# NOTE(review): presumably collects the files under data_folder that match
# file_pattern into loader.file_path_list — confirm in util.pdf_text_loader.
loader = PDFHelper(data_folder = f"{DATA_ROOT}/{DATA_SUBDIR}", file_pattern="KK-SCIVIAS-*.pdf")

In [None]:
loader.file_path_list

In [None]:
# There are two test files; choose the PDF file to be translated by its index
# in loader.file_path_list.
# file_idx = 0
file_idx = 1

In [None]:
context = loader.read_pdf(file_idx)

In [None]:
loader.count_token(file_idx)

In [None]:
# https://stackoverflow.com/questions/13673060/split-string-into-strings-by-length
def wrap(s, w):
    """
    Split a string into consecutive chunks of at most ``w`` characters.

    Args:
      s: original str
      w: width (number of characters) of each chunk; must be positive

    Return:
      a list of strings; every element has length ``w`` except possibly the
      last one, which holds the remainder. An empty ``s`` yields ``[]``.

    Raises:
      ValueError: if ``w`` is not positive. A negative width would otherwise
        silently yield an empty list and drop all of ``s``.
    """
    if w <= 0:
        raise ValueError(f"w must be a positive integer, got {w!r}")
    return [s[i:i + w] for i in range(0, len(s), w)]

In [None]:
# NOTE(review): 350 is a character count (wrap slices the string by index),
# not a token count, so a chunk boundary can fall in the middle of a word or
# sentence — consider sentence-aware splitting.
splitted_content = wrap(context, 350)

In [None]:
len(splitted_content)

In [None]:
# Translate chunk by chunk and keep only the stripped translation text.
# The loop variable is deliberately not named `input`: that name shadows the
# built-in and would overwrite the sample sentence defined earlier.
output = [
    translate(chunk)[0].get('translation_text', '').strip()
    for chunk in splitted_content
]

In [None]:
# Each chunk translation was stripped of surrounding whitespace, so joining
# with '' would fuse the last word of one chunk with the first word of the
# next. Join with a single space instead.
en_content = ' '.join(output)

In [None]:
#print(en_content)

In [None]:
# len() of a str counts characters, not tokens, so the message says so.
print(f"the translated text has {len(en_content)} characters")

In [None]:
def store_txt(content, path, encoding="utf-8"):
    """Write ``content`` to ``path`` as text, replacing any existing file.

    Args:
      content: the string to write.
      path: destination file path.
      encoding: text encoding of the file. Defaults to UTF-8 so the result
        does not depend on the platform's default encoding.
    """
    with open(path, "w", encoding=encoding) as text_file:
        text_file.write(content)

In [None]:
# Swap only the file extension. str.replace("pdf", "txt") rewrites every
# "pdf" in the path (e.g. in a directory name) and leaves a differently
# spelled extension untouched, so the output path could equal the source PDF.
en_txt_path = os.path.splitext(loader.file_path_list[file_idx])[0] + ".txt"

In [None]:
store_txt(en_content, en_txt_path)