# language translation

In [2]:
%load_ext autoreload
%autoreload 2

import os
# Move the working directory to the part of the current path that precedes the
# first occurrence of "core" -- presumably the project root, so that the
# `core.*` imports in the next cell resolve (TODO confirm).
# NOTE: this is a plain substring split. If "core" does not occur in the path,
# split() returns the whole path and the chdir is a no-op; if "core" occurs
# earlier inside another directory name, the path is cut there instead.
curpath = os.getcwd()
os.chdir(curpath.split("core")[0])

In [3]:
import json
import time
import logging
# logging.basicConfig(level=logging.INFO)


from typing import List, Dict, Any, Tuple, Union
from tqdm import tqdm

from prompts import SIMPLE_ENGLISH_TRANS_PROMPT, SIMPLE_HINDI_TRANS_PROMPT
from core.features.provider import creator
from core.features.utils import calculate_cost_gpt4_turbo, add_dicts

from dotenv import load_dotenv
load_dotenv()

# Default keyword arguments unpacked into the `creator(...)` call made by
# translate_gen: model name, sampling temperature, and a JSON-object response
# format (translate_batch_process parses each reply with json.loads).
text_model_defaults = {"model" : "gpt-4-1106-preview", "temperature" : 0.1, "response_format" : {"type": "json_object"}}

In [6]:
def get_text_list_from_raw_text(raw_text, sep = "\n"):
    """
    Normalise raw input into a list of text segments.

    :param raw_text: Either a single string to split, or a list that is
        already segmented. A list is returned as-is (same object, not a copy).
    :param sep: Separator used to split a string input. With the default
        (a newline) the string is split with ``str.splitlines()``, which
        recognises every line-boundary style and yields no trailing empty
        segment. Any other value is passed to ``str.split``.
    :return: List of text segments.
    :raises ValueError: If raw_text is neither a string nor a list.
    """
    if isinstance(raw_text, str):
        # The default keeps the established splitlines() behaviour; a custom
        # separator used to be silently ignored and is now honoured.
        if sep == "\n":
            return raw_text.splitlines()
        return raw_text.split(sep)

    if isinstance(raw_text, list):
        return raw_text

    raise ValueError("raw_text must be a string or a list")

def chunk_raw_text_list(raw_text_list, max_len=3000):
    """
    Group consecutive text strings into chunks by cumulative character count.

    Strings are taken in order and packed into the current chunk until adding
    the next one would push the chunk's summed length above max_len; at that
    point the chunk is closed and a new one is started. A string is never
    split, so a single string longer than max_len ends up alone in a chunk
    that exceeds the limit.

    :param raw_text_list: List of text strings to be chunked.
    :param max_len: Maximum summed character length for each chunk.
    :return: List of chunks, each being a list of text strings.
    """
    chunks = []
    pending = []
    pending_size = 0

    for piece in raw_text_list:
        piece_size = len(piece)

        # Close the pending chunk first when this piece would overflow it.
        # An empty pending chunk is never closed, so no empty chunks are made.
        if pending and pending_size + piece_size > max_len:
            chunks.append(pending)
            pending = []
            pending_size = 0

        pending.append(piece)
        pending_size += piece_size

    # Whatever is still pending forms the final chunk.
    if pending:
        chunks.append(pending)

    return chunks


def text_path_to_chunk(path, max_len=20000, verbose=False):
    """
    Read a UTF-8 text file and group its lines into size-bounded chunks.

    :param path: Path of the text file to read.
    :param max_len: Maximum summed character length per chunk, forwarded to
        chunk_raw_text_list.
    :param verbose: When True, print the total character count, the number of
        lines, the number of chunks and the size of every chunk.
    :return: List of chunks, each being a list of line strings.
    """
    # Read the raw bytes and decode explicitly as UTF-8.
    with open(path, 'rb') as handle:
        file_text = handle.read().decode('utf-8')

    # Split into lines. No de-duplication happens here, even though the
    # verbose label below says "Unique".
    lines = get_text_list_from_raw_text(file_text)

    # Pack the lines into sub-lists bounded by max_len characters.
    chunks = chunk_raw_text_list(lines, max_len=max_len)

    if verbose:
        print("Total chars:", len(file_text))
        print("Unique Text Lists:", len(lines))
        print("Text List chunks:", len(chunks))
        for chunk in chunks:
            print("Chunk of size", len("".join(chunk)), "characters")

    return chunks


In [13]:

def translate_gen(input_text: str, language: str):
    """
    Send one piece of text to the model with the prompt for `language`.

    :param input_text: Text to embed in the user message.
    :param language: "en" selects SIMPLE_ENGLISH_TRANS_PROMPT, "hi" selects
        SIMPLE_HINDI_TRANS_PROMPT.
    :return: Dict with "output" (message content of the first choice) and
        "total_usage" (the response's usage, dumped to a dict).
    :raises ValueError: If language is neither "en" nor "hi".
    """
    # Pick the system prompt first so unsupported languages fail early.
    if language == "en":
        system_prompt = SIMPLE_ENGLISH_TRANS_PROMPT
    elif language == "hi":
        system_prompt = SIMPLE_HINDI_TRANS_PROMPT
    else:
        raise ValueError("Language not supported")

    # The input is fenced by //input_text// markers inside the user message.
    user_prompt = f"""
    INPUT_TEXT:

    //input_text//

    {input_text}

    //input_text//
    
    OUTPUT:
    """

    conversation = [
        {"role": "system", "content": system_prompt},
        {'role': 'user', "content": user_prompt},
    ]

    response = creator(**text_model_defaults, messages=conversation)

    return {
        "output": response.choices[0].message.content,
        "total_usage": response.usage.model_dump(),
    }


In [14]:
# final batch processing function
def translate_batch_process(input_text: List[List], language) -> Tuple[Dict, Dict]:
    """
    Translate every chunk in turn and accumulate the results and token usage.

    :param input_text: List of chunks, each a list of strings. The strings of
        a chunk are joined with newlines and sent as one request.
    :param language: Language code forwarded to translate_gen.
    :return: Tuple ``(final_dict, total_tokens)``. Both are built by folding
        each request's parsed JSON output / usage dict into a running dict
        with add_dicts, starting from ``{"text": ""}`` and zeroed counters.
        NOTE(review): how add_dicts merges values is defined elsewhere;
        presumably it concatenates strings and sums numbers -- confirm.
    :raises ValueError: If a model reply cannot be parsed as JSON.
    """
    final_dict = {"text": ""}
    total_tokens = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

    for raw_text_list in tqdm(input_text):
        raw_text = "\n".join(raw_text_list)

        result = translate_gen(input_text=raw_text, language=language)
        output = result['output']
        tokens = result['total_usage']

        try:
            json_out = json.loads(output)
        except (ValueError, TypeError) as err:
            # Catch parse failures only (JSONDecodeError is a ValueError;
            # TypeError covers a None reply), so KeyboardInterrupt and
            # SystemExit still propagate. Chain the cause for debugging.
            raise ValueError("Output is not a valid JSON") from err

        final_dict = add_dicts(final_dict, json_out)
        total_tokens = add_dicts(total_tokens, tokens)

    return final_dict, total_tokens


In [15]:
# Sample input file, given relative to the directory that the setup cell
# changes into (the part of the path before "core"), instead of a
# machine-specific absolute path.
TEST_FILE = os.path.join("core", "features", "translation", "test_news_hindi_short.txt")

chunked_text_list = text_path_to_chunk(path=TEST_FILE, verbose=True)
res, tokens = translate_batch_process(input_text=chunked_text_list, language='en')

print(tokens)
print(calculate_cost_gpt4_turbo(tokens))
print(res)

Total chars: 5064
Unique Text Lists: 3
Text List chunks: 3
Chunk of size 84 characters
Chunk of size 602 characters
Chunk of size 4374 characters


 33%|███▎      | 1/3 [00:01<00:03,  1.91s/it]

{
"text": "Breaking News in Hindi: Live breaking news, read the main and latest news of November 9"
}


 67%|██████▋   | 2/3 [00:09<00:05,  5.15s/it]

{
"text": "Subscribe for unlimited article reading Link Copied Stay updated with every news, download Android Hindi News apps, iOS Hindi News apps and Amarujala Hindi News apps on your mobile. Get all India News in Hindi related to live updates of politics, sports, entertainment, technology, and education etc. Stay updated with us for all breaking news from India News and more news in Hindi. Next Article Please wait... Please wait... Delete All Cookies Followed Express your feedback ABP News Top 10, Morning Bulletin: Start your morning with news from ABP News, read all the major news from around the world together"
}


100%|██████████| 3/3 [01:33<00:00, 31.16s/it]


{
"text": "By: ABP News Bureau | Updated at: 10 Nov 2023 06:36 AM (IST) ABP News Top 10, Morning Bulletin: Start your morning with news from ABP News, read all the major news from around the country and the world together in ABP News Top 10, Night's Major News: Read all the major news from around the country and the world together on ABP News at night: 9 November 2023 | If you have missed important news in the day's hustle, read national-international, business, Bollywood, sports, and gadget news with just one click. Read More ABP News Top 10, Afternoon's Latest News: Read all the major news from around the country and the world together on ABP News in the afternoon: 9 November 2023 | Whether it's major news from home and abroad, the thrill of cricket, Bollywood gossip, or the ups and downs of the market. Get alerts for new gadgets, vehicles, or job information with just one click. Read More NCP Crisis: '...searched for 20,000 such affidavits', Sharad Pawar's faction accuses Ajit grou




Our cost per million characters: $12.5

Google cloud cost: $20

English-to-Hindi translation is costly: it costs 3x as much as Hindi-to-English translation.