In [1]:
import requests
import dotenv
import os
import pandas as pd
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
tqdm.pandas()

# pull variables from a local .env file into the process environment,
# then read the OpenAI key from it (None when the variable is not set)
dotenv.load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\svvlk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# chat-completions endpoint and the model name sent with every request
url = "https://api.openai.com/v1/chat/completions"
model = "gpt-3.5-turbo"

# HTTP headers shared by all requests; the bearer token is the key read from the environment
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}


def translate(prompt, text, timeout=120):
    """Translate ``text`` by calling the chat-completions endpoint.

    Parameters
    ----------
    prompt : str
        Instruction sent as the ``system`` message.
    text : str
        Source text sent as the ``user`` message.
    timeout : float or tuple, optional
        Seconds ``requests`` waits for the server before giving up
        (default 120). Without it a stalled connection would block the
        whole translation run indefinitely.

    Returns
    -------
    str
        The ``content`` of the first choice in the API response.

    Raises
    ------
    Exception
        If the API answers with a status code other than 200; the message
        carries the status code and the response body.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
    }

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)

    # anything but 200 is surfaced to the caller instead of returning partial data
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code} - {response.text}")

    return response.json()["choices"][0]["message"]["content"]

In [20]:
# system prompts to compare: a short instruction and a more detailed one

prompts = [
    (
        "You are an expert translator. Translate the following text to Russian. "
        "The text to be translated is:"
    ),
    (
        "You are an expert translator that will be tasked with translating a piece of text into Russian. "
        "The translation must be faithful to the original tone of voice and writing style. "
        "Ensure that the meaning of the original text is not changed. "
        "The text to be translated is:"
    ),
]

In [21]:
# load the English source texts and their human translations (several fields) from disk

df = pd.read_json(path_or_buf='texts.json')
df

Unnamed: 0,field,title,en_text,ru_human_translation,en_length
0,literature,Of Human Bondage by Somerset Maugham,The day broke gray and dull. The clouds hung h...,"День занялся тусклый, серый. Тучи повисли низк...",497
1,literature,A Tale of Two Cities by Charles Dickens,"It was the best of times, it was the worst of ...","Это было лучшее из всех времен, это было худш...",486
2,medical,Barnard C.N. The operation. A human cardiac tr...,POSTOPERATIVE CARE. The postoperative care of ...,Послеоперационное ведение. Послеоперационное в...,484
3,medical,"Guidelines on diabetes, pre-diabetes, and card...",Definition and classification of diabetes. Cri...,Определение и классификация диабета. Критерии ...,494
4,law,CODE OF CONDUCT FOR EUROPEAN LAWYERS,1. PREAMBLE. 1.1. The Function of the Lawyer i...,I. Преамбула. 1.1. Функция адвоката в обществе...,499
5,law,"The Indian Contract Act (ICA), 1872",Preamble. WHEREAS it is expedient to define an...,Преамбула. Принимая во внимание целесообразнос...,448


In [30]:
# combination of each prompt with each text

# ignore_index=True gives every (text, prompt) row its own label; without it the
# labels of `df` are repeated once per prompt, which makes label-based lookups
# ambiguous and breaks index-aligned assignments later on
df_for_translate = pd.concat(
    [df.assign(prompt=prompt) for prompt in prompts],
    ignore_index=True,
).drop(columns=["en_length"])

df_for_translate.head()

Unnamed: 0,field,title,en_text,ru_human_translation,prompt
0,literature,Of Human Bondage by Somerset Maugham,The day broke gray and dull. The clouds hung h...,"День занялся тусклый, серый. Тучи повисли низк...",You are an expert translator. Translate the fo...
1,literature,A Tale of Two Cities by Charles Dickens,"It was the best of times, it was the worst of ...","Это было лучшее из всех времен, это было худш...",You are an expert translator. Translate the fo...
2,medical,Barnard C.N. The operation. A human cardiac tr...,POSTOPERATIVE CARE. The postoperative care of ...,Послеоперационное ведение. Послеоперационное в...,You are an expert translator. Translate the fo...
3,medical,"Guidelines on diabetes, pre-diabetes, and card...",Definition and classification of diabetes. Cri...,Определение и классификация диабета. Критерии ...,You are an expert translator. Translate the fo...
4,law,CODE OF CONDUCT FOR EUROPEAN LAWYERS,1. PREAMBLE. 1.1. The Function of the Lawyer i...,I. Преамбула. 1.1. Функция адвоката в обществе...,You are an expert translator. Translate the fo...


In [31]:
# machine-translate every row with its own prompt (one API call per row, with a progress bar)

df_for_translate["ru_machine_translation"] = df_for_translate.progress_apply(
    lambda row: translate(row["prompt"], row["en_text"]),
    axis=1,
)

100%|██████████| 12/12 [06:20<00:00, 31.67s/it]


In [32]:
# preview the first rows, now including the machine translations
df_for_translate.head(n=5)

Unnamed: 0,field,title,en_text,ru_human_translation,prompt,ru_machine_translation
0,literature,Of Human Bondage by Somerset Maugham,The day broke gray and dull. The clouds hung h...,"День занялся тусклый, серый. Тучи повисли низк...",You are an expert translator. Translate the fo...,День начался серым и унылым. Облака висели гус...
1,literature,A Tale of Two Cities by Charles Dickens,"It was the best of times, it was the worst of ...","Это было лучшее из всех времен, это было худш...",You are an expert translator. Translate the fo...,"Это было лучшее время, это было худшее время, ..."
2,medical,Barnard C.N. The operation. A human cardiac tr...,POSTOPERATIVE CARE. The postoperative care of ...,Послеоперационное ведение. Послеоперационное в...,You are an expert translator. Translate the fo...,ПОСЛЕОПЕРАЦИОННЫЙ УХОД. Послеоперационный уход...
3,medical,"Guidelines on diabetes, pre-diabetes, and card...",Definition and classification of diabetes. Cri...,Определение и классификация диабета. Критерии ...,You are an expert translator. Translate the fo...,Определение и классификация диабета. Критерии ...
4,law,CODE OF CONDUCT FOR EUROPEAN LAWYERS,1. PREAMBLE. 1.1. The Function of the Lawyer i...,I. Преамбула. 1.1. Функция адвоката в обществе...,You are an expert translator. Translate the fo...,1. ПРЕАМБУЛА. 1.1. Функция адвоката в обществе...


In [33]:
# tokenizing the target and source texts into sentences with sent_tokenize

# sent_tokenize defaults to the English Punkt model; the Russian columns need the
# Russian model, otherwise Russian abbreviations are treated as sentence ends
column_languages = {
    'en_text': 'english',
    'ru_human_translation': 'russian',
    'ru_machine_translation': 'russian',
}
columns_to_tokenize = list(column_languages)
for column, language in column_languages.items():
    df_for_translate[column] = df_for_translate[column].apply(
        sent_tokenize, language=language
    )

In [34]:
def add_paddings(row):
    """Pad the shorter translation with '' so both have the same sentence count.

    The human and the machine translation of a text may be split into a
    different number of sentences; the evaluation needs equally long lists.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide ``'ru_human_translation'`` and ``'ru_machine_translation'``,
        each a list of sentences, and support ``.copy()``.

    Returns
    -------
    A copy of ``row`` in which both lists have the length of the longer one.
    The lists held by the input ``row`` are left untouched: extending them in
    place would also modify the frame the row was taken from while ``apply``
    is still iterating over it.
    """
    human = row['ru_human_translation']
    machine = row['ru_machine_translation']
    target_len = max(len(human), len(machine))

    padded = row.copy()
    # always build fresh lists so the result never shares state with the input
    padded['ru_human_translation'] = list(human) + [''] * (target_len - len(human))
    padded['ru_machine_translation'] = list(machine) + [''] * (target_len - len(machine))
    return padded

In [35]:
# equalise the sentence counts of the human and machine translation in every row
df_for_translate = df_for_translate.apply(add_paddings, axis="columns")

In [37]:
# store the aligned translations as a JSON array with one object per row
df_for_translate.to_json(path_or_buf='mt_for_eval.json', orient='records')