In [2]:
import requests
import dotenv
import os
import pandas as pd
import json

# Load variables from a local .env file (if present) into the process environment.
dotenv.load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast: without a key the Authorization header would become "Bearer None"
# and the problem would only surface later as rejected API requests.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [16]:
# Model and chat-completions endpoint used for every request
model = "gpt-3.5-turbo"
url = "https://api.openai.com/v1/chat/completions"

# HTTP headers shared by all requests: JSON body plus bearer-token authorization
headers = {}
headers['Content-Type'] = 'application/json'
headers['Authorization'] = 'Bearer {}'.format(api_key)


In [17]:
def translate(prompt, text, url, model, headers, timeout=120):
    """Translate ``text`` through a chat-completions API call.

    Args:
        prompt: Instruction sent as the "system" message.
        text: Source text sent as the "user" message.
        url: Endpoint the request is POSTed to.
        model: Model identifier placed in the request body.
        headers: HTTP headers for the request.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook forever.

    Returns:
        The message content of the first returned choice, or ``None`` when the
        API answers with a status other than 200 (the error is printed).

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed (connection failure, timeout).
    """
    data = {
        'model': model,
        'messages': [
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code != 200:
        # Report the failure and return an explicit "no translation" marker;
        # previously this path hit an unbound local variable on return.
        print(f"Error: {response.status_code} - {response.text}")
        return None

    return response.json()['choices'][0]['message']['content']

In [12]:
# System prompts to compare; each one is later paired with every source text

prompt1 = (
    'You are an expert translator. '
    'Translate the following text to Russian using vocabulary and expressions of a Russian native. '
    'The text to be translated is:'
)
prompt2 = (
    'You are an expert translator that will be tasked with translating a piece of text into Russian. '
    'The translation must be faithful to the original tone of voice and writing style. '
    'Ensure that the meaning of the original text is not changed. '
    'The text to be translated is:'
)
prompts = [prompt1, prompt2]

In [10]:
# Load the English texts together with their human Russian translations;
# every row is labelled with the field it comes from

df = pd.read_json('texts.json')
print(df['field'].unique())
df

['literature' 'medical' 'law']


Unnamed: 0,field,title,en_text,ru_human_translation,length
0,literature,Of Human Bondage by Somerset Maugham,The day broke gray and dull. The clouds hung h...,"День занялся тусклый, серый. Тучи повисли низк...",1511
1,literature,A Tale of Two Cities by Charles Dickens,"It was the best of times, it was the worst of ...","Это было лучшее из всех времен, это было худш...",1001
2,medical,Barnard C.N. The operation. A human cardiac tr...,POSTOPERATIVE CARE. The postoperative care of ...,Послеоперационное ведение. Послеоперационное в...,756
3,medical,"Guidelines on diabetes, pre-diabetes, and card...",Definition and classification of diabetes. Cri...,Определение и классификация диабета. Критерии ...,778
4,law,CODE OF CONDUCT FOR EUROPEAN LAWYERS,1. PREAMBLE. 1.1. The Function of the Lawyer i...,I. Преамбула. 1.1. Функция адвоката в обществе...,1664
5,law,"The Indian Contract Act (ICA), 1872",Preamble. WHEREAS it is expedient to define an...,Преамбула. Принимая во внимание целесообразнос...,508


In [14]:
# Pair every prompt with every text, producing
# (field, title, prompt, en text, human translation) tuples

fields = list(df['field'])
titles = list(df['title'])
texts = list(df['en_text'])
human_ru = list(df['ru_human_translation'])

# Prompts form the outer loop, so all texts for the first prompt come first
prompt_texts = [
    (field, title, prompt, text, human)
    for prompt in prompts
    for field, title, text, human in zip(fields, titles, texts, human_ru)
]

len(prompt_texts)

12

In [18]:
# Query the API once per (prompt, text) pair; each result is the input tuple
# with the machine translation appended as its last element
# (positions 2 and 3 of a tuple hold the prompt and the English text)

prompt_text_translations = [
    (*record, translate(record[2], record[3], url, model, headers))
    for record in prompt_texts
]

In [19]:
# Collect the translation results into a dataframe, one row per (prompt, text) pair

result_columns = ['field', 'title', 'prompt', 'en_text', 'ru_human_translation', 'ru_machine_translation']
df2 = pd.DataFrame(data=prompt_text_translations, columns=result_columns)
df2.head()

Unnamed: 0,field,title,prompt,en_text,ru_human_translation,ru_machine_translation
0,literature,Of Human Bondage by Somerset Maugham,You are an expert translator. Translate the fo...,The day broke gray and dull. The clouds hung h...,"День занялся тусклый, серый. Тучи повисли низк...",День начался серым и унылым. Облака висели низ...
1,literature,A Tale of Two Cities by Charles Dickens,You are an expert translator. Translate the fo...,"It was the best of times, it was the worst of ...","Это было лучшее из всех времен, это было худш...","Это было лучшее из времён, это было худшее из ..."
2,medical,Barnard C.N. The operation. A human cardiac tr...,You are an expert translator. Translate the fo...,POSTOPERATIVE CARE. The postoperative care of ...,Послеоперационное ведение. Послеоперационное в...,ПОСЛЕОПЕРАЦИОННЫЙ УХОД. Послеоперационный уход...
3,medical,"Guidelines on diabetes, pre-diabetes, and card...",You are an expert translator. Translate the fo...,Definition and classification of diabetes. Cri...,Определение и классификация диабета. Критерии ...,Определение и классификация диабета. Критерии ...
4,law,CODE OF CONDUCT FOR EUROPEAN LAWYERS,You are an expert translator. Translate the fo...,1. PREAMBLE. 1.1. The Function of the Lawyer i...,I. Преамбула. 1.1. Функция адвоката в обществе...,1. ПРЕАМБУЛА. 1.1. Функция адвоката в обществе...


In a separate notebook, I split the original and translated entries into sentences with nltk's `sent_tokenize()` and then expanded them to one sentence per row using pandas' `.explode()` method. After saving the result as a .csv, I aligned the corresponding lines manually. The "en_ru_eval.csv" file below is the manually aligned data.

In [20]:
# Read the sentence-level evaluation data from disk
en_ru_aligned = pd.read_csv(
    'en_ru_eval.csv',
    encoding='utf-8',
)
en_ru_aligned.head()

Unnamed: 0,field,title,prompt,en_text,ru_machine_translation,ru_human_translation
0,literature,Of Human Bondage by Somerset Maugham,You are an expert translator. Translate the fo...,The day broke gray and dull.,День начался серым и унылым.,"День занялся тусклый, серый."
1,literature,Of Human Bondage by Somerset Maugham,You are an expert translator. Translate the fo...,"The clouds hung heavily, and there was a rawne...","Тучи висели тяжело, и в воздухе чувствовалась ...","Тучи повисли низко, воздух был студеный – вот-..."
2,literature,Of Human Bondage by Somerset Maugham,You are an expert translator. Translate the fo...,A woman servant came into a room in which a ch...,"Служанка вошла в комнату, в которой ребенок сп...","В комнату, где спал ребенок, вошла служанка и ..."
3,literature,Of Human Bondage by Somerset Maugham,You are an expert translator. Translate the fo...,She glanced mechanically at the house opposite...,"Она механически взглянула на дом напротив, дом...",Она по привычке окинула взглядом фасад дома на...
4,literature,Of Human Bondage by Somerset Maugham,You are an expert translator. Translate the fo...,"“Wake up, Philip,” she said.","""Просыпайся, Филипп,"" - сказала она.","– Вставай, Фи́лип, – сказала она."


In [21]:
# Nest the aligned sentences as
# {field: {prompt: [(source, machine translation, human translation), ...]}}
# so the structure can be stored as .json

reference_candidate_dict = {}

for _, row in en_ru_aligned.iterrows():
    sentence_triple = (
        row['en_text'],
        row['ru_machine_translation'],
        row['ru_human_translation'],
    )
    # setdefault creates the missing field / prompt level on first sight
    by_prompt = reference_candidate_dict.setdefault(row['field'], {})
    by_prompt.setdefault(row['prompt'], []).append(sentence_triple)

display(reference_candidate_dict)

{'literature': {'You are an expert translator. Translate the following text to Russian using vocabulary and expressions of a Russian native. The text to be translated is:': [('The day broke gray and dull.',
    'День начался серым и унылым.',
    'День занялся тусклый, серый.'),
   ('The clouds hung heavily, and there was a rawness in the air that suggested snow.',
    'Тучи висели тяжело, и в воздухе чувствовалась сырость, предвещавшая снег.',
    'Тучи повисли низко, воздух был студеный – вот-вот выпадет снег.'),
   ('A woman servant came into a room in which a child was sleeping and drew the curtains.',
    'Служанка вошла в комнату, в которой ребенок спал, и открыла занавески.',
    'В комнату, где спал ребенок, вошла служанка и раздвинула шторы.'),
   ('She glanced mechanically at the house opposite, a stucco house with a portico, and went to the child’s bed.',
    'Она механически взглянула на дом напротив, дом с лепной отделкой и колоннадой, и направилась к кровати ребенка.',
  

In [22]:
file_path = 'mt_for_eval.json'

# Write the dictionary to disk as indented JSON
with open(file_path, 'w') as json_file:
    json_file.write(json.dumps(reference_candidate_dict, indent=4))

print(f"Dictionary saved as '{file_path}'")

Dictionary saved as 'mt_for_eval.json'
