In [1]:
import requests
import dotenv
import os
import pandas as pd
import json
from nltk.tokenize import sent_tokenize

# Pull settings from a local .env file (if present) into the environment,
# then read the OpenAI key; api_key is None when the variable is not set.
dotenv.load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
# Request configuration shared by every call: endpoint, model name and
# the HTTP headers carrying the bearer token read from the environment.
url = "https://api.openai.com/v1/chat/completions"
model = "gpt-3.5-turbo"
headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer {}'.format(api_key),
}


In [3]:
def translate(prompt, text, url, model, headers, timeout=120):
    """Send one chat-completion request and return the model's reply text.

    Args:
        prompt: Content of the "system" message (the translation instructions).
        text: Content of the "user" message (the text to translate).
        url: Endpoint the request is POSTed to.
        model: Model identifier placed in the request body.
        headers: HTTP headers sent with the request.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``;
            without it a stalled connection would block the notebook forever.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        RuntimeError: If the response status code is not 200. The message
            contains the status code and the response body.
    """
    payload = {
        'model': model,
        'messages': [
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
    }

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)

    # Fail loudly with the API's own error text. Falling through to the return
    # without a translation would only surface as an UnboundLocalError.
    if response.status_code != 200:
        raise RuntimeError(f"Error: {response.status_code} - {response.text}")

    return response.json()['choices'][0]['message']['content']

In [4]:
# System prompts to compare; each one is paired with every text later on.

prompt1 = (
    "You are an expert translator. "
    "Translate the following text to Russian "
    "using vocabulary and expressions of a Russian native. "
    "The text to be translated is:"
)
prompt2 = (
    "You are an expert translator that will be tasked with translating "
    "a piece of text into Russian. "
    "The translation must be faithful to the original tone of voice and writing style. "
    "Ensure that the meaning of the original text is not changed. "
    "The text to be translated is:"
)
prompts = [prompt1, prompt2]

In [5]:
# Load the parallel corpus: English source texts with their human Russian
# translations, covering several fields (listed by the print below).

df = pd.read_json('texts.json')
print(df['field'].unique())
df

['literature' 'medical' 'law']


Unnamed: 0,field,title,en_text,ru_human_translation,en_length
0,literature,Of Human Bondage by Somerset Maugham,The day broke gray and dull. The clouds hung h...,"День занялся тусклый, серый. Тучи повисли низк...",497
1,literature,A Tale of Two Cities by Charles Dickens,"It was the best of times, it was the worst of ...","Это было лучшее из всех времен, это было худш...",486
2,medical,Barnard C.N. The operation. A human cardiac tr...,POSTOPERATIVE CARE. The postoperative care of ...,Послеоперационное ведение. Послеоперационное в...,484
3,medical,"Guidelines on diabetes, pre-diabetes, and card...",Definition and classification of diabetes. Cri...,Определение и классификация диабета. Критерии ...,494
4,law,CODE OF CONDUCT FOR EUROPEAN LAWYERS,1. PREAMBLE. 1.1. The Function of the Lawyer i...,I. Преамбула. 1.1. Функция адвоката в обществе...,499
5,law,"The Indian Contract Act (ICA), 1872",Preamble. WHEREAS it is expedient to define an...,Преамбула. Принимая во внимание целесообразнос...,448


In [6]:
# Pair every prompt with every dataset row.
# Each tuple is (field, title, prompt, en text, human translation).

texts = df['en_text'].tolist()
fields = df['field'].tolist()
titles = df['title'].tolist()
human_ru = df['ru_human_translation'].tolist()
prompt_texts = []

for prompt in prompts:
    for field, title, text, human in zip(fields, titles, texts, human_ru):
        prompt_texts.append((field, title, prompt, text, human))

len(prompt_texts)

12

In [7]:
# Translate each English text under its prompt; the model output is appended
# to the tuple as a sixth element.

prompt_text_translations = []

for record in prompt_texts:
    # positions 2 and 3 of each tuple hold the prompt and the English text
    system_prompt, source_text = record[2], record[3]
    machine_output = translate(system_prompt, source_text, url, model, headers)
    prompt_text_translations.append(record + (machine_output,))

In [9]:
# Build the results frame (one row per prompt/text pair) and split every text
# column into a list of sentences.

df2 = pd.DataFrame(
    prompt_text_translations,
    columns=['field', 'title', 'prompt', 'en_text',
             'ru_human_translation', 'ru_machine_translation'],
)

df2['en_text'] = df2['en_text'].apply(sent_tokenize)
# sent_tokenize uses the English Punkt model unless told otherwise; the two
# translation columns are Russian, so request the Russian model explicitly.
df2['ru_human_translation'] = df2['ru_human_translation'].apply(sent_tokenize, language='russian')
df2['ru_machine_translation'] = df2['ru_machine_translation'].apply(sent_tokenize, language='russian')

In [10]:
def add_padding(list1, list2):
    '''Equalise the lengths of two lists by appending '' to the shorter one.

    Used when the human and machine translations differ in sentence count
    (equal lengths are required for evaluation).

    The shorter list is extended IN PLACE; the same two list objects that were
    passed in are returned as a tuple (list1, list2). When the lengths already
    match, neither list is touched.
    '''
    gap = len(list1) - len(list2)
    if gap > 0:
        # list2 is shorter
        list2.extend([''] * gap)
    elif gap < 0:
        # list1 is shorter
        list1.extend([''] * -gap)
    return list1, list2

In [11]:
# Pad the human and machine sentence lists of every row to equal length and
# store the results back in df2. Assigning to the `row` yielded by iterrows()
# only changes a copy of the row, so the two columns are rebuilt explicitly.
padded_pairs = [
    add_padding(human, machine)
    for human, machine in zip(df2['ru_human_translation'], df2['ru_machine_translation'])
]
df2['ru_human_translation'] = pd.Series(
    [human for human, _ in padded_pairs], index=df2.index, dtype=object
)
df2['ru_machine_translation'] = pd.Series(
    [machine for _, machine in padded_pairs], index=df2.index, dtype=object
)

In [12]:
# Regroup the frame into a nested mapping ready to be stored as .json:
# field -> prompt -> list of (source sentences, machine sentences, human sentences)

reference_candidate_dict = {}

for index, row in df2.iterrows():
    entry = (row['en_text'], row['ru_machine_translation'], row['ru_human_translation'])
    by_prompt = reference_candidate_dict.setdefault(row['field'], {})
    by_prompt.setdefault(row['prompt'], []).append(entry)

In [14]:
file_path = 'mt_for_eval.json'

# Serialise the nested dictionary as indented JSON and write it to file_path
with open(file_path, 'w') as json_file:
    json_file.write(json.dumps(reference_candidate_dict, indent=4))

print(f"Dictionary saved as '{file_path}'")

Dictionary saved as 'mt_for_eval.json'
