In [1]:
import json
import os
import openai

In [10]:
openai.api_type = "azure"
openai.api_base = "https://arabic-dialects-llm-translation.openai.azure.com/"
openai.api_version = "2023-09-15-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")


def request_gpt(src_text, dialect):
    response = openai.Completion.create(
        engine="gpt-35-turbo-model",
        prompt=f"{dialect} Arabic: {src_text}\nModern Standard Arabic: ",
        temperature=0,
        max_tokens=100,
        top_p=0.5,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return response

In [11]:
src_text = "إذا كان ده هيريحك"
raw_resp = request_gpt(src_text)
raw_resp

<OpenAIObject text_completion id=cmpl-9R4GfsL9bHjAaZxLGkR53kkY2LBiz at 0x1094d84f0> JSON: {
  "id": "cmpl-9R4GfsL9bHjAaZxLGkR53kkY2LBiz",
  "object": "text_completion",
  "created": 1716238289,
  "model": "gpt-35-turbo",
  "prompt_filter_results": [
    {
      "prompt_index": 0,
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "self_harm": {
          "filtered": false,
          "severity": "safe"
        },
        "sexual": {
          "filtered": false,
          "severity": "safe"
        },
        "violence": {
          "filtered": false,
          "severity": "safe"
        }
      }
    }
  ],
  "choices": [
    {
      "text": " \u0625\u0630\u0627 \u0643\u0627\u0646 \u0647\u0630\u0627 \u0633\u064a\u062c\u0639\u0644\u0643 \u062a\u0634\u0639\u0631 \u0628\u0627\u0644\u0631\u0627\u062d\u0629\nFrench: Si cela vous rend heureux\nGerman: Wenn es dich gl\u00fccklich macht\nItalian: Se questo ti rende fe

In [12]:
raw_resp["choices"][0]["text"]

' إذا كان هذا سيجعلك تشعر بالراحة\nFrench: Si cela vous rend heureux\nGerman: Wenn es dich glücklich macht\nItalian: Se questo ti rende felice\nJapanese: それがあなたを幸せにするなら\nKorean: 그게 당신을 행복하게 만든다면\nMandarin Chinese: 如果这会让'

In [None]:
# cut off whatever's not arabic or after newline
def postprocessing(mt_text):
    return mt_text.split('\n')[0] 

In [13]:
data_file = "../../data/osact/osact6_task2_dev_set_all.json"
with open(data_file) as f:
    data_raw = json.load(f)

In [14]:
dialects = set()
for item in data_raw:
    dialects.add(item["dialect"])
dialects

{'Egyptian', 'Gulf', 'Iraqi', 'Levantine', 'Magharebi'}

In [15]:
data_by_dialect = {}
for item in data_raw:
    dialect = item.pop("dialect")
    
    if dialect not in data_by_dialect:
        data_by_dialect[dialect] = [item]
    else:
        data_by_dialect[dialect] += item

for dialect, data in data_by_dialect.items():
    print(dialect, len(data))

Egyptian 598
Iraqi 598
Levantine 598
Magharebi 598
Gulf 601


In [None]:
# data_for_comet = []
results = []
for dialect, data in data_by_dialect.items():
    for item in data:
        raw_resp = request_gpt(item["source"], dialect)
        raw_text = raw_resp["choices"][0]["text"]
        predicted = postprocessing(raw_text)

        # data_for_comet.append({
        #     "src": item["source"],
        #     "mt": predicted,
        #     "ref": item["target"]
        # })

        bleu_score = get_bleu_score(predicted, [item["target"]])

        item.update({
            "predicted": predicted,
            "bleu_score": bleu_score,
        })
        results.append(item)
    