In [6]:
from openai import OpenAI
import pandas as pd
client = OpenAI()

df = pd.read_csv('idioms.csv')
def equivalent(id):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert in Language translation from Moroccan Darija dialect to English. " +
            "You will be provided with an idiom in Moroccan Darija dialect, "+
            "and your task is to translate it into an equivalent idiom or construction in English with no explanations."},
            #{"role": "assistant", "content": "OK, I understand. I am ready for the translation."},
            {"role": "user", "content": "طلع تاكول الكرموس, نزل شكون قالها ليك."},
            {"role": "assistant", "content": "Caught between a rock and a hard place."},

            {"role": "user", "content":  f"{id}" }
            ], 
            #
        temperature=0  # Deterministic responses
        )
    return completion.choices[0].message.content
def literal(id):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert in Language translation from Moroccan Darija dialect to English. " +
            "You will be provided with a sentence in Moroccan Darija dialect, "+
            "and your task is to translate it into English with no explanations."},
            #{"role": "assistant", "content": "OK, I understand. I am ready for the translation."},
            {"role": "user", "content": "طلع تاكول الكرموس, نزل شكون قالها ليك."},
            {"role": "assistant", "content": "Come upstairs and have some figs, get downstairs who told you so!"},

            {"role": "user", "content":  f"{id}" }
            ], 
            #
        temperature=0  # Deterministic responses
        )
    return completion.choices[0].message.content
print(df.columns)

Index(['literal_eng', 'equivalent_eng', 'darija_ar', 'explanation',
       'eng_equivalent_case', 'transl', 'literal'],
      dtype='object')


In [7]:
dataset = df.copy()
columns_to_delete = [ 'explanation', 'eng_equivalent_case', 'transl', 'literal']
dataset.drop(columns=columns_to_delete, inplace=True)
dataset['equivalent'] = dataset['darija_ar'].apply(equivalent)
dataset['literal'] = dataset['darija_ar'].apply(literal)


In [8]:
dataset.to_csv('id1.csv', index=False)

In [9]:
from sentence_transformers import SentenceTransformer, util
import plotly.graph_objects as go

dataset = pd.read_csv('id1.csv')

model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)


def evaluate_pairs(transl, eng):
    results = []
    for i in range(len(transl)):
        sentence1, sentence2 = transl[i], eng[i]
        embedding1 = model.encode(sentence1, convert_to_tensor=True)
        embedding2 = model.encode(sentence2, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
        results.append((i+2, sentence1, sentence2, similarity))

    return results

equivalent = dataset['equivalent']
literal = dataset['literal']
equivalent_eng =  dataset['equivalent_eng']
literal_eng =  dataset['equivalent_eng']

similarities_eq = evaluate_pairs(equivalent, equivalent_eng)
percentages_eq = [result[3] for result in similarities_eq]

similarities_lit = evaluate_pairs(literal, literal_eng)
percentages_lit = [result[3] for result in similarities_lit]

percentages = pd.DataFrame()
percentages['equivalent'] = percentages_eq
percentages['literal'] = percentages_lit
percentages.describe()



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



Unnamed: 0,equivalent,literal
count,35.0,35.0
mean,0.423609,0.395377
std,0.231222,0.224429
min,0.095434,0.046513
25%,0.26911,0.192386
50%,0.385503,0.376093
75%,0.511702,0.576362
max,0.982657,0.797132


In [10]:
fig = go.Figure()

fig.add_trace(go.Histogram(histfunc="count", y=percentages_eq, x=percentages_eq, name="Equivalent translation"))
fig.add_trace(go.Histogram(histfunc="count", y=percentages_lit, x=percentages_lit, name="Literal translation"))

fig.show()