In [1]:
from openai import OpenAI
import pandas as pd
client = OpenAI()

def darija_equivalent(id):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "نتا خبير فلهجة الدارجة المغربية وفالترجمة من الدارجة للنجليزية. " + 
            "غادي نعطيك واحد المثل شعبي فالدارجة المغربية، والمهمة ديالك هي تترجمها لمثل شعبي مكافئ أو جملة مكافئة فالانجليزية. " +
            "الترجمة خصها تكون بلا تفسيرات."},
            #{"role": "assistant", "content": "OK, I understand. I am ready for the translation."},
            {"role": "user", "content": "الگنازة كبيرة والميت فار."},
            {"role": "assistant", "content": "Making a mountain out of a molehill."},

            {"role": "user", "content":  f"{id}" }
            ], 
            #
        temperature=0  # Deterministic responses
        )
    return completion.choices[0].message.content

def eng_equivalent(id):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert in Language translation from Moroccan Darija dialect to English. " +
            "You will be provided with an idiom in Moroccan Darija dialect, "+
            "and your task is to translate it into an equivalent idiom or equivalent construction in English. "+
            "The translation must be with no explanations."},
            #{"role": "assistant", "content": "OK, I understand. I am ready for the translation."},
            {"role": "user", "content": "الگنازة كبيرة والميت فار."},
            {"role": "assistant", "content": "Making a mountain out of a molehill."},

            {"role": "user", "content":  f"{id}" }
            ], 
            #
        temperature=0  # Deterministic responses
        )
    return completion.choices[0].message.content


def darija_literal(id):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "نتا خبير فلهجة الدارجة المغربية وفالترجمة من الدارجة لنجليزية. " + 
            "غادي نعطيك واحد الجملة بالدارجة المغربية، والمهمة ديالك هي تترجمها لجملة بالانجليزية. " +
            "الترجمة خصها تكون حرفية بلا تفسيرات."},
            #{"role": "assistant", "content": "OK, I understand. I am ready for the translation."},
            {"role": "user", "content": "طلع تاكول الكرموس, نزل شكون قالها ليك."},
            {"role": "assistant", "content": "Come upstairs and have some figs, get downstairs who told you so!"},

            {"role": "user", "content":  f"{id}" }
            ], 
            #
        temperature=0  # Deterministic responses
        )
    return completion.choices[0].message.content


def eng_literal(id):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert in Language translation from Moroccan Darija dialect to English. " +
            "You will be provided with a sentence in Moroccan Darija dialect, "+
            "and your task is to translate it into an English sentence. "+
            "The translation must be literal with no explanations."},
            #{"role": "assistant", "content": "OK, I understand. I am ready for the translation."},
            {"role": "user", "content": "طلع تاكول الكرموس, نزل شكون قالها ليك."},
            {"role": "assistant", "content": "Come upstairs and have some figs, get downstairs who told you so!"},

            {"role": "user", "content":  f"{id}" }
            ], 
            #
        temperature=0  # Deterministic responses
        )
    return completion.choices[0].message.content

In [7]:
df = pd.read_csv('idioms.csv')

In [None]:
dataset = df.copy()
columns_to_delete = ['explanation']
dataset.drop(columns=columns_to_delete, inplace=True)

dataset['eng_equivalent'] = dataset['darija_ar'].apply(eng_equivalent)
dataset['eng_literal'] = dataset['darija_ar'].apply(eng_literal)
dataset['darija_equivalent'] = dataset['darija_ar'].apply(darija_equivalent)
dataset['darija_literal'] = dataset['darija_ar'].apply(darija_literal)

In [8]:
dataset.to_csv('id1.csv', index=False)

In [9]:
from sentence_transformers import SentenceTransformer, util
import plotly.graph_objects as go

dataset = pd.read_csv('id1.csv')

model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)


def evaluate_pairs(transl, eng):
    results = []
    for i in range(len(transl)):
        sentence1, sentence2 = transl[i], eng[i]
        embedding1 = model.encode(sentence1, convert_to_tensor=True)
        embedding2 = model.encode(sentence2, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
        results.append((i+2, sentence1, sentence2, similarity))

    return results

eng_equivalent = dataset['eng_equivalent']
eng_literal = dataset['eng_literal']
darija_equivalent =  dataset['darija_equivalent']
darija_literal =  dataset['darija_equivalent']

equivalent = dataset['equivalent']
literal = dataset['literal']

eng_similarities_eq = evaluate_pairs(eng_equivalent, equivalent)
eng_percentages_eq = [result[3] for result in eng_similarities_eq]

eng_similarities_lit = evaluate_pairs(eng_literal, literal)
eng_percentages_lit = [result[3] for result in eng_similarities_lit]

darija_similarities_eq = evaluate_pairs(darija_equivalent, equivalent)
darija_percentages_eq = [result[3] for result in darija_similarities_eq]

darija_similarities_lit = evaluate_pairs(darija_literal, literal)
darija_percentages_lit= [result[3] for result in darija_similarities_lit]

percentages = pd.DataFrame()
percentages['eng_equivalent'] = eng_percentages_eq
percentages['eng_literal'] = eng_percentages_lit
percentages['darija_equivalent'] = darija_percentages_eq
percentages['darija_literal'] = darija_percentages_lit

percentages.describe()




Unnamed: 0,eng_equivalent,eng_literal,darija_equivalent,darija_literal
count,21.0,21.0,21.0,21.0
mean,0.457229,0.673659,0.415567,0.541788
std,0.19118,0.20248,0.21314,0.262182
min,0.115195,0.20716,0.115195,0.10233
25%,0.273622,0.566386,0.245459,0.340761
50%,0.509145,0.725031,0.36305,0.496327
75%,0.602862,0.816178,0.602862,0.77073
max,0.75338,0.941359,0.806552,0.889043


In [15]:
import plotly.graph_objects as go
fig = go.Figure()

fig.add_trace(go.Box(y=percentages['eng_equivalent'], name='English equivalent translation prompt'))
fig.add_trace(go.Box(y=percentages['eng_literal'], name='English literal translation prompt'))
fig.add_trace(go.Box(y=percentages['darija_equivalent'], name='Darija equivalent translation prompt'))
fig.add_trace(go.Box(y=percentages['darija_literal'], name='Darija literal translation prompt'))

# Update layout
fig.update_layout(
    title='Box Plot of Similarity Scores for all the prompts.',
    yaxis_title='Similarity Score',
    xaxis_title='Prompt',
    boxmode='group'  # group the box plots together
)

# Show plot
fig.show()

In [19]:
import plotly.graph_objects as go


fig = go.Figure()

fig.add_trace(go.Histogram(histfunc="count", y=percentages['darija_equivalent'], x=percentages['darija_equivalent'], name="Darija equivalent translation prompt"))
fig.add_trace(go.Histogram(histfunc="count", y=percentages['darija_literal'], x=percentages['darija_literal'], name="Darija literal translation prompt"))

fig.show()

In [21]:
import plotly.graph_objects as go


fig = go.Figure()

fig.add_trace(go.Histogram(histfunc="count", y=percentages['eng_equivalent'], x=percentages['eng_equivalent'], name="English equivalent translation prompt"))
fig.add_trace(go.Histogram(histfunc="count", y=percentages['eng_literal'], x=percentages['eng_literal'], name="English literal translation prompt"))
fig.show()

In [22]:
print(eng_similarities_eq)
threshold = 0.6

filtered_elements = [element for element in eng_similarities_eq if element[3] <= threshold]
print(filtered_elements)


[(2, 'Mind your own business.', 'Putting the cart before the horse.', 0.21283793449401855), (3, 'The guilty dog barks the loudest.', 'The burnt child dreads the fire.', 0.4511890411376953), (4, 'He came to put kohl on her eyes.', 'Adding insult to injury.', 0.3413271903991699), (5, 'Empty vessels make the most noise.', 'Money talks.', 0.34882205724716187), (6, 'Speak well of your friend in public, but scold him in private.', 'Praise in public, criticize in private.', 0.7533799409866333), (7, 'Out of sight, out of mind.', 'Turning the blind eye.', 0.7438268661499023), (8, 'Mind your own business.', 'Turn the blind eye.', 0.6611579656600952), (9, 'The shoe only pinches the foot that wears it.', 'You never know a man until you walk a mile in his shoes.', 0.27362245321273804), (10, 'Mind your own beeswax.', 'breaking it down Barney style.', 0.542159914970398), (11, 'The crown needs a head.', 'It rains on the rich and poor alike.', 0.27181994915008545), (12, 'A shot in the dark.', 'Measure 

# The whole set

In [4]:
import pandas as pd
data = pd.read_csv('original.csv')

Index(['literal', 'equivalent', 'darija_ar'], dtype='object')


In [6]:
data['eng_equivalent'] = data['darija_ar'].apply(eng_equivalent)
data['eng_literal'] = data['darija_ar'].apply(eng_literal)
data['darija_equivalent'] = data['darija_ar'].apply(darija_equivalent)
data['darija_literal'] = data['darija_ar'].apply(darija_literal)

In [7]:
data.to_csv('original.csv', index=False)

In [23]:
from sentence_transformers import SentenceTransformer, util
import plotly.graph_objects as go

data = pd.read_csv('original.csv')

model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)


def evaluate_pairs(transl, eng):
    results = []
    for i in range(len(transl)):
        sentence1, sentence2 = transl[i], eng[i]
        embedding1 = model.encode(sentence1, convert_to_tensor=True)
        embedding2 = model.encode(sentence2, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
        results.append((i+2, sentence1, sentence2, similarity))

    return results

eng_equivalent = data['eng_equivalent']
eng_literal = data['eng_literal']
darija_equivalent =  data['darija_equivalent']
darija_literal =  data['darija_equivalent']

equivalent = data['equivalent']
literal = data['literal']

eng_similarities_eq = evaluate_pairs(eng_equivalent, equivalent)
eng_percentages_eq = [result[3] for result in eng_similarities_eq]

eng_similarities_lit = evaluate_pairs(eng_literal, literal)
eng_percentages_lit = [result[3] for result in eng_similarities_lit]

darija_similarities_eq = evaluate_pairs(darija_equivalent, equivalent)
darija_percentages_eq = [result[3] for result in darija_similarities_eq]

darija_similarities_lit = evaluate_pairs(darija_literal, literal)
darija_percentages_lit= [result[3] for result in darija_similarities_lit]

percentages = pd.DataFrame()
percentages['eng_equivalent'] = eng_percentages_eq
percentages['eng_literal'] = eng_percentages_lit
percentages['darija_equivalent'] = darija_percentages_eq
percentages['darija_literal'] = darija_percentages_lit

percentages.describe()



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



Unnamed: 0,eng_equivalent,eng_literal,darija_equivalent,darija_literal
count,35.0,35.0,35.0,35.0
mean,0.36135,0.684603,0.37911,0.521881
std,0.210571,0.188713,0.166336,0.235818
min,0.068562,0.20716,0.115195,0.202324
25%,0.185478,0.550585,0.251092,0.288982
50%,0.332063,0.725798,0.368509,0.505402
75%,0.498739,0.818048,0.469667,0.727899
max,0.860649,0.939632,0.806552,0.889043


In [24]:
import plotly.graph_objects as go
fig = go.Figure()

fig.add_trace(go.Box(y=percentages['eng_equivalent'], name='English equivalent translation prompt'))
fig.add_trace(go.Box(y=percentages['eng_literal'], name='English literal translation prompt'))
fig.add_trace(go.Box(y=percentages['darija_equivalent'], name='Darija equivalent translation prompt'))
fig.add_trace(go.Box(y=percentages['darija_literal'], name='Darija literal translation prompt'))

# Update layout
fig.update_layout(
    title='Box Plot of Similarity Scores for all the prompts.',
    yaxis_title='Similarity Score',
    xaxis_title='Prompt',
    boxmode='group'  # group the box plots together
)

# Show plot
fig.show()

In [32]:
threshold = 0.6
filtered_elements = [element for element in eng_similarities_lit if element[3] >= threshold]

# Create the DataFrame for_cot
for_cot = pd.DataFrame({
    'darija_ar': [data.loc[element[0]-2, 'darija_ar'] for element in filtered_elements],
    'equivalent translation': [data.loc[element[0]-2, 'equivalent'] for element in filtered_elements],
    'literal translation': [element[2] for element in filtered_elements],
    'prompt translation': [element[1] for element in filtered_elements],
    'similarity': [element[3] for element in filtered_elements]
})

# Output the for_cot DataFrame
print(for_cot)

                                            darija_ar  \
0      أش خاصك يا العريان, خاصني خاتم الذهب يا مولاي.   
1                      اللي عضو الحنش كيخاف من الحبل.   
2                       اللي ما عندو فلوس كلامو مسوس.   
3             امدح صاحبك مع الناس ولومو الراس فالراس.   
4                           دير عين شافت وعين ماشافت.   
5                                       دير عين ميكا.   
6                     اللي بغي لعسل يصبر لقريص النحل.   
7                           الگنازة كبيرة والميت فار.   
8                              قالو شمتك، قالو جربتك.   
9                    اللي تربي على شي حاجة كبر عليها.   
10                         لي حفر شي حفرة كيطيح فيها.   
11                              زواج ليلة تدبيره عام.   
12       الفقيه اللي نتسناو براكته دخل للجامع ببلغته.   
13  اربط حمارك مع الحمير، يتعلم الشيق والنهيق وخرج...   
14                        الفم المسدود ما يدخلو دبان.   
15              طلع تاكول الكرموس, نزل شكون قالها ليك   
16           ماديرش يدك في غيرا

In [36]:
for_cot.to_csv('id2.csv', index=False)