# Installations & Support Functions

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install --upgrade pip  # ensures that pip is current
!git clone https://github.com/google-research/bleurt.git
!pip install ./bleurt

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Cloning into 'bleurt'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 134 (delta 0), reused 17 (delta 0), pack-reused 116[K
Receiving objects: 100% (134/134), 31.28 MiB | 23.50 MiB/s, done.
Resolving deltas: 100% (49/49), done.
Processing ./bleurt
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from BLEURT==0.0.2)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [None]:
# Downloads the BLEURT-base checkpoint.
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip .
!unzip BLEURT-20.zip

In [None]:
# Runs the scoring.
!python -m bleurt.score_files \
  -candidate_file=bleurt/test_data/candidates \
  -reference_file=bleurt/test_data/references \
  -bleurt_checkpoint=BLEURT-20

In [None]:
import tensorflow as tf
tf.test.gpu_device_name() # If the GPU is enabled, it will give the following output '/device:GPU:0'

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [None]:
import nltk
import nltk.translate.bleu_score as bleu
import math
import numpy as np
import os
import bleurt
from bleurt import score as bleurt_score
import json

try:
  import openai
except:
  !pip install openai
  import openai

f = open("drive/MyDrive/files/api_key.txt", "r")
API_KEY=f.readline()
f.close()

#The OpenAI Key
os.environ['OPENAI_API_KEY'] =API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
def save_file(file_path, my_string):
  """Overwrite ``file_path`` with the given strings, one per line.

  ``my_string`` is an iterable of strings; each element is written followed
  by a newline. Passing an empty list therefore truncates the file, which is
  how the notebook resets a results file before a run. A confirmation
  message is printed once the file has been written.
  """
  with open(file_path, 'w') as out:
    out.writelines(entry + '\n' for entry in my_string)
  print(f"Data saved to {file_path}")

In [None]:
def update_file(file_path, text_to_append):
  """Append ``text_to_append`` to ``file_path`` as exactly one line.

  The results files are read back line by line and compared index-by-index
  with the test sentences, so every appended record must occupy a single
  line. Any line breaks inside the text (e.g. a multi-line model response)
  are replaced by spaces before writing; otherwise one response would show
  up as several records and shift every later prediction.
  """
  single_line = ' '.join(text_to_append.splitlines())
  with open(file_path, 'a') as file:
    file.write(single_line + '\n')

In [None]:
def batchify_list(input_list, batch_size):
    """Split ``input_list`` into consecutive slices of at most ``batch_size`` items.

    The final slice holds the remainder and may be shorter. An empty input
    yields an empty list of batches.
    """
    batches = []
    for start in range(0, len(input_list), batch_size):
        batches.append(input_list[start:start + batch_size])
    return batches

In [None]:
import numpy as np
from bleurt import score

def bleu_bleurt(predicted_labels=None, true_labels=None):
    """
    Calculate the average sentence-level BLEU and BLEURT scores of predicted
    translations against the reference (true) translations.

    Formats:
        predicted_labels: list(str) or list(list(str)); flattened before scoring
        true_labels: list(str), one reference per prediction, in the same order

    Returns:
        dict with keys 'bleu_score_avg' and 'bleurt_score_avg'.

    Raises:
        ValueError: if the number of predictions differs from the number of
            references after flattening.

    Note: the BLEURT checkpoint is loaded from /content/BLEURT-20 on every
    call.
    """

    scores_dict = {'bleu_score_avg': None,
                   'bleurt_score_avg': None}

    # flatten predicted_labels
    predicted_labels = np.array(predicted_labels).flatten().tolist()

    # zip() below would silently drop the unmatched tail, so a misaligned
    # input must be rejected up front instead of producing a wrong average.
    if len(predicted_labels) != len(true_labels):
        raise ValueError(
            f"Got {len(predicted_labels)} predictions but {len(true_labels)} references"
        )

    # scoring with bleu
    # sentence_bleu(references, hypothesis): the list of tokenised references
    # comes first, the tokenised candidate second. BLEU is not symmetric, so
    # the order matters.
    scores_bleu = [
        bleu.sentence_bleu([str(true).split()], str(predicted).split())
        for predicted, true in zip(predicted_labels, true_labels)
    ]
    scores_dict['bleu_score_avg'] = np.mean(scores_bleu)

    checkpoint = "/content/BLEURT-20"
    bleurt_scorer = bleurt_score.BleurtScorer(checkpoint)

    # scoring with bleurt
    scores_bleurt = bleurt_scorer.score(references=true_labels, candidates=predicted_labels,
                                        batch_size=128)
    scores_dict['bleurt_score_avg'] = np.mean(scores_bleurt)

    return scores_dict

In [None]:
def prompt_gpt(ft_model, prompt, test_lines, test_labels, few_shot, file_path, batch_size=25):
  """Translate every sentence in ``test_lines`` with a chat model and score the result.

  Each sentence is sent as its own request consisting of the system prompt,
  the optional few-shot example turns, and the sentence as the final user
  message. Successful translations are appended to ``file_path`` (via
  ``update_file``) and collected in memory; failed requests are recorded by
  index and their labels are excluded from scoring.

  Relies on the notebook-level ``client`` (OpenAI client), ``update_file``
  and ``bleu_bleurt``.

  Args:
    ft_model: model name passed to the chat completions endpoint.
    prompt: system prompt text.
    test_lines: list of English source sentences to translate.
    test_labels: reference translations, index-aligned with ``test_lines``.
    few_shot: 2 or 1 to prepend that many example pairs; anything else
      sends no examples.
    file_path: results file that receives one translation per line.
    batch_size: number of sentences between progress messages.

  Returns:
    (predicted_labels, scores_dict, error_indices)
  """
  # Example pairs are given as user/assistant turns, the same roles used for
  # the fine-tuning records, so the model sees them as prior exchanges rather
  # than as extra system instructions.
  # NOTE(review): "among", "sublet" and "pula" in the first Tagalog example
  # look like autocorrect artifacts - have a Tagalog speaker confirm the
  # intended wording.
  few_shot_turns = [
      {"role": "user", "content": 'English: The Cook Islands do not have any cities but are composed of 15 different islands.'},
      {"role": "assistant", "content": 'Tagalog: Walang kahit among siyudad ang Cook Islands sublet kinabibilangan ng 15 iba-ibang pula.'},
      {"role": "user", "content": 'English: The main ones are Rarotonga and Aitutaki.'},
      {"role": "assistant", "content": 'Tagalog: Ang pinakamalaki sa mga ito ay ang Rarotonga at Aitutaki.'}]

  if few_shot == 2:
    n_turns = 4
  elif few_shot == 1:
    n_turns = 2
  else:
    n_turns = 0
  base_messages = [{"role": "system", "content": prompt}] + few_shot_turns[:n_turns]

  predicted_labels = []
  error_indices = []
  total = len(test_lines)

  # Iterate the test_lines argument itself (not a notebook global) so the
  # indices recorded in error_indices always line up with test_labels.
  for batch_start in range(0, total, batch_size):
    print("Start of New Batch")
    batch_end = min(batch_start + batch_size, total)
    for i in range(batch_start, batch_end):
      try:
        # Build a fresh message list per sentence; reusing one list would
        # resend every earlier sentence with each new request.
        response = client.chat.completions.create(
          model=ft_model,
          messages=base_messages + [{"role": "user", "content": test_lines[i]}]
        )
        engtgl_translation = response.choices[0].message.content
        # Write first, then record: if the write fails the index is logged
        # as an error and no unmatched prediction is left in memory.
        update_file(file_path, engtgl_translation)
        predicted_labels.append(str(engtgl_translation))
      except Exception as e:
        error_indices.append(i)
        print(f"Error at index {i}: {str(e)}")
    print(str(batch_end) + ' out of ' + str(total) + ' completed')

  failed = set(error_indices)
  test_labels_noerror = [label for index, label in enumerate(test_labels) if index not in failed]
  scores_dict = bleu_bleurt(predicted_labels,test_labels_noerror)
  return predicted_labels, scores_dict, error_indices

# Load Flores200 Train, Val & Test Datasets

In [None]:
# load train, val, and test datasets
train_path = '/content/drive/MyDrive/w266_project/FloRes200/train_eng_inputs.txt'
val_path = '/content/drive/MyDrive/w266_project/FloRes200/val_eng_inputs.txt'
test_path = '/content/drive/MyDrive/w266_project/FloRes200/test_eng_inputs.txt'

train_label = '/content/drive/MyDrive/w266_project/FloRes200/train_tgl_labels.txt'
val_label = '/content/drive/MyDrive/w266_project/FloRes200/val_tgl_labels.txt'
test_label = '/content/drive/MyDrive/w266_project/FloRes200/test_tgl_labels.txt'


def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace removed."""
    with open(path, 'r') as handle:
        return [row.strip() for row in handle]


# English inputs, one sentence per line
train_lines = _read_stripped_lines(train_path)
val_lines = _read_stripped_lines(val_path)
test_lines = _read_stripped_lines(test_path)

# Tagalog labels, read from the matching *_tgl_labels files
train_labels = _read_stripped_lines(train_label)
val_labels = _read_stripped_lines(val_label)
test_labels = _read_stripped_lines(test_label)

In [None]:
print(f"Train dataset size: {len(train_lines)}")
print()
print(f"Val dataset size: {len(val_lines)}")
print()
print(f"Test dataset size: {len(test_lines)}")

Train dataset size: 1205

Val dataset size: 402

Test dataset size: 402


In [None]:
print(train_lines[0])
print(train_labels[0])

Scotturb Bus 403 travels regularly to Sintra, stopping at Cabo da Roca.
Regular na bumibiyahe ang Scotturb Bus 403 patungong Sintra, tumitigil sa Cabo da Roca.


In [None]:
test_batch = batchify_list(test_lines, 25)
print(len(test_batch))

17


# Load Compiled Test Dataset

In [None]:
import json

def print_json_structure(data, indent=0):
    """Print an outline of a dict: one ``key: summary`` line per entry.

    Nested dicts are expanded recursively with two extra spaces of
    indentation per level. Lists are summarised by their length, and when the
    first element is a dict its structure is printed as a sample. Any other
    value is summarised by its type name.
    """
    prefix = '  ' * indent
    for key, value in data.items():
        print(prefix + str(key), end=': ')
        if isinstance(value, dict):
            print()
            print_json_structure(value, indent + 1)
            continue
        if not isinstance(value, list):
            print(type(value).__name__)
            continue
        print(f'List of {len(value)} items')
        if value and isinstance(value[0], dict):
            print_json_structure(value[0], indent + 1)

In [None]:
file_path = '/content/drive/MyDrive/w266_project/Final_Splits/corpora_splits.json'

# Open and read the JSON file
with open(file_path, 'r') as file:
    data = json.load(file)

# Print the structure of the JSON file
print_json_structure(data)


train_eng_inputs: List of 41653 items
val_eng_inputs: List of 4629 items
test_eng_inputs: List of 1000 items
train_tgl_labels: List of 41653 items
val_tgl_labels: List of 4629 items
test_tgl_labels: List of 1000 items


In [None]:
native_speaker_test_lines = data['test_eng_inputs'][:5]
native_speaker_test_labels = data['test_tgl_labels'][:5]

print(f"Test dataset size: {len(native_speaker_test_lines)}")
print(f"Test label dataset size: {len(native_speaker_test_labels)}")

Test dataset size: 5
Test label dataset size: 5


In [None]:
print(native_speaker_test_lines[0])
print(native_speaker_test_labels[0])

A bottle fell onto the floor and shattered.
Bote isang nahulog papunta sa sahig at shattered.


# Test Flores200 Test Dataset on Baseline Model

In [None]:
file_path = "/content/drive/MyDrive/w266_project/gpt_results/BASELINE_predicted_engtgl_1.txt"
my_string = []
save_file(file_path, my_string)

Data saved to /content/drive/MyDrive/w266_project/gpt_results/BASELINE_predicted_engtgl_1000.txt


In [None]:
ft_model = "gpt-3.5-turbo"
prompt = "Translate from English to Tagalog."
few_shot = 0
file_path = "/content/drive/MyDrive/w266_project/gpt_results/BASELINE_predicted_engtgl_1.txt"

from openai import OpenAI
client = OpenAI()

predicted_labels, scores_dict, error_indices = prompt_gpt(ft_model, prompt, test_lines, test_labels, few_shot, file_path)
print("Number of predicted_labels:", len(predicted_labels))
print()
print("Number of error_indices:", len(error_indices))
print()
total_lines = len(predicted_labels) + len(error_indices)
print("Total predicted_labels and error_indices:", total_lines)
print()
print("BLEU and BLEURT Scores:", scores_dict)

In [None]:
file_path = "/content/drive/MyDrive/w266_project/gpt_results/BASELINE_predicted_engtgl_1.txt"

with open(file_path, 'r') as file:
    predicted_labels_load = [line.strip() for line in file]

print(len(predicted_labels_load))
print(len(test_lines))

# print(len(predicted_labels))

402
402


In [None]:
print("User input:", test_lines[-1])
print("GPT 3.5 Turbo response:", predicted_labels_load[-1])
print("\n")

print("User input:", test_lines[100])
print("GPT 3.5 Turbo response:", predicted_labels_load[100])
print("\n")

print("User input:", test_lines[0])
print("GPT 3.5 Turbo response:", predicted_labels_load[0])
print("\n")

User input: First, the switch for the light fixture needs to be turned off or the cable disconnected.
GPT 3.5 Turbo response: Unang-una, kailangan patayin ang switch ng ilaw o tanggalin ang kable.


User input: On Wednesday, the United States' National Basketball Association (NBA) suspended its professional basketball season due to concerns regarding COVID-19.
GPT 3.5 Turbo response: Sa Miyerkules, itinigil ng National Basketball Association (NBA) ng Estados Unidos ang kanilang propesyonal na basketball season dahil sa mga alalahanin ukol sa COVID-19.


User input: In 1994, the ethnically Armenian Nagorno-Karabakh region of Azerbaijan waged war against the Azeris.
GPT 3.5 Turbo response: Noong 1994, ang rehiyong etnikong Armenian ng Nagorno-Karabakh sa Azerbaijan ay nagsimulang mag-giyera laban sa mga Azeri.




In [None]:
scores_dict_baseline = bleu_bleurt(predicted_labels_load,test_labels)
print(scores_dict_baseline)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'bleu_score_avg': 0.21214525337448434, 'bleurt_score_avg': 0.5443149917754367}


# Fine-Tune Model on Flores200 Train & Val Dataset

In [None]:
# Ensure you have the same number of input and label sentences
# assert len(train_lines) == len(train_labels)

base = 'Translate from English to Tagalog.'

json_train = '/content/drive/MyDrive/w266_project/gpt-3.5_json_prompts/flores_eng_tgl_base_train.jsonl'
json_val = '/content/drive/MyDrive/w266_project/gpt-3.5_json_prompts/flores_eng_tgl_base_val.jsonl'


def _write_chat_jsonl(jsonl_path, sources, targets):
    """Write one chat-format record per (source, target) pair to ``jsonl_path``.

    Each line is a JSON object with a "messages" list holding the system
    prompt ``base``, the English sentence as the user turn and the Tagalog
    sentence as the assistant turn. Non-ASCII characters are written as-is.
    """
    with open(jsonl_path, 'w', encoding='utf-8') as out:
        for eng, tgl in zip(sources, targets):
            turns = [
                {"role": "system", "content": base},
                {"role": "user", "content": eng},
                {"role": "assistant", "content": tgl},
            ]
            out.write(json.dumps({"messages": turns}, ensure_ascii=False) + '\n')


# Train JSON
_write_chat_jsonl(json_train, train_lines, train_labels)

# Val JSON
_write_chat_jsonl(json_val, val_lines, val_labels)

In [None]:
from openai import OpenAI
client = OpenAI()

# Upload the training and validation JSONL files for fine-tuning.
# The files are opened in `with` blocks so the handles are closed as soon as
# each upload call returns instead of staying open for the kernel's lifetime.
with open(json_train, "rb") as train_fh:
  training_file = client.files.create(
    file=train_fh,
    purpose="fine-tune"
  )

with open(json_val, "rb") as val_fh:
  val_file = client.files.create(
    file=val_fh,
    purpose="fine-tune"
  )

In [None]:
print('Training File ID:', training_file.id)
print('Val File ID:', val_file.id)

Training File ID: file-thNXtFb9YUSBXq6nOT7tlBo9
Val File ID: file-wlnXi9Cg74YlVYDFo7E2p5Xg


In [None]:
response = client.fine_tuning.jobs.create(
  training_file= str(training_file.id),
  model="gpt-3.5-turbo",
  validation_file= str(val_file.id)
)

In [None]:
# check status
ft_jobs = client.fine_tuning.jobs.list(limit=1)
print(ft_jobs)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-KoKE1YnUtyMDDVq4BuE4R7xG', created_at=1700644029, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-5TeNP380fjLiOrsiQuUc1fET', result_files=[], status='validating_files', trained_tokens=None, training_file='file-ZTxKpM1RKL7AvRq8RD2sXSSK', validation_file='file-i1EWq3dS1fUDHoYKSKXMqP0h')], object='list', has_more=True)


# Hyperparameter-Tuning Iterations on Finetuned Model



In [None]:
from openai import OpenAI
client = OpenAI()

response = client.fine_tuning.jobs.create(
  training_file= str(training_file.id),
  model="gpt-3.5-turbo",
  validation_file= str(val_file.id),
  hyperparameters={
    "n_epochs":5,
    "batch_size" :'auto',
    "learning_rate_multiplier" : 'auto'}
)
# Val Loss: 0.4952
# ftjob-P2I8hieu9ShHIU8ok5pJH4uK

In [None]:
from openai import OpenAI
client = OpenAI()

response = client.fine_tuning.jobs.create(
  training_file= str(training_file.id),
  model="gpt-3.5-turbo",
  validation_file= str(val_file.id),
  hyperparameters={
    "n_epochs": 'auto',
    "batch_size" : 16,
    "learning_rate_multiplier" : 'auto'}
)
# Val Loss: 0.6515

In [None]:
from openai import OpenAI
client = OpenAI()

response = client.fine_tuning.jobs.create(
  training_file= str(training_file.id),
  model="gpt-3.5-turbo",
  validation_file= str(val_file.id),
  hyperparameters={
    "n_epochs": 'auto',
    "batch_size" : 'auto',
    "learning_rate_multiplier" : 1.0}
)
# Val Loss: 0.7853

In [None]:
from openai import OpenAI
client = OpenAI()

training_file = 'file-thNXtFb9YUSBXq6nOT7tlBo9'
val_file = 'file-wlnXi9Cg74YlVYDFo7E2p5Xg'

response = client.fine_tuning.jobs.create(
  training_file= str(training_file),
  model="gpt-3.5-turbo",
  validation_file= str(val_file),
  hyperparameters={
    "n_epochs":7,
    "batch_size" :'auto',
    "learning_rate_multiplier" : 'auto'}
)

# Val Loss: 0.9393

In [None]:
# FAILED due to lack of credits
from openai import OpenAI
client = OpenAI()

training_file = 'file-thNXtFb9YUSBXq6nOT7tlBo9'
val_file = 'file-wlnXi9Cg74YlVYDFo7E2p5Xg'

response = client.fine_tuning.jobs.create(
  training_file= str(training_file),
  model="gpt-3.5-turbo",
  validation_file= str(val_file),
  hyperparameters={
    "n_epochs":7,
    "batch_size" :16,
    "learning_rate_multiplier" : 'auto'}
)

In [None]:
# FAILED due to lack of credits

from openai import OpenAI
client = OpenAI()

# `training_file` / `val_file` are upload responses (with an `.id`) after the
# upload cell, but plain ID strings after the cells that assign the IDs
# directly. Accept either form so this cell does not depend on which of those
# cells ran last; str() of an upload response would not be a valid file ID.
training_file_id = getattr(training_file, "id", training_file)
val_file_id = getattr(val_file, "id", val_file)

response = client.fine_tuning.jobs.create(
  training_file= str(training_file_id),
  model="gpt-3.5-turbo",
  validation_file= str(val_file_id),
  hyperparameters={
    "n_epochs":5,
    "batch_size" : 16,
    "learning_rate_multiplier" : 'auto'}
)

In [None]:
# check status
ft_jobs = client.fine_tuning.jobs.list(limit=1)
print(ft_jobs)

# Apply Test Dataset on Finetuned Model

In [None]:
file_path = "/content/drive/MyDrive/w266_project/gpt_results/BASELINE_FINETUNE_predicted_engtgl_1000.txt"
my_string = []
save_file(file_path, my_string)

Data saved to /content/drive/MyDrive/w266_project/gpt_results/BASELINE_FINETUNE_predicted_engtgl_1.txt


In [None]:
test_batch = batchify_list(test_lines, 25)

In [None]:
ft_model = 'ft:gpt-3.5-turbo-0613:personal::8Nebv94z'
prompt = "Translate from English to Tagalog."
few_shot = 0
file_path = "/content/drive/MyDrive/w266_project/gpt_results/BASELINE_FINETUNE_predicted_engtgl_1000.txt"

from openai import OpenAI
client = OpenAI()

predicted_labels_ft, scores_dict_ft, error_indices_ft = prompt_gpt(ft_model, prompt, test_lines, test_labels, few_shot, file_path)

print("Number of predicted_labels:", len(predicted_labels_ft))
print()
print("Number of error_indices:", len(error_indices_ft))
print()
total_lines = len(predicted_labels_ft) + len(error_indices_ft)
print("Total predicted_labels and error_indices:", total_lines)
print()
print("BLEU and BLEURT Scores:", scores_dict_ft)

In [None]:
# Read back the predictions of the fine-tuned model from the same file that
# the run cell above resets and appends to (its `file_path`).
file_path = "/content/drive/MyDrive/w266_project/gpt_results/BASELINE_FINETUNE_predicted_engtgl_1000.txt"

with open(file_path, 'r') as file:
    predicted_labels_load_ft = [line.strip() for line in file]

# Keep at most one prediction per test sentence. Trimming to the test-set size
# (rather than dropping a fixed number of trailing lines) leaves a clean file
# untouched and still removes surplus lines at the end of the file.
if len(predicted_labels_load_ft) != len(test_lines):
    print(f"Warning: read {len(predicted_labels_load_ft)} lines for {len(test_lines)} test sentences")
predicted_labels_load_ft = predicted_labels_load_ft[:len(test_lines)]

In [None]:
print('This is the size of test_lines:', len(test_lines))
print('This is the size of predicted_labels:', len(predicted_labels_load_ft))

This is the size of test_lines: 402
This is the size of predicted_labels: 402


In [None]:
print("User input:", test_lines[-1])
print("FT GPT 3.5 response:", predicted_labels_load_ft[-1])
print("\n")

print("User input:", test_lines[100])
print("FT GPT 3.5 response:", predicted_labels_load_ft[100])
print("\n")

print("User input:", test_lines[0])
print("FT GPT 3.5 response:", predicted_labels_load_ft[0])
print("\n")

User input: First, the switch for the light fixture needs to be turned off or the cable disconnected.
FT GPT 3.5 response: Sa simula, ang parehong switch para sa mga ilaw sa kuwarto at sa labas nito ay kailangang naka-off o nakahiwalay ang kable.


User input: On Wednesday, the United States' National Basketball Association (NBA) suspended its professional basketball season due to concerns regarding COVID-19.
FT GPT 3.5 response: Ngayong Miyerkules, itinigil ng Pambansang Asosasyon ng Basketbol ng Estados Unidos (NBA) ang kanilang propesyonal na panahon ng basketbol dahil sa mga pangamba tungkol sa COVID-19.


User input: In 1994, the ethnically Armenian Nagorno-Karabakh region of Azerbaijan waged war against the Azeris.
FT GPT 3.5 response: Noong 1994, ang etnolinggwistikong rehiyon ng Nagorno-Karabakh ng Azebaijan ay sumalungat na sa giyeraan laban sa mga Azeri.




In [None]:
scores_dict_ft = bleu_bleurt(predicted_labels_load_ft,test_labels)
print(scores_dict_ft)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'bleu_score_avg': 0.16440750059417866, 'bleurt_score_avg': 0.4320800552024177}


# Apply Test Data on HP-Tuned FT Model

In [None]:
hp_model = 'ft:gpt-3.5-turbo-0613:personal::8SKb9G6V'
prompt = "Translate from English to Tagalog."
few_shot = 0
file_path = "/content/drive/MyDrive/w266_project/gpt_results/HP_FINETUNE_predicted_engtgl_1.txt"

from openai import OpenAI
client = OpenAI()

# update_file() appends, so start from an empty results file; otherwise lines
# left over from an earlier (partial) run stay in front of the new
# predictions and the file no longer lines up with the test sentences.
save_file(file_path, [])

HP_predicted_labels = []
# Total number of sentences to translate, used only for the progress message.
total = sum(len(batch) for batch in test_batch)
i = 0
for batch in test_batch:
  for item in batch:
    response = client.chat.completions.create(
      model=hp_model,
      messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": item}
      ]
    )
    engtgl_translation = response.choices[0].message.content
    update_file(file_path, engtgl_translation)
    HP_predicted_labels.append(str(engtgl_translation))
    i += 1
    print(str(i) + ' out of ' + str(total) + ' completed')

1 out of 402 completed
2 out of 402 completed
3 out of 402 completed
4 out of 402 completed
5 out of 402 completed
6 out of 402 completed
7 out of 402 completed
8 out of 402 completed
9 out of 402 completed
10 out of 402 completed
11 out of 402 completed
12 out of 402 completed
13 out of 402 completed
14 out of 402 completed
15 out of 402 completed
16 out of 402 completed
17 out of 402 completed
18 out of 402 completed
19 out of 402 completed
20 out of 402 completed
21 out of 402 completed
22 out of 402 completed
23 out of 402 completed
24 out of 402 completed
25 out of 402 completed
26 out of 402 completed
27 out of 402 completed
28 out of 402 completed
29 out of 402 completed
30 out of 402 completed
31 out of 402 completed
32 out of 402 completed
33 out of 402 completed
34 out of 402 completed
35 out of 402 completed
36 out of 402 completed
37 out of 402 completed
38 out of 402 completed
39 out of 402 completed
40 out of 402 completed
41 out of 402 completed
42 out of 402 completed
4

In [None]:
file_path = "/content/drive/MyDrive/w266_project/gpt_results/HP_FINETUNE_predicted_engtgl_1.txt"

with open(file_path, 'r') as file:
    predicted_labels_load_ft = [line.strip() for line in file]

In [None]:
print('This is the size of test_lines:', len(test_lines))
print('This is the size of HP_predicted_labels:', len(HP_predicted_labels))
print('This is the size of predicted_labels:', len(predicted_labels_load_ft))

This is the size of test_lines: 402
This is the size of HP_predicted_labels: 402
This is the size of predicted_labels: 404


In [None]:
print("User input:", test_lines[-1])
print("FT GPT 3.5 response:", HP_predicted_labels[-1])
print("\n")

print("User input:", test_lines[100])
print("FT GPT 3.5 response:", HP_predicted_labels[100])
print("\n")

print("User input:", test_lines[0])
print("FT GPT 3.5 response:", HP_predicted_labels[0])
print("\n")

User input: First, the switch for the light fixture needs to be turned off or the cable disconnected.
FT GPT 3.5 response: Una sa lahat, ang switch para sa ilaw ay dapat i-off o ang kable ay alisin.


User input: On Wednesday, the United States' National Basketball Association (NBA) suspended its professional basketball season due to concerns regarding COVID-19.
FT GPT 3.5 response: Nitong Miyerkules, tinigil ng Pambansang Asosasyon ng Basketbol ng Estados Unidos (NBA) ang kanilang propesyonal na season ng basketbol dahil sa mga pangamba hinggil sa COVID-19.


User input: In 1994, the ethnically Armenian Nagorno-Karabakh region of Azerbaijan waged war against the Azeris.
FT GPT 3.5 response: Noong 1994, ang etnikong rehiyon ng Nagorno-Karabakh na Armenia sa Azerbaijan ay nanimulang makipagdigma sa mga Azeris.




In [None]:
# HP Model Summary

HP_scores_dict = bleu_bleurt(HP_predicted_labels,test_labels)
print(HP_scores_dict)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'bleu_score_avg': 0.1712545398867303, 'bleurt_score_avg': 0.43872055613357036}


# Prompt Evaluation

In [None]:
# Building blocks shared by the prompt variants below; each variant is the
# listed fragments joined by single spaces.
_TASK = 'Translate from English to Tagalog.'
_BACKGROUND = 'Tagalog is an Austronesian language spoken in Luzon and neighboring islands and forming the basis of the standardized national language of the Philippines (Filipino). Its vocabulary has been much influenced by Spanish and English, and to some extent by Chinese, Sanskrit, Tamil, and Malay.'
_ROLE_TRANSLATOR = 'You are a professional English-Tagalog translator.'
_ROLE_SPEAKER = 'You are a fluent English-Tagalog speaker.'
_GOAL = 'We would like to translate English sentences to Tagalog.'
_REMINDER = 'Make sure to translate correctly.'

# A: bare task; B/C: role + task
promptA = _TASK
promptB = ' '.join([_ROLE_TRANSLATOR, _TASK])
promptC = ' '.join([_ROLE_SPEAKER, _TASK])
# D: language background + task; E/F: background + role + task
promptD = ' '.join([_BACKGROUND, _TASK])
promptE = ' '.join([_BACKGROUND, _ROLE_TRANSLATOR, _TASK])
promptF = ' '.join([_BACKGROUND, _ROLE_SPEAKER, _TASK])
# G/H: background + role + wording that announces the few-shot example(s)
promptG_1shot = ' '.join([_BACKGROUND, _ROLE_TRANSLATOR, _GOAL, 'Here is an example of a translation.', _REMINDER])
promptH_2shot = ' '.join([_BACKGROUND, _ROLE_SPEAKER, _GOAL, 'Here are some examples of translations.', _REMINDER])

# Prompt + Finetuning

In [None]:
file_path_1_bl = "/content/drive/MyDrive/w266_project/gpt_results/Prompt1_BASELINE_predicted_engtgl.txt"
file_path_2_bl = "/content/drive/MyDrive/w266_project/gpt_results/Prompt2_BASELINE_predicted_engtgl.txt"
file_path_3_bl = "/content/drive/MyDrive/w266_project/gpt_results/Prompt3_BASELINE_predicted_engtgl.txt"

P1_predicted_labels = []
P2_predicted_labels = []
P3_predicted_labels = []

save_file(file_path_1_bl, P1_predicted_labels)
save_file(file_path_2_bl, P2_predicted_labels)
save_file(file_path_3_bl, P3_predicted_labels)

Data saved to /content/drive/MyDrive/w266_project/gpt_results/Prompt1_BASELINE_predicted_engtgl.txt
Data saved to /content/drive/MyDrive/w266_project/gpt_results/Prompt2_BASELINE_predicted_engtgl.txt
Data saved to /content/drive/MyDrive/w266_project/gpt_results/Prompt3_BASELINE_predicted_engtgl.txt


In [None]:
prompt_1 = promptC
prompt_2 = promptD
prompt_3 = promptH_2shot

In [None]:
# Prompt 1

model = "gpt-3.5-turbo"
# ft_model = 'ft:gpt-3.5-turbo-0613:personal::8Nebv94z'

file_path = "/content/drive/MyDrive/w266_project/gpt_results/Prompt1_BASELINE_predicted_engtgl.txt"
prompt = promptC

from openai import OpenAI
client = OpenAI()

# Translate every test sentence with prompt C as the system message; each
# translation is appended to the results file and kept in memory.
P1_predicted_labels = []
i = 0
for batch in test_batch:
  for item in batch:
    chat_messages = [
      {"role": "system", "content": prompt},
      {"role": "user", "content": item},
    ]
    response = client.chat.completions.create(model=model, messages=chat_messages)
    engtgl_translation = response.choices[0].message.content
    update_file(file_path, engtgl_translation)
    P1_predicted_labels.append(str(engtgl_translation))
    i += 1
    print(f"{i} out of 402 completed")


In [None]:
file_path = "/content/drive/MyDrive/w266_project/gpt_results/Prompt1_BASELINE_predicted_engtgl.txt"

with open(file_path, 'r') as file:
    P1_predicted_labels_load = [line.strip() for line in file]

print("Number of predicted_labels:", len(P1_predicted_labels_load))
print()
print("Number of predicted_labels:", len(P1_predicted_labels))
print()

Number of predicted_labels: 403

Number of predicted_labels: 402



In [None]:
# Prompt 1 Summary

scores_dict_ft = bleu_bleurt(P1_predicted_labels,test_labels)
print(scores_dict_ft)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'bleu_score_avg': 0.20798708301216756, 'bleurt_score_avg': 0.5378931139350234}


In [None]:
# Prompt 2

model = "gpt-3.5-turbo"
# ft_model = 'ft:gpt-3.5-turbo-0613:personal::8Nebv94z'

file_path = "/content/drive/MyDrive/w266_project/gpt_results/Prompt2_BASELINE_predicted_engtgl.txt"
prompt = promptD

from openai import OpenAI
client = OpenAI()

# Translate every test sentence with prompt D as the system message; each
# translation is appended to the results file and kept in memory.
P2_predicted_labels = []
i = 0
for batch in test_batch:
  for item in batch:
    chat_messages = [
      {"role": "system", "content": prompt},
      {"role": "user", "content": item},
    ]
    response = client.chat.completions.create(model=model, messages=chat_messages)
    engtgl_translation = response.choices[0].message.content
    update_file(file_path, engtgl_translation)
    P2_predicted_labels.append(str(engtgl_translation))
    i += 1
    print(f"{i} out of 402 completed")


In [None]:
file_path = "/content/drive/MyDrive/w266_project/gpt_results/Prompt2_BASELINE_predicted_engtgl.txt"

# Read the saved Prompt 2 predictions back in: one list entry per line of the
# file, with surrounding whitespace stripped.
with open(file_path, 'r') as file:
    P2_predicted_labels_load = list(map(str.strip, file))

# First count = entries read from disk, second count = entries held in memory;
# each count is followed by a blank line.
print("Number of predicted_labels:", len(P2_predicted_labels_load), end="\n\n")
print("Number of predicted_labels:", len(P2_predicted_labels), end="\n\n")

Number of predicted_labels: 402

Number of predicted_labels: 402



In [None]:
# Prompt 2 Summary
#
# Score the Prompt 2 translations against test_labels with the notebook's
# bleu_bleurt helper, then print the returned dictionary (the recorded run shows
# the keys 'bleu_score_avg' and 'bleurt_score_avg').
P2_scores_dict = bleu_bleurt(
    P2_predicted_labels,
    test_labels,
)
print(P2_scores_dict)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'bleu_score_avg': 0.21840671367787864, 'bleurt_score_avg': 0.5481355164144466}


In [None]:
# Prompt 3
#
# Translate every sentence in test_batch with the base chat model, using
# promptH_2shot as the system prompt plus two worked English -> Tagalog
# demonstrations. Each translation is passed to update_file together with the
# results path and is also collected in P3_predicted_labels.

model = "gpt-3.5-turbo"
# ft_model = 'ft:gpt-3.5-turbo-0613:personal::8Nebv94z'

file_path = "/content/drive/MyDrive/w266_project/gpt_results/Prompt3_BASELINE_predicted_engtgl.txt"
prompt = promptH_2shot

from openai import OpenAI
client = OpenAI()

# The fixed part of every request, built once instead of on every iteration.
# Fix: the two demonstration answers are sent with role "assistant" (they were
# "system"), so the model sees them as example replies to the preceding user
# turns rather than as additional instructions.
# NOTE(review): the first Tagalog demonstration looks garbled ("Cock Islands",
# "among", "sublet", "pula" - possibly autocorrect damage). The text is left
# unchanged here; have a Tagalog speaker confirm and correct it.
few_shot_messages = [
  {"role": "system", "content": prompt},
  {"role": "user", "content": 'English: The Cook Islands do not have any cities but are composed of 15 different islands.'},
  {"role": "assistant", "content": 'Tagalog: Walang kahit among siyudad ang Cock Islands sublet kinabibilangan ng 15 iba-ibang pula.'},
  {"role": "user", "content": 'English: The main ones are Rarotonga and Aitutaki.'},
  {"role": "assistant", "content": 'Tagalog: Ang pinakamalaki sa mga ito ay ang Rarotonga at Aitutaki.'},
]

# Count the sentences up front so the progress line stays correct when the test
# set changes size (this used to be the hard-coded number 402). test_batch is
# walked twice here, which relies on it being re-iterable; the other prompt
# cells iterate the same object again as well.
total_items = sum(1 for batch in test_batch for _ in batch)

P3_predicted_labels = []
i = 0
for batch in test_batch:
  for item in batch:
    # Demonstrations first, then the sentence to translate as the final user turn.
    response = client.chat.completions.create(
      model=model,
      messages=few_shot_messages + [{"role": "user", "content": item}]
    )
    engtgl_translation = response.choices[0].message.content
    update_file(file_path, engtgl_translation)
    P3_predicted_labels.append(str(engtgl_translation))
    i += 1
    print(str(i) + ' out of ' + str(total_items) + ' completed')


In [None]:
file_path = "/content/drive/MyDrive/w266_project/gpt_results/Prompt3_BASELINE_predicted_engtgl.txt"

# Read the saved Prompt 3 predictions back in: one list entry per line of the
# file, with surrounding whitespace stripped.
with open(file_path, 'r') as file:
    P3_predicted_labels_load = list(map(str.strip, file))

# First count = entries read from disk, second count = entries held in memory;
# each count is followed by a blank line.
print("Number of predicted_labels:", len(P3_predicted_labels_load), end="\n\n")
print("Number of predicted_labels:", len(P3_predicted_labels), end="\n\n")

Number of predicted_labels: 402

Number of predicted_labels: 402



In [None]:
# Prompt 3 Summary
#
# Score the Prompt 3 translations against test_labels with the notebook's
# bleu_bleurt helper, then print the returned dictionary (the recorded run shows
# the keys 'bleu_score_avg' and 'bleurt_score_avg').
P3_scores_dict = bleu_bleurt(
    P3_predicted_labels,
    test_labels,
)
print(P3_scores_dict)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'bleu_score_avg': 0.21471100924878034, 'bleurt_score_avg': 0.5417923018595769}


# Gather Translations for Native Speaker Evaluation

In [None]:
# Output files for the native-speaker evaluation set, one per model / prompt variant.
BASELINE_filepath_ns = "/content/drive/MyDrive/w266_project/gpt_results/BASELINE_predicted_engtgl_ns.txt"
FINETUNE_filepath_ns = "/content/drive/MyDrive/w266_project/gpt_results/FINETUNE_predicted_engtgl_ns.txt"
P1_filepath_ns = "/content/drive/MyDrive/w266_project/gpt_results/Prompt1_BASELINE_predicted_engtgl_ns.txt"
P2_filepath_ns = "/content/drive/MyDrive/w266_project/gpt_results/Prompt2_BASELINE_predicted_engtgl_ns.txt"
P3_filepath_ns = "/content/drive/MyDrive/w266_project/gpt_results/Prompt3_BASELINE_predicted_engtgl_ns.txt"

# Rebind the three prompt label lists to fresh, empty lists.
P1_predicted_labels, P2_predicted_labels, P3_predicted_labels = [], [], []

# Call save_file once per output path, in the same order and with the same list
# objects as before. Every list passed is empty at this point, so which list is
# paired with which path makes no difference to what is handed to save_file.
# NOTE(review): presumably this resets each file before update_file adds lines
# to it - confirm against the save_file / update_file definitions.
for ns_path, empty_labels in (
  (BASELINE_filepath_ns, P1_predicted_labels),
  (FINETUNE_filepath_ns, P2_predicted_labels),
  (P1_filepath_ns, P3_predicted_labels),
  (P2_filepath_ns, P3_predicted_labels),
  (P3_filepath_ns, P3_predicted_labels),
):
  save_file(ns_path, empty_labels)

Data saved to /content/drive/MyDrive/w266_project/gpt_results/Prompt2_BASELINE_predicted_engtgl_ns.txt


In [None]:
# The five English sentences that the native-speaker evaluation cells translate.
native_speaker_test_lines = [
    "A bottle fell onto the floor and shattered.",
    "Look at me.",
    "Saying sorry for what happened, I don't think Hae Ra would want that.",
    "CAPULET Why, how now, kinsman! wherefore storm you so?",
    "And what of Irene Adler?",
]

In [None]:
# Echo each evaluation sentence, followed by a blank line.
for item in native_speaker_test_lines:
  print(item, end="\n\n")

A bottle fell onto the floor and shattered.

Look at me.

Saying sorry for what happened, I don't think Hae Ra would want that.

CAPULET Why, how now, kinsman! wherefore storm you so?

And what of Irene Adler?



In [None]:
# Baseline
#
# Translate the native-speaker evaluation sentences with the base
# gpt-3.5-turbo model and the plain translation prompt. Each translation is
# passed to update_file together with the baseline results path and is also
# collected in baseline_predicted_labels.
prompt_base = "Translate from English to Tagalog."
batch = native_speaker_test_lines
file_path = BASELINE_filepath_ns

from openai import OpenAI
client = OpenAI()

baseline_predicted_labels = []
i = 0
for item in batch:
  # One request per sentence: system prompt first, then the text to translate.
  response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "system", "content": prompt_base},
      {"role": "user", "content": item}
    ]
  )
  engtgl_translation = response.choices[0].message.content
  update_file(file_path, engtgl_translation)
  # Fix: collect into the list this cell creates. The results used to be
  # appended to P1_predicted_labels, leaving baseline_predicted_labels empty.
  baseline_predicted_labels.append(str(engtgl_translation))
  i += 1
  # The total comes from the batch itself rather than a hard-coded 5.
  print(str(i) + ' out of ' + str(len(batch)) + ' completed')


1 out of 5 completed
2 out of 5 completed
3 out of 5 completed
4 out of 5 completed
5 out of 5 completed


In [None]:
# Finetune
#
# Translate the native-speaker evaluation sentences with the fine-tuned model
# and the plain translation prompt. Each translation is passed to update_file
# together with the fine-tune results path and is also collected in
# finetune_predicted_labels.

prompt_base = "Translate from English to Tagalog."
batch = native_speaker_test_lines
file_path = FINETUNE_filepath_ns

from openai import OpenAI
client = OpenAI()

# Fix: this cell used to reset baseline_predicted_labels and then append its
# results to P1_predicted_labels. It now keeps its own list, so it neither
# clears the baseline list nor adds fine-tune output to the Prompt 1 list.
finetune_predicted_labels = []
i = 0
for item in batch:
  # One request per sentence: system prompt first, then the text to translate.
  response = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0613:personal::8Nebv94z",
    messages=[
      {"role": "system", "content": prompt_base},
      {"role": "user", "content": item}
    ]
  )
  engtgl_translation = response.choices[0].message.content
  update_file(file_path, engtgl_translation)
  finetune_predicted_labels.append(str(engtgl_translation))
  i += 1
  # The total comes from the batch itself rather than a hard-coded 5.
  print(str(i) + ' out of ' + str(len(batch)) + ' completed')


1 out of 5 completed
2 out of 5 completed
3 out of 5 completed
4 out of 5 completed
5 out of 5 completed


In [None]:
# Finetune + Hyperparameter Tuning
#
# Translate the native-speaker evaluation sentences with the model stored in
# hp_model. save_file is called with the results path and an empty list before
# the loop; each translation is then passed to update_file and collected in
# HP_predicted_labels.

hp_model = 'ft:gpt-3.5-turbo-0613:personal::8SKb9G6V'
prompt = "Translate from English to Tagalog."
batch = native_speaker_test_lines
few_shot = 0
file_path = "/content/drive/MyDrive/w266_project/gpt_results/HP_FINETUNE_predicted_engtgl_ns.txt"

HP_predicted_labels = []
save_file(file_path, HP_predicted_labels)

from openai import OpenAI
client = OpenAI()

i = 0
for item in batch:
  # System prompt first, then the sentence to translate.
  chat_messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": item},
  ]
  response = client.chat.completions.create(model=hp_model, messages=chat_messages)
  engtgl_translation = response.choices[0].message.content
  update_file(file_path, engtgl_translation)
  HP_predicted_labels.append(str(engtgl_translation))
  i += 1
  print(f"{i} out of 5 completed")

Data saved to /content/drive/MyDrive/w266_project/gpt_results/HP_FINETUNE_predicted_engtgl_ns.txt
1 out of 5 completed
2 out of 5 completed
3 out of 5 completed
4 out of 5 completed
5 out of 5 completed


In [None]:
# Prompt 1
#
# Translate the native-speaker evaluation sentences with the base chat model,
# using promptC as the system prompt. Each translation is passed to update_file
# together with the Prompt 1 results path and collected in P1_predicted_labels.

model = "gpt-3.5-turbo"
# ft_model = 'ft:gpt-3.5-turbo-0613:personal::8Nebv94z'

file_path = P1_filepath_ns
prompt = promptC
batch = native_speaker_test_lines

from openai import OpenAI
client = OpenAI()

P1_predicted_labels = []
i = 0
for item in batch:
  # System prompt first, then the sentence to translate.
  chat_messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": item},
  ]
  response = client.chat.completions.create(model=model, messages=chat_messages)
  engtgl_translation = response.choices[0].message.content
  update_file(file_path, engtgl_translation)
  P1_predicted_labels.append(str(engtgl_translation))
  i += 1
  print(f"{i} out of 5 completed")


1 out of 5 completed
2 out of 5 completed
3 out of 5 completed
4 out of 5 completed
5 out of 5 completed


In [None]:
# Prompt 2
#
# Translate the native-speaker evaluation sentences with the base chat model,
# using promptD as the system prompt. Each translation is passed to update_file
# together with the Prompt 2 results path and collected in P2_predicted_labels.

model = "gpt-3.5-turbo"
# ft_model = 'ft:gpt-3.5-turbo-0613:personal::8Nebv94z'

file_path = P2_filepath_ns
prompt = promptD
batch = native_speaker_test_lines

from openai import OpenAI
client = OpenAI()

P2_predicted_labels = []
i = 0
for item in batch:
  # System prompt first, then the sentence to translate.
  chat_messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": item},
  ]
  response = client.chat.completions.create(model=model, messages=chat_messages)
  engtgl_translation = response.choices[0].message.content
  update_file(file_path, engtgl_translation)
  P2_predicted_labels.append(str(engtgl_translation))
  i += 1
  print(f"{i} out of 5 completed")


1 out of 5 completed
2 out of 5 completed
3 out of 5 completed
4 out of 5 completed
5 out of 5 completed


In [None]:
# Prompt 3
#
# Translate the native-speaker evaluation sentences with the base chat model,
# using promptH_2shot as the system prompt plus two worked English -> Tagalog
# demonstrations. Each translation is passed to update_file together with the
# Prompt 3 results path and collected in P3_predicted_labels.

model = "gpt-3.5-turbo"
# ft_model = 'ft:gpt-3.5-turbo-0613:personal::8Nebv94z'

file_path = P3_filepath_ns
prompt = promptH_2shot
batch = native_speaker_test_lines

from openai import OpenAI
client = OpenAI()

# The fixed part of every request, built once instead of on every iteration.
# Fix: the two demonstration answers are sent with role "assistant" (they were
# "system"), so the model sees them as example replies to the preceding user
# turns rather than as additional instructions.
# NOTE(review): the first Tagalog demonstration looks garbled ("Cock Islands",
# "among", "sublet", "pula" - possibly autocorrect damage). The text is left
# unchanged here; have a Tagalog speaker confirm and correct it.
few_shot_messages = [
  {"role": "system", "content": prompt},
  {"role": "user", "content": 'English: The Cook Islands do not have any cities but are composed of 15 different islands.'},
  {"role": "assistant", "content": 'Tagalog: Walang kahit among siyudad ang Cock Islands sublet kinabibilangan ng 15 iba-ibang pula.'},
  {"role": "user", "content": 'English: The main ones are Rarotonga and Aitutaki.'},
  {"role": "assistant", "content": 'Tagalog: Ang pinakamalaki sa mga ito ay ang Rarotonga at Aitutaki.'},
]

P3_predicted_labels = []
i = 0
for item in batch:
  # Demonstrations first, then the sentence to translate as the final user turn.
  response = client.chat.completions.create(
    model=model,
    messages=few_shot_messages + [{"role": "user", "content": item}]
  )
  engtgl_translation = response.choices[0].message.content
  update_file(file_path, engtgl_translation)
  P3_predicted_labels.append(str(engtgl_translation))
  i += 1
  # The total comes from the batch itself rather than a hard-coded 5.
  print(str(i) + ' out of ' + str(len(batch)) + ' completed')


1 out of 5 completed
2 out of 5 completed
3 out of 5 completed
4 out of 5 completed
5 out of 5 completed
