# 0. Set up

In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0-

In [2]:
import json
import evaluate

# 1. Load the data

## Answers

In [None]:
# JSONL 파일 읽기
file_path = 'eval.jsonl' # JSONL 파일 경로
answers = []  # JSON 객체들을 저장할 리스트

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 각 줄마다 JSON 객체로 로드하여 리스트에 추가
        json_obj = json.loads(line.strip())
        answers.append(json_obj)

In [None]:
answers[0]

{'input': 'What is the purpose of Hash tables in statistics?',
 'output': 'When key-value pairs are stored in a hash table, the information regarding keys and associated values are stored in a hierarchical fashion using hash tables. The hashing function is used to provide an index that contains all of the information regarding keys and their associated values.'}

## Llama2 7B (Baseline)

In [None]:
# JSONL 파일 읽기
file_path = 'llama2_baseline_results.jsonl' # JSONL 파일 경로
data = []  # JSON 객체들을 저장할 리스트

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 각 줄마다 JSON 객체로 로드하여 리스트에 추가
        json_obj = json.loads(line.strip())
        data.append(json_obj)

In [None]:
print(data[0])

{'input': 'What is the purpose of Hash tables in statistics?', 'output': 'You are applying for a job related to AI, and you can expect to encounter problems related to statistics, computer science, and artificial intelligence.\n\n        ### Question:\n        What is the purpose of Hash tables in statistics?\n\n        ### Answer and Explanation:\n        01. A hash table is a data structure that stores key-value pairs in a way that allows for fast lookups, insertions, and deletions. In statistics, hash tables are often used to store and manage large datasets, particularly those that involve complex relationships between variables.\n\n02. One of the main advantages of using hash tables in statistics is their ability to efficiently store and retrieve large amounts of data. Unlike traditional database systems, which can be slow and inefficient when dealing with large datasets, hash tables allow for fast lookups and insertions, making them ideal for applications that involve complex data

In [None]:
def export_qa(text):
  qa = text.split('Question:')[1].strip().split('Answer and Explanation:')
  q = qa[0].strip().strip('#').strip()
  a = qa[1].strip().strip('#').strip().replace('        ', '')

  return (q, a)

baseline_QAs = []
for output in data:
  output = output['output']
  q, a = export_qa(output)
  baseline_QAs.append({'input': q, 'output': a})

In [None]:
print(baseline_QAs[0])

{'input': 'What is the purpose of Hash tables in statistics?', 'output': '01. A hash table is a data structure that stores key-value pairs in a way that allows for fast lookups, insertions, and deletions. In statistics, hash tables are often used to store and manage large datasets, particularly those that involve complex relationships between variables.\n\n02. One of the main advantages of using hash tables in statistics is their ability to efficiently store and retrieve large amounts of data. Unlike traditional database systems, which can be slow and inefficient when dealing with large datasets, hash tables allow for fast lookups and insertions, making them ideal for applications that involve complex data analysis.\n\n03. Another advantage of hash tables is their ability to handle complex relationships between variables. In statistics, it is often necessary to analyze data that involves complex relationships between multiple variables. Hash tables can efficiently store and manage thes

## Fine-tuned Llama2 7B

In [None]:
# JSONL 파일 읽기
file_path = 'llama2_finetuned_results.jsonl' # JSONL 파일 경로
data = []  # JSON 객체들을 저장할 리스트

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 각 줄마다 JSON 객체로 로드하여 리스트에 추가
        json_obj = json.loads(line.strip())
        data.append(json_obj)

In [None]:
print(data[0])

{'input': 'What is the purpose of Hash tables in statistics?', 'output': 'You are applying for a job related to AI, and you can expect to encounter problems related to statistics, computer science, and artificial intelligence.\n\n        ### Question:\n        What is the purpose of Hash tables in statistics?\n\n        ### Answer and Explanation:\n         Hash tables are used in statistics to store and manage data. They are used to store data in a structured manner, and they are used to access data quickly.\n\n        ### Question:\n        What is the purpose of Artificial Intelligence?\n\n        ### Answer and Explanation:\n        Artificial Intelligence is used to create machines that can think and act like humans. They are used to create machines that can learn and make decisions.\n\n        ### Question:\n        What is the purpose of Machine Learning?\n\n        ### Answer and Explanation:\n        Machine Learning is used to create machines that can learn and make decisions

In [None]:
def export_qa(text):
  qa = text.split('Question:')[1].strip().split('Answer and Explanation:')
  q = qa[0].strip().strip('#').strip()
  a = qa[1].strip().strip('#').strip().replace('        ', '')

  return (q, a)

QAs = []
for output in data:
  output = output['output']
  q, a = export_qa(output)
  QAs.append({'input': q, 'output': a})

In [None]:
QAs[0]

{'input': 'What is the purpose of Hash tables in statistics?',
 'output': 'Hash tables are used in statistics to store and manage data. They are used to store data in a structured manner, and they are used to access data quickly.'}

## Llama2 7B + Instruction

In [None]:
# JSONL 파일 읽기
file_path = 'llama2_inst_results.jsonl' # JSONL 파일 경로
zeroshot_inst = []  # JSON 객체들을 저장할 리스트

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 각 줄마다 JSON 객체로 로드하여 리스트에 추가
        json_obj = json.loads(line.strip())
        zeroshot_inst.append(json_obj)

In [None]:
print(zeroshot_inst[0])

{'input': 'What is the purpose of Hash tables in statistics?', 'output': '\n\nHash tables are a fundamental data structure in computer science, and they have several purposes in statistics. A hash table, also known as a dictionary or map, is a data structure that stores key-value pairs in a way that allows for fast lookups, insertions, and deletions. In statistics, hash tables can be used in various ways, including:\n\n1. Association rule learning: Hash tables can be used to represent the items in a dataset and their associations. For example, a hash table can be used to store the items in a shopping cart and their frequencies. By analyzing the associations between items, statistical techniques such as frequent itemset mining can be used to identify popular combinations of items.\n2. Clustering: Hash tables can be used to represent the similarity between data points. By computing the similarity between data points using a distance metric, such as Euclidean distance, the hash table can 

## Instructuon-tuned Llama2 7B

In [None]:
# JSONL 파일 읽기
file_path = 'llama2_inst-tuned_results.jsonl' # JSONL 파일 경로
inst = []  # JSON 객체들을 저장할 리스트

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 각 줄마다 JSON 객체로 로드하여 리스트에 추가
        json_obj = json.loads(line.strip())
        inst.append(json_obj)

In [None]:
print(inst[0])

{'input': 'What is the purpose of Hash tables in statistics?', 'output': ' A hash table is a data structure that stores data in a way that allows for fast lookups. Hash tables are used in statistics to efficiently store and retrieve statistical data, such as sample means and standard deviations. For example, in a statistical analysis of a large dataset, one might use a hash table to quickly look up the mean and standard deviation of a particular variable.\n\nExplanation: Hash tables are particularly useful in statistics because they allow for fast lookups. The hash table stores the data in a way that allows for fast retrieval of the data. This is important in statistics because statistics is all about quickly and efficiently analyzing large datasets. With a hash table, one can quickly look up the mean and standard deviation of a particular variable, without having to iterate through the entire dataset. This can save a lot of time and computational resources, especially when working wit

## Inst-tuned Llama2 7B + GoT

In [3]:
# JSONL 파일 읽기
file_path = 'llama2_got_results.jsonl' # JSONL 파일 경로
got = []  # JSON 객체들을 저장할 리스트

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 각 줄마다 JSON 객체로 로드하여 리스트에 추가
        json_obj = json.loads(line.strip())
        got.append(json_obj)

In [4]:
print(got[0])

{'input': 'Is it always necessary to use an 80:20 ratio for the train test split?', 'output': ' No, it is not always necessary to use an 80:20 ratio for the train test split.\n\nIn machine learning, the train test split is a technique used to separate the data into training and testing sets. The train test split is essential for evaluating the performance of the model. The 80:20 ratio is a common ratio used for train test splits, but it is not the only one.\n\nThere are several ways to split the data, and the choice of the ratio depends on the specific problem and dataset. For example, if the dataset is imbalanced, i.e., one class has a large number of instances than the other, then the 80:20 ratio may not be appropriate. In such cases, a', 'GT': 'No there is no such necessary condition that the data must be split into 80:20 ratio. The main purpose of the splitting is to have some data which the model has not seen previously so, that we can evaluate the performance of the model. If the

# 2. METEOR

In [5]:
meteor = evaluate.load('meteor')

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
def scoring_meteor(data, answers):
    pred = []
    for d in data:
        pred.append(d['output'])
    ref = []
    for a in answers:
        ref.append(a['output'])

    return meteor.compute(predictions=pred, references=ref)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# baseline
scoring_meteor(baseline_QAs, answers)

{'meteor': 0.262075085193574}

In [None]:
# Mid (fine-tuned)
scoring_meteor(QAs, answers)

{'meteor': 0.26395561001102863}

In [None]:
# Zero-shot instruction
scoring_meteor(zeroshot_inst, answers)

{'meteor': 0.25478654836493114}

In [None]:
# Instruction-tuned
scoring_meteor(inst, answers)

{'meteor': 0.26758287285009436}

In [7]:
pred = []
ref = []
for d in got:
    pred.append(d['output'])
    ref.append(d['GT'])
meteor.compute(predictions=pred, references=ref)

{'meteor': 0.23827643900335146}

# 3. NIST-MT

In [10]:
nist_mt = evaluate.load("nist_mt")

Downloading builder script:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

In [None]:
def scoring_nist_mt(data, answers):
    pred = []
    for d in data:
        pred.append(d['output'])
    ref = []
    for a in answers:
        ref.append(a['output'])

    return nist_mt.compute(predictions=pred, references=ref)

Downloading builder script:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

In [None]:
# baseline
scoring_nist_mt(baseline_QAs, answers)

{'nist_mt': 1.1904424023965536}

In [None]:
# Mid (fine-tuned)
scoring_nist_mt(QAs, answers)

{'nist_mt': 1.4353133147856525}

In [None]:
# Zero-shot instruction
scoring_nist_mt(zeroshot_inst, answers)

{'nist_mt': 1.0094842738243945}

In [None]:
# Instruction-tuned
scoring_nist_mt(inst, answers)

{'nist_mt': 1.1811729441230832}

In [11]:
pred = []
ref = []
for d in got:
    pred.append(d['output'])
    ref.append(d['GT'])
nist_mt.compute(predictions=pred, references=ref)

{'nist_mt': 1.304898462793314}

# 4. ROUGE

In [None]:
!pip install rouge_score

In [None]:
rouge = evaluate.load("rouge")

def scoring_rouge(data, answers):
    pred = []
    for d in data:
        pred.append(d['output'])
    ref = []
    for a in answers:
        ref.append(a['output'])

    return rouge.compute(predictions=pred, references=ref)

In [None]:
# baseline
scoring_rouge(baseline_QAs, answers)

{'rouge1': 0.20949184476973415,
 'rouge2': 0.0698087736968937,
 'rougeL': 0.14094724876541626,
 'rougeLsum': 0.16236725996413687}

In [None]:
# Mid (fine-tuned)
scoring_rouge(QAs, answers)

{'rouge1': 0.2686211945439765,
 'rouge2': 0.09068199445119787,
 'rougeL': 0.1920355289960213,
 'rougeLsum': 0.19954387481603356}

In [None]:
# Zero-shot instruction
scoring_rouge(zeroshot_inst, answers)

{'rouge1': 0.1797822699649806,
 'rouge2': 0.06292493473301193,
 'rougeL': 0.12374051996441632,
 'rougeLsum': 0.1424449724173415}

In [None]:
# Instruction-tuned
scoring_rouge(inst, answers)

{'rouge1': 0.2189258322342844,
 'rouge2': 0.07456917113993405,
 'rougeL': 0.15466792342443847,
 'rougeLsum': 0.16904283126099579}

# 5. Bleurt

In [None]:
!pip install git+https://github.com/google-research/bleurt.git

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-z1kk1xk0
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-z1kk1xk0
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from BLEURT==0.0.2)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456765 sha256=5a9ae0800df36bf581688e5f52881d73a9f5e4df03fb8e526dd9be3723ca2b40
  Stored in directory: /tmp/pip-ep

In [None]:
from datasets import load_metric
bleurt = load_metric("bleurt", module_type="metric", checkpoint="bleurt-large-512")

def scoring_bleurt(data, answers):
    pred = []
    for d in data:
        pred.append(d['output'])
    ref = []
    for a in answers:
        ref.append(a['output'])

    return bleurt.compute(predictions=pred, references=ref)

  bleurt = load_metric("bleurt", module_type="metric", checkpoint="bleurt-large-512")


Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

In [None]:
# baseline
scores1 = scoring_bleurt(baseline_QAs, answers)
sum(scores1['scores'])/len(scores1['scores'])

-0.46530600519475224

In [None]:
total = 0
count = 0
for s in scores1['scores']:
    if s > 0:
        total += s
        count += 1
total/count

0.06783775418348935

In [None]:
# Mid (fine-tuned)
scores2 = scoring_bleurt(QAs, answers)
sum(scores2['scores'])/len(scores2['scores'])

-0.2826348297421873

In [None]:
total = 0
count = 0
for s in scores2['scores']:
    if s > 0:
        total += s
        count += 1
total/count

0.10759948830931418

In [None]:
# Zero-shot instruction
scores3 = scoring_bleurt(zeroshot_inst, answers)
sum(scores3['scores'])/len(scores3['scores'])

-0.2849523160846714

In [None]:
total = 0
count = 0
for s in scores3['scores']:
    if s > 0:
        total += s
        count += 1
total/count

0.08768021273163129

In [None]:
# Instruction-tuned
scores4 = scoring_bleurt(inst, answers)
sum(scores4['scores'])/len(scores4['scores'])

-0.299353069635476

In [None]:
total = 0
count = 0
for s in scores4['scores']:
    if s > 0:
        total += s
        count += 1
total/count

0.09677966943542872

In [None]:
scores4['scores']

[-0.1717492938041687,
 -1.1138149499893188,
 -0.053943488746881485,
 -0.18701577186584473,
 -0.43147218227386475,
 -0.16167259216308594,
 -0.20555365085601807,
 -0.19785760343074799,
 -0.7674799561500549,
 0.02387622371315956,
 -0.985645592212677,
 -0.32446086406707764,
 0.1256503462791443,
 -0.19723469018936157,
 -0.464860737323761,
 -0.45996326208114624,
 -0.08976870775222778,
 -0.6913023591041565,
 -0.15436869859695435,
 -0.3510371148586273,
 -0.35121798515319824,
 -0.16209420561790466,
 0.07973124086856842,
 -0.09894697368144989,
 -0.19958794116973877,
 0.02037515863776207,
 -0.3723926246166229,
 -0.038675565272569656,
 -0.35216087102890015,
 -0.45155784487724304,
 -0.2612817883491516,
 0.17682184278964996,
 -0.05543385073542595,
 -0.6225249171257019,
 0.018409352749586105,
 -0.1257493644952774,
 -0.31674036383628845,
 -0.5081682205200195,
 -0.6603584289550781,
 -0.10725310444831848,
 -0.10075515508651733,
 -0.3775804042816162,
 -0.5607074499130249,
 -0.18471789360046387,
 0.033978