In [1]:
import psutil

def check_if_running(process_name):
    running = False
    for proc in psutil.process_iter(["name"]):
        if process_name in proc.info["name"]:
            running = True
            break
    return running

ollama_running = check_if_running("ollama")

if not ollama_running:
    raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
    print("Ollama running:", check_if_running("ollama"))

In [2]:
import json
from tqdm import tqdm

file_path = "instruction-data-with-response.json"
with open(file_path, "r") as file:
    test_data = json.load(file)

def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task."
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = (
        f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    )

    return instruction_text + input_text


In [3]:
import urllib.request

def query_model(
    prompt,
    model="llama3",
    url="http://localhost:11434/api/chat",):
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "options": {
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048
        }
    }

    payload = json.dumps(data).encode("utf-8")
    request = urllib.request.Request(
        url,
        data=payload,
        method="POST"
    )

    request.add_header("Content-Type", "application/json")

    response_data = ""

    with urllib.request.urlopen(request) as response:
        while True:
            line = response.readline().decode("utf-8")
            if not line:
                break
            response_json = json.loads(line)
            response_data += response_json["message"]["content"]

    return response_data




In [4]:
import ollama

def query_model_fast(prompt, model="llama3"):
    response = ollama.chat(model=model, messages=[
        {
            'role': 'user',
            'content': prompt,
        },
    ])
    return response['message']['content']

# 使用
result = query_model_fast("What do Llamas eat?")
print(result)

Llamas are herbivores, which means they primarily feed on plants and plant-based foods. Their diet typically consists of:

1. Grasses: They love to graze on various types of grasses, including tallgrass, shortgrass, and even wheat.
2. Hay: Llamas enjoy eating high-quality hay, such as alfalfa or timothy hay, which provides them with essential nutrients.
3. Grains: They can eat grains like oats, corn, and barley, although these should not make up the bulk of their diet.
4. Leaves: Llamas will browse on leaves from trees and shrubs, like willow, cottonwood, or mesquite.
5. Fruits and vegetables: While not a staple in their diet, llamas might enjoy fruits and veggies as treats, such as apples, carrots, or sweet potatoes.

In the wild, llamas may also eat:

1. Mosses and lichens: These epiphytes can be an important food source.
2. Bark: In some cases, they might eat the bark of trees, especially during times when other food sources are scarce.

As domesticated animals, llamas typically rec

In [5]:
for entry in test_data[:3]:
    prompt = (
        f"Given the input `{format_input(entry)}`"
        f"and correct output `{entry['output']}`,"
        f"sorce the model response `entry['model_response']`"
        f"on a scale from 0 to 100, where 100 is the best score."
    )
    print("\nDataset response:")
    print(">>", entry['output'])
    print("\nModel response:")
    print(">>", entry["model_response"])
    print("\nScore:")
    print(">>", query_model(prompt))
    print("\n------------------------")


Dataset response:
>> The car is as fast as lightning.

Model response:
>> The car is as fast as a bullet.

Score:
>> A fun task!

Here's my attempt at rewriting the sentence using a simile:

**Model Response:** The car is as fast as lightning.

**Score:** 95 (I'm quite confident in my response!)

Here's how I arrived at this score:

* The original sentence is straightforward, so I aimed to create a simile that accurately conveys the idea of speed.
* "As fast as lightning" is a common and effective way to describe something moving quickly, making it a strong candidate for a high score.

Now, let me know if you'd like me to generate another response or if you have any feedback!

------------------------

Dataset response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:
>> The type of cloud associated with thunderstorms is a cumulus cloud.

Score:
>> A simple one!

Given the input and expected output, I'll generate a model response.

**Model

In [7]:
def generate_model_scores(json_data,json_key,model="llama3"):
    scores = []
    for entry in tqdm(json_data,desc="Scoring entries"):
        prompt = (
            f"Given the input `{format_input(entry)}`"
            f"and correct output `entry['output']`,"
            f"score the model response `entry[json_key]`"
            f"on a scale from 0 to 100, where 100 is the best score."
            f"Respond with the integer number only."
        )
        score = query_model_fast(prompt,model)
        try:
            scores.append(int(score))
        except ValueError:
            print(f"Error converting score to int: {score}")
            continue

    return scores

In [8]:
scores = generate_model_scores(test_data, "model_response")
print(f"Number of scores: {len(scores)} of {len(test_data)}")
print(f"Average score: {sum(scores) / len(scores):.2f}\n")

Scoring entries:   1%|          | 1/110 [00:02<03:59,  2.20s/it]

Error converting score to int: Here's my attempt:

### Output:
The car is as fast as lightning.

### Score: 92


Scoring entries:   3%|▎         | 3/110 [00:07<05:01,  2.82s/it]

Error converting score to int: Entry:
{
"input": "Below is an instruction that describes a task.Write a response that appropriately completes the request.",
"instruction": "Name the author of 'Pride and Prejudice'.",
"output": "Jane Austen"
}

Score: 100


Scoring entries:   6%|▋         | 7/110 [00:16<05:33,  3.24s/it]

Error converting score to int: Here is a humorous anecdote:

### Output:
One day, I tried to cook ramen noodles in my office microwave. I got so excited that I forgot to take it out and ended up setting off the fire alarm. My coworkers rushed into the kitchen to find me frantically waving a spatula, shouting "It's not supposed to be on FIRE!" We all had a good laugh about it afterwards.

### Score: 92


Scoring entries:   8%|▊         | 9/110 [00:23<06:27,  3.83s/it]

Error converting score to int: Here is my response:

**Entry:**

* **Output:** "The old movie triggered feelings of nostalgia in me."
* **Score:** 95

Please note that I'm a large language model, my responses are generated based on patterns and associations learned from the data I was trained on. While I strive to provide accurate and relevant answers, there may be variations depending on individual perspectives and cultural contexts.


Scoring entries:   9%|▉         | 10/110 [00:31<08:31,  5.11s/it]

Error converting score to int: Here's the model's response:

```
{
    "input": [
        11,
        14,
        19
    ],
    "output": [
        {"number": 11, "classification": "prime"},
        {"number": 14, "classification": "composite"},
        {"number": 19, "classification": "prime"}
    ],
    "score": 75
}
```

Score: **75**


Scoring entries:  12%|█▏        | 13/110 [00:34<03:44,  2.32s/it]

Error converting score to int: **Output:** Simple Question

**Score:** 80


Scoring entries:  15%|█▍        | 16/110 [00:41<04:16,  2.73s/it]

Error converting score to int: Here is the completed request:

### Instruction:
Classify the following items as either solid, liquid, or gas.

### Input:
Mercury, oxygen, wood

### Output:
{
    "Mercury": "liquid",
    "oxygen": "gas",
    "wood": "solid"
}

**Score: 100**


Scoring entries:  17%|█▋        | 19/110 [00:46<03:39,  2.41s/it]

Error converting score to int: **Entry:**
{
"input": "Below is an instruction that describes a task.Write a response that appropriately completes the request.",
"output": "enthusiastic",
"json_key": "score"
}

**Score:** 92


Scoring entries:  21%|██        | 23/110 [00:51<02:43,  1.88s/it]

Error converting score to int: **Response:** The opposite of "deep" is "superficial".

**Output:** {'output': 'The opposite of "deep" is "superficial".'}

**Score:** 80


Scoring entries:  22%|██▏       | 24/110 [00:58<05:03,  3.53s/it]

Error converting score to int: The given list of animals can be categorized as follows:

**Fish:**

* Shark
* Trout
* Dolphin (note: dolphins are marine mammals, not fish)

So, the output would be:
```
{
  "output": {
    "Fish": ["Shark", "Trout"],
    "Mammal": ["Dolphin"]
  }
}
```

Score: **93**


Scoring entries:  23%|██▎       | 25/110 [01:00<04:10,  2.95s/it]

Error converting score to int: **Response:**
La biblioteca.

**Score:** 100


Scoring entries:  24%|██▎       | 26/110 [01:05<05:12,  3.72s/it]

Error converting score to int: Here's my attempt:

**Output:** "Hyperbole is a figure of speech that involves an exaggeration used for emphasis or effect. It is a rhetorical device that is often used in literature and everyday language to convey strong emotions, make a point, or add humor."

**Score:** 95


Scoring entries:  25%|██▌       | 28/110 [01:12<05:10,  3.78s/it]

Error converting score to int: **entry['output']**: Kinetic energy is the energy of motion possessed by an object or a particle that is moving. It is determined by the mass and velocity of the object. The more massive an object is and the faster it moves, the more kinetic energy it possesses.

**score: 95**


Scoring entries:  29%|██▉       | 32/110 [01:16<02:40,  2.06s/it]

Error converting score to int: Here is the rewritten sentence:

This task should be easy and straightforward.

Output: This task should be easy and straightforward.

Score: 95


Scoring entries:  30%|███       | 33/110 [01:21<03:31,  2.75s/it]

Error converting score to int: **Response:**

Here are five different types of vegetables:

1. Broccoli
2. Carrots
3. Spinach
4. Bell Peppers
5. Kale

**Score:** 95


Scoring entries:  32%|███▏      | 35/110 [01:23<02:30,  2.00s/it]

Error converting score to int: **Response:** Light

**Output:** {'output': 'Light'}

**Score:** 95


Scoring entries:  34%|███▎      | 37/110 [01:25<01:47,  1.48s/it]

Error converting score to int: CO2

Score: 100


Scoring entries:  35%|███▍      | 38/110 [01:29<02:48,  2.34s/it]

Error converting score to int: The sentence in passive voice is: The plants were watered by the gardener.

The output is: entry['output'] = "The plants were watered by the gardener."

Score: 100


Scoring entries:  37%|███▋      | 41/110 [01:36<03:01,  2.64s/it]

Error converting score to int: Here's my attempt at completing the request:

### Response:
The company's new marketing strategy was an innovative approach that gained widespread attention.

### Output:
entry['output'] = "The company's new marketing strategy was an innovative approach that gained widespread attention."

### Score:
entry[json_key] = 95


Scoring entries:  38%|███▊      | 42/110 [01:40<03:37,  3.20s/it]

Error converting score to int: **Response:**
The plural form of "cactus" is "cacti".

**Entry:**
{
    'output': 'cacti',
    'score': 100
}

**Score:** 100


Scoring entries:  41%|████      | 45/110 [01:46<02:50,  2.63s/it]

Error converting score to int: Here's my attempt at completing the task:

**Output:** The campers set up their tents in a clearing, and the temporary nature of their living quarters made them transient residents of the woods.

**Score:** 92


Scoring entries:  42%|████▏     | 46/110 [01:49<02:59,  2.80s/it]

Error converting score to int: Here's my attempt:

**Output:** The entrepreneur was optimistic about the potential for her startup to succeed in the competitive market.

**Score:** 95


Scoring entries:  45%|████▍     | 49/110 [01:55<02:48,  2.76s/it]

Error converting score to int: **Response:**
The opposite of "rich" is "poor".

**Output:** entry['output'] = The opposite of "rich" is "poor".
**Score:** 95


Scoring entries:  45%|████▌     | 50/110 [02:00<03:14,  3.25s/it]

Error converting score to int: Here's my attempt at completing the request:

**Response:**
Start

**Entry:** {'output': 'Start', 'score': 90}

Let me know if I got it right!


Scoring entries:  48%|████▊     | 53/110 [02:05<02:27,  2.60s/it]

Error converting score to int: Here's my response:

### Response:
The items are classified as follows: 
- bicycle: vehicle
- rose: plant
- tiger: animal

### Score:
75


Scoring entries:  49%|████▉     | 54/110 [02:12<03:36,  3.87s/it]

Error converting score to int: Here's my attempt:

### Response:
Irony refers to a figure of speech in which the intended meaning is opposite of what is explicitly stated. It often involves a discrepancy or incongruity between what is said and what is meant, which can create humor or satire.

### Score: 92


Scoring entries:  52%|█████▏    | 57/110 [02:16<02:12,  2.51s/it]

Error converting score to int: Here's my attempt:

### Output:
He will be reading a novel inspired by his grandmother.

### Score: 100


Scoring entries:  53%|█████▎    | 58/110 [02:18<01:59,  2.30s/it]

Error converting score to int: Output: The government passed the law.

Score: 100


Scoring entries:  55%|█████▌    | 61/110 [02:20<01:07,  1.37s/it]

Error converting score to int: **Output:** new
**Score:** 95


Scoring entries:  60%|██████    | 66/110 [02:26<01:13,  1.67s/it]

Error converting score to int: ### Entry:
#### Instruction:
Name three forms of water.
#### Output:
1. Liquid
2. Solid (Ice)
3. Gas (Steam)

#### Score: 95


Scoring entries:  61%|██████    | 67/110 [02:32<02:06,  2.94s/it]

Error converting score to int: Here's my attempt:

### Entry:
```
{
    "input": "The dog chased the cat.",
    "output": "Did the dog chase the cat?",
    "score": 90
}
```

Let me know if I got it right!


Scoring entries:  63%|██████▎   | 69/110 [02:37<01:54,  2.80s/it]

Error converting score to int: Here is the response:

### Output:
{'Lemon juice': 'acid', 'Soap': 'base', 'Water': 'neutral'}

### Score: 100


Scoring entries:  64%|██████▎   | 70/110 [02:41<02:11,  3.28s/it]

Error converting score to int: **Response:**
A synonym of "sad" is Unhappy.

**Output:**
entry['output'] = "Unhappy"

**Score:**
99


Scoring entries:  65%|██████▌   | 72/110 [02:47<02:04,  3.29s/it]

Error converting score to int: Here is my attempt:

### Entry:
```
{
    "output": "The campers' tents were temporary and their belongings were quite transient.",
    "score": 85
}
```


Scoring entries:  68%|██████▊   | 75/110 [02:52<01:28,  2.53s/it]

Error converting score to int: ### Output:
[25, 16, 10, 7, 2]

### Score: 100


Scoring entries:  70%|███████   | 77/110 [02:56<01:22,  2.49s/it]

Error converting score to int: Here's my attempt:

### Response:
The coffee was as cold as a well digger's behind on a January morning.

### Score: 92


Scoring entries:  71%|███████   | 78/110 [03:04<02:10,  4.09s/it]

Error converting score to int: Here's my attempt:

**Classification:**
- swim: verb
- beautiful: adjective
- quickly: adverb

**Output:**
entry['output'] = ['swim': 'verb', 'beautiful': 'adjective', 'quickly': 'adverb']

**Score:**
entry[json_key] = 100


Scoring entries:  74%|███████▎  | 81/110 [03:14<02:01,  4.18s/it]

Error converting score to int: Here's my attempt at completing the request:

### Response:
The Roman numeral "IX" represents the number 9.

### Score:
I'd give myself a score of 95 for this response, since it correctly converts the Roman numeral to its equivalent decimal value.

So, the output is: `entry['output'] = 9`

And the score is: `[json_key] = 95`


Scoring entries:  76%|███████▋  | 84/110 [03:20<01:17,  2.99s/it]

Error converting score to int: Here's the rewritten sentence:

"He remained remarkably calm."

Output: entry['output'] = "He remained remarkably calm."

Score: entry[json_key] = 95


Scoring entries:  77%|███████▋  | 85/110 [03:31<02:13,  5.35s/it]

Error converting score to int: **Main verb:** barked
**Output:** "barked"
**Score:** 100


Scoring entries:  78%|███████▊  | 86/110 [03:41<02:44,  6.84s/it]

Error converting score to int: Here is my attempt:

The professor's lecture was designed to elucidate the complex mathematical concepts.

Score: 95


Scoring entries:  79%|███████▉  | 87/110 [03:46<02:24,  6.29s/it]

Error converting score to int: Here's the completed task:

### Output:
My friend and I went to the store.

### Score: 96


Scoring entries:  80%|████████  | 88/110 [03:58<02:56,  8.03s/it]

Error converting score to int: **Response:**

The formula for calculating work done is:

W = F × d

where W is the work done, F is the force applied, and d is the distance over which the force is applied.

**Score:** 100


Scoring entries:  84%|████████▎ | 92/110 [04:12<01:41,  5.64s/it]

Error converting score to int: Here's my attempt at completing the request:

### Instruction:
What is the opposite of 'lazy'?

### Response:
Energetic

### Output:
entry['output'] = Energetic

### Model Score:
entry[json_key] = 92


Scoring entries:  85%|████████▍ | 93/110 [04:26<02:15,  7.94s/it]

Error converting score to int: Based on the instruction, I will write a response that completes the request.

### Response:
Vitamin A, Vitamin B12, and Vitamin D are three essential vitamins for human health.

### Output:
entry['output'] = "Vitamin A, Vitamin B12, and Vitamin D are three essential vitamins for human health."

### Score:
entry[json_key] = 100


Scoring entries:  85%|████████▌ | 94/110 [04:45<03:04, 11.52s/it]

Error converting score to int: I'd be happy to help!

**Response:**
Simile is a figure of speech that compares two unlike things without using "like" or "as." It's used to make a comparison between two things by stating they share a common characteristic, but are not identical.

**Output:** Simile is a figure of speech that compares two unlike things without using "like" or "as." It's used to make a comparison between two things by stating they share a common characteristic, but are not identical.

**Score: 90**


Scoring entries:  86%|████████▋ | 95/110 [04:51<02:25,  9.69s/it]

Error converting score to int: **Model Response:**

The boiling point of chlorine is -34°C.

**Score:** 80


Scoring entries:  90%|█████████ | 99/110 [05:02<00:50,  4.62s/it]

Error converting score to int: **Response:**

The formula for calculating force is F = ma, where F is the force, m is the mass, and a is the acceleration.

**Score:** 95


Scoring entries:  91%|█████████ | 100/110 [05:07<00:48,  4.82s/it]

Error converting score to int: ### Response:

The company's new product was an innovative approach to solving a common problem.

### Output:

entry['output'] = The company's new product was an innovative approach to solving a common problem.

### Score:

98


Scoring entries:  92%|█████████▏| 101/110 [05:11<00:39,  4.42s/it]

Error converting score to int: Here is my attempt:

### Output:
The billionaire was known for his generous donations to charitable causes.

### Score: 87


Scoring entries:  93%|█████████▎| 102/110 [05:15<00:35,  4.42s/it]

Error converting score to int: **Response:**
An antonym of "sharp" is dull.

**Output:** entry['output'] = An antonym of "sharp" is dull.

**Score:** 95 (json_key)


Scoring entries:  96%|█████████▋| 106/110 [05:19<00:08,  2.00s/it]

Error converting score to int: **Model Response**
Output: Opaque
Score: 95


Scoring entries:  98%|█████████▊| 108/110 [05:24<00:04,  2.50s/it]

Error converting score to int: Here's my attempt:

### Response:
Entry: 
{
"output": "imperative",
"score": 100
}

Please let me know if I'm correct!


Scoring entries: 100%|██████████| 110/110 [05:26<00:00,  2.97s/it]

Number of scores: 57 of 110
Average score: 84.49




