# Model evaluation notebook

In [4]:
# Jupyter version
import ipywidgets as widgets
from IPython.display import display

# Candidate model names; the same options are repeated in the MODEL form field below.
model_list = [
  "gemma2:2b",
  "gemma2:9b",
  "gemma3:12b",
  "mistral:7b",
  "mixtral:8x7b",
  "phi:2.7b",
  "phi4:14b",
  "deepseek-r1:7b",
]
# NOTE(review): in the cells visible here this widget is never passed to display()
# and its value is never read, so it does not influence MODEL -- confirm intent.
model_picker = widgets.Dropdown(options=model_list)

# Google Colab version: the trailing `# @param` annotation is a Colab form-field marker
# and has to stay on the assignment line unchanged.
MODEL = 'mistral:7b' # @param ["gemma2:2b", "gemma2:9b", "gemma3:12b", "mistral:7b", "mixtral:8x7b", "phi:2.7b", "phi4:14b", "deepseek-r1:7b"] {allow-input: true}
# Echo the selected model as the cell output.
MODEL

'mistral:7b'

## Setup

### Install Ollama

#### Google Colab

In [5]:
# Google Colab dependencies from: https://github.com/5aharsh/collama
# Refresh the apt package index, install pciutils, then run the Ollama install script.
# NOTE(review): the last command pipes a remotely fetched script straight into `sh`.
!sudo apt update
!sudo apt install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh

[33m0% [Working][0m            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 261 kB in 2s (171 kB/s)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
35 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acq

#### Docker image
An ephemeral environment can be prepared with:

```shell
docker run -v "${PWD}":/home/jovyan/work -it --rm -p 10000:8888 quay.io/jupyter/scipy-notebook:2025-03-14
```

If running on the Jupyter Docker image, run:
```shell
docker exec -it --user=0 <docker_container_name> sh -c 'apt update && apt install -y pciutils'
```

##### Install Ollama
```shell
docker exec -it --user=0 <docker_container_name> sh -c 'curl -fsSL https://ollama.com/install.sh | sh'
```

#### Prepare model

In [6]:
# Code taken from https://github.com/5aharsh/collama
import threading
import subprocess
import time
import socket

MAX_WAIT_SECONDS = 60
OLLAMA_PORT = 11434

def is_ollama_ready(port=None, host='localhost', timeout=1.0):
  """Report whether a TCP server accepts connections on the Ollama port.

  Args:
    port: TCP port to probe. Defaults to the module-level OLLAMA_PORT.
    host: Host name or IPv4 address to probe.
    timeout: Seconds to wait for the connection attempt before giving up.

  Returns:
    True if a TCP connection could be established, False otherwise.
  """
  if port is None:
    port = OLLAMA_PORT
  # The context manager closes the socket on every path, so repeated polling
  # does not leak file descriptors.
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.settimeout(timeout)
    try:
      s.connect((host, port))
    except OSError:
      # Refused, timed out or unresolvable: the server is not reachable (yet).
      # KeyboardInterrupt is deliberately not caught here.
      return False
    return True


def start_ollama_thread():
  """Start `ollama serve` in the background and wait for it to accept connections.

  Polls is_ollama_ready() once per second for up to MAX_WAIT_SECONDS while
  printing an in-place "elapsed/limit" counter.

  Returns:
    True if the server became reachable before the deadline, False otherwise
    (a warning is printed in that case).
  """
  def run_ollama_serve():
    # Popen does not wait for the child, so this thread finishes right away
    # while the server process keeps running.
    subprocess.Popen(["ollama", "serve"])

  thread = threading.Thread(target=run_ollama_serve)
  thread.start()

  print("Waiting for OLLama to be ready...         ")

  ready = False
  for sec in range(MAX_WAIT_SECONDS):
    ready = is_ollama_ready()
    if ready:
      break
    # Nine backspaces rewind over the previous 9-character "NNNs/NNNs" counter.
    print("\b" * 9 + "{:3d}s/{:3d}s".format(sec + 1, MAX_WAIT_SECONDS), end='', flush=True)
    time.sleep(1)
  else:
    # The loop sleeps after its last probe, so check once more before giving up.
    ready = is_ollama_ready()
  print()

  if not ready:
    # Make the failure visible here instead of letting later cells fail obscurely.
    print("WARNING: Ollama did not become ready within {}s".format(MAX_WAIT_SECONDS))
  return ready

# Only launch a server when nothing is already accepting connections on the Ollama port.
if not is_ollama_ready():
  start_ollama_thread()

In [7]:
# Download the selected model; IPython expands $MODEL from the Python variable of that name.
!ollama pull "$MODEL"

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling ff82381e2bea: 100% ▕▏ 4.1 GB                         [K
pulling 43070e2d4e53: 100% ▕▏  11 KB                         [K
pulling 491dfa501e59: 100% ▕▏  801 B                         [K
pulling ed11eda7790d: 100% ▕▏   30 B                         [K
pulling 42347cd80dc8: 100% ▕▏  485 B                         [K
verifying sha256 digest [K
writing manifest [K
success [K[?25h[?2026l


### Prepare code

In [8]:
from getpass import getpass

def get_secret(prompt, secret_name, secret_input=True):
  """Fetch a secret from Google Colab user data, or prompt for it as a fallback.

  Args:
    prompt: Text shown when the value has to be typed in.
    secret_name: Key looked up through google.colab.userdata.get().
    secret_input: When True the fallback prompt uses getpass() (hidden input);
      when False it uses input().

  Returns:
    The stored secret if the lookup yields a value, otherwise what the user typed.
  """
  result = None
  try:
    # Imported lazily; presumably only importable when running on Colab.
    from google.colab import userdata
    result = userdata.get(secret_name)
  except Exception:
    # Not on Colab, or the lookup failed: fall back to prompting. Unlike a bare
    # `except:`, this lets KeyboardInterrupt and SystemExit propagate.
    result = None

  # Explicit check instead of `assert`, which is stripped when Python runs with -O.
  if result is None:
    result = getpass(prompt) if secret_input else input(prompt)
  return result


In [9]:
import os

if not os.path.exists('src'):
  import urllib

  user = get_secret('User name: ', 'TFM_GH_USER')
  password = get_secret('Password: ', 'TFM_GH_TOKEN')
  password = urllib.parse.quote(password) # your password is converted into url format
  repopath = "tfm-smp-2025/fine-tuning"

  !git clone https://"$user":"$password"@github.com/"$repopath" src

  del password

In [25]:
# Update code, if needed: pull the latest commits into the existing src checkout.
!cd src && git pull

Already up to date.


#### Dependencies

In [11]:
!pip install -q -r src/requirements.txt

##### NLP model

In [12]:
# Download the spaCy `en_core_web_md` model package.
# NOTE(review): the installer output advises restarting the kernel/runtime before loading it.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


#### Credentials

##### Result pusher setup

In [13]:
import os

if (
    (not os.path.exists(os.path.expanduser("~/.ssh/id_rsa")))
    or (not os.path.exists(os.path.expanduser("~/.ssh/known_hosts")))
):
  SSHKEY = get_secret('Result pusher SSH key: ', 'TFM_SSH_PUSHER_KEY')

  !mkdir ~/.ssh

  # Read locally with `cat ~/.ssh/result-pusher|tr '\n' '$';echo`
  with open(os.path.expanduser("~/.ssh/id_rsa"), 'wt') as f:
    f.write(SSHKEY.replace('$', '\n'))

  !chmod 0600 ~/.ssh/id_rsa
  !ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
  !chmod 0600 ~/.ssh/id_rsa.pub

  # This won't copy the client key (not needed), but it will initialize the server's on the client
  !ssh-copy-id -i ~/.ssh/id_rsa -o StrictHostKeyChecking=accept-new result-pusher@kb.tfm.codigoparallevar.com

  del SSHKEY

##### Patch SPARQL wrapper to use KB_PASSWORD

In [14]:
import os
# Ask for KB_PASSWORD only when it is not already set in the environment.
if not os.getenv('KB_PASSWORD'):
  os.environ['KB_PASSWORD'] = get_secret('Password: ', 'TFM_KB_ADMIN_PASS')

# No `!export` is needed: each `!` command runs in its own short-lived shell, so an
# export made there is lost immediately, while values stored in os.environ are
# inherited by every process started from this kernel.

In [15]:
# Patch the installed SPARQLWrapper in place so requests built via _createRequest()
# carry an HTTP Basic Authorization header.
# NOTE(review): the target path is hard-coded to a Python 3.11 dist-packages directory, and
# each replacement text still contains its search pattern, so re-running this cell patches
# the file again.
# Step 1: add `import base64, os` ahead of the `from urllib.request import` line.
!sed 's/from urllib.request import/import base64, os\nfrom urllib.request import/' \
   -i '/usr/local/lib/python3.11/dist-packages/SPARQLWrapper/Wrapper.py'
# Step 2: after `request = self._createRequest()`, add an Authorization header built from
# user "admin" and the KB_PASSWORD environment variable.
!sed 's/request = self._createRequest()/request = self._createRequest()\n        base64string = base64.b64encode("{}:{}".format("admin", os.environ["KB_PASSWORD"]).encode())\n        request.add_header("Authorization", "Basic " + base64string.decode())/' \
   -i '/usr/local/lib/python3.11/dist-packages/SPARQLWrapper/Wrapper.py'

##### Setup WEAVIATE API KEY

In [16]:
import os
# Ask for WEAVIATE_API_KEY only when it is not already set in the environment.
if not os.getenv('WEAVIATE_API_KEY'):
  os.environ['WEAVIATE_API_KEY'] = get_secret('Password: ', 'TFM_VECTOR_DB_ADMIN_APIKEY')

# No `!export` is needed: each `!` command runs in its own short-lived shell, so an
# export made there is lost immediately, while values stored in os.environ are
# inherited by every process started from this kernel.

### Pull datasets

In [17]:
# Run the dataset download script from the cloned repository.
!python3 src/scripts/pull_datasets.py

qald-9 | Unified dataset...
  ✔ qald-9 unified dataset present

beastiary | Unified dataset...
  ✔ beastiary unified dataset present

VQuAnDA | Split dataset...
  ✔  Train file present
  ✔  Test file present

LC-QuAD 1.0 | Split dataset...
  ✔  Train file present
  ✔  Test file present

LC-QuAD 2.0 | Split dataset...
  ✔  Train file present
  ✔  Test file present

WebQuestions SP | Unified dataset...
  ↓  Downloading WebQuestions SP dataset
  ✔  WebQuestions SP dataset ready



## Run evaluation

In [None]:
# Guard against an empty model selection before launching any run.
assert MODEL != ''

# Start the Ollama server again if it is not reachable at this point.
if not is_ollama_ready():
  start_ollama_thread()

import tqdm
# Launch 10 separate evaluation processes of one sample each, passing the loop index as
# --sample-offset; IPython substitutes $MODEL and $idx from the Python namespace.
# NOTE(review): the saved output shows the same question for consecutive runs -- confirm
# that --sample-offset is honoured by src.src.
for idx in tqdm.tqdm(range(10), desc='Running tests'):
  !time WEAVIATE_HOST=kb.tfm.codigoparallevar.com \
      python3 -m src.src \
      --seed 42 \
      test --models="$MODEL" \
      --sparql-server 'http://kb.tfm.codigoparallevar.com' \
      --sample-offset "$idx" \
      --sample 1 \
      --dataset 'LC-QuAD 1.0'


Running tests:   0%|          | 0/10 [00:00<?, ?it/s]

2025-04-27T17:51:38.168918 info:	 [1mStarting operation[0m
  0% 0/1000 [00:00<?, ?it/s]2025-04-27T17:51:38.189022 [(mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0]	INFO:	 [1mEntering context: (mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0[0m
---------- 8< ---------- DATA
{
    "parent": "Root context",
    "parameters": {
        "translator": {
            "model_name": "mistral:7b"
        },
        "question": [
            "Which architect of Marine Corps Air Station Kaneohe Bay was also tenant of New Sanno...
            " SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Marine_Corps_Air_Station_...
            null
        ],
        "dataset": {
            "name": "LC-QuAD 1.0",
            "sparql_endpoint": "dbpedia_2016_04"
        },
        "id": "1f671dd7-6ccc-44f2-a114-12198b8221a6"
    }
}
---------- >8 ----------
2025-04-27T17:51:38.189279 [(mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0]	INFO:	 [1mInput: Which 

Running tests:  10%|█         | 1/10 [01:02<09:22, 62.48s/it]

2025-04-27T17:52:40.577107 info:	 [1mStarting operation[0m
  0% 0/1000 [00:00<?, ?it/s]2025-04-27T17:52:40.597799 [(mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0]	INFO:	 [1mEntering context: (mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0[0m
---------- 8< ---------- DATA
{
    "parent": "Root context",
    "parameters": {
        "translator": {
            "model_name": "mistral:7b"
        },
        "question": [
            "Which architect of Marine Corps Air Station Kaneohe Bay was also tenant of New Sanno...
            " SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Marine_Corps_Air_Station_...
            null
        ],
        "dataset": {
            "name": "LC-QuAD 1.0",
            "sparql_endpoint": "dbpedia_2016_04"
        },
        "id": "1c7ec971-ad42-4cc4-8bca-84ab49ef685b"
    }
}
---------- >8 ----------
2025-04-27T17:52:40.598038 [(mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0]	INFO:	 [1mInput: Which 

Running tests:  20%|██        | 2/10 [02:04<08:17, 62.15s/it]

^C
2025-04-27T17:53:42.767030 info:	 [1mStarting operation[0m
  0% 0/1000 [00:00<?, ?it/s]2025-04-27T17:53:42.787719 [(mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0]	INFO:	 [1mEntering context: (mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0[0m
---------- 8< ---------- DATA
{
    "parent": "Root context",
    "parameters": {
        "translator": {
            "model_name": "mistral:7b"
        },
        "question": [
            "Which architect of Marine Corps Air Station Kaneohe Bay was also tenant of New Sanno...
            " SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Marine_Corps_Air_Station_...
            null
        ],
        "dataset": {
            "name": "LC-QuAD 1.0",
            "sparql_endpoint": "dbpedia_2016_04"
        },
        "id": "b8f0d1a2-4346-44ba-b36f-839431a7ccb7"
    }
}
---------- >8 ----------
2025-04-27T17:53:42.787977 [(mistral:7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0]	INFO:	 [1mLeaving co

## Upload results

In [19]:
# Copy the experiment logs to the result-pusher account on the remote host.
# rsync flags: -H keep hard links, -P show progress and keep partial files, -r recurse,
# -z compress in transit; --mkpath creates missing destination path components.
!rsync -HPrz --mkpath \
  src/experiment-viewer/logs/ \
  result-pusher@kb.tfm.codigoparallevar.com:experiment-viewer/logs

sending incremental file list
.gitignore
             13 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=3/5)
log-2025-04-27 15:56:38.713768.jsonl
        558,318 100%  532.45MB/s    0:00:00 (xfr#2, to-chk=2/5)
log-2025-04-27 16:07:15.584164.jsonl
      5,677,526 100%   55.25MB/s    0:00:00 (xfr#3, to-chk=1/5)
log-2025-04-27 17:32:01.417951.jsonl
        550,931 100%    5.25MB/s    0:00:00 (xfr#4, to-chk=0/5)


## Cleanup

### Stop kernel

In [20]:
# Disabled by default; uncomment the next line to stop the kernel when finished.
# exit(0)