# Model evaluation notebook

In [22]:
# Model selection: pick which Ollama model tag the rest of the notebook uses.
#
# Jupyter version: build a dropdown listing the candidate models.
# NOTE(review): in the cells visible here the widget is never passed to
# `display()` and its value is never read; `MODEL` below is what later cells use.
import ipywidgets as widgets
from IPython.display import display

# Candidate model tags. The `@param` option list further down repeats exactly
# the same entries, so both lists have to be edited together.
model_list = [
  "gemma2:2b",
  "gemma2:9b",
  "gemma3:12b",
  "mistral:7b",
  "mixtral:8x7b",
  "phi:2.7b",
  "phi4:14b",
  "deepseek-r1:7b",
]
model_picker = widgets.Dropdown(options=model_list)

# Google Colab version: the trailing `# @param` annotation is read by Colab to
# render this assignment as a form field; on plain Jupyter it is just a comment
# and the literal on the left is used as-is.
MODEL = 'mixtral:8x7b' # @param ["gemma2:2b", "gemma2:9b", "gemma3:12b", "mistral:7b", "mixtral:8x7b", "phi:2.7b", "phi4:14b", "deepseek-r1:7b"] {allow-input: true}
# Bare expression so the cell output echoes the selected model.
MODEL

'mixtral:8x7b'

In [23]:
# Start time: print the wall-clock time at the beginning of the run so it can
# be compared with the finish time printed in the "Cleanup" section.
!date

Sun Apr 27 09:46:18 PM UTC 2025


## Setup

### Install Ollama

#### Google Colab

In [3]:
# Google Colab dependencies, taken from: https://github.com/5aharsh/collama
# Refresh the apt package index and install `pciutils`.
!sudo apt update
!sudo apt install -y pciutils
# Install Ollama with the upstream install script, unless an `ollama` binary is
# already on the PATH (keeps the cell cheap to re-run).
!sh -c 'if which ollama; then echo Ollama already installed; else curl -fsSL https://ollama.com/install.sh | sh; fi'

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
35 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubunt

#### Docker image
An ephemeral environment can be prepared with the following command:

```shell
docker run -v "${PWD}":/home/jovyan/work -it --rm -p 10000:8888 quay.io/jupyter/scipy-notebook:2025-03-14
```

If running on the Jupyter Docker image, install the `pciutils` dependency first:
```shell
docker exec -it --user=0 <docker_container_name> sh -c 'apt update && apt install -y pciutils'
```

##### Install Ollama
```shell
docker exec -it --user=0 <docker_container_name> sh -c 'curl -fsSL https://ollama.com/install.sh | sh'
```

#### Prepare model

In [24]:
# Code taken from https://github.com/5aharsh/collama
import threading
import subprocess
import time
import socket

# Upper bound, in seconds, on how long `start_ollama_thread` polls for the server.
MAX_WAIT_SECONDS = 60
# TCP port probed on localhost by `is_ollama_ready`.
OLLAMA_PORT = 11434

def is_ollama_ready():
  """Report whether something accepts TCP connections on the Ollama port.

  Opens an IPv4 TCP connection to ``localhost:OLLAMA_PORT`` and closes it
  right away. Nothing is sent, so this only proves that a listener exists.

  Returns:
    True if the connection was established, False if it could not be.
  """
  # The context manager closes the socket on every path, so polling this
  # function once per second does not accumulate open descriptors.
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
    try:
      probe.connect(('localhost', OLLAMA_PORT))
    except OSError:
      # Connection refused, timeout, name-resolution failure, ...: not ready.
      # Only OSError is caught so that Ctrl-C still interrupts the wait loop.
      return False
    return True


def start_ollama_thread():
  """Launch ``ollama serve`` in the background and wait for it to accept connections.

  The server process is spawned from a helper thread. Readiness is then polled
  with `is_ollama_ready` once per second, for at most ``MAX_WAIT_SECONDS``
  seconds, while an in-place ``Ns/Ms`` counter is printed.

  Returns:
    True if the server became reachable within the time budget, False
    otherwise. On timeout a warning line is printed so the failure is visible
    in the cell output instead of passing silently.
  """
  def run_ollama_serve():
    # Popen returns immediately; the server keeps running as a child process.
    subprocess.Popen(["ollama", "serve"])

  thread = threading.Thread(target=run_ollama_serve)
  thread.start()

  print("Waiting for OLLama to be ready...         ")

  ready = False
  for sec in range(MAX_WAIT_SECONDS):
    ready = is_ollama_ready()
    if ready:
      break
    # The backspaces rewind over the previously printed counter so that it is
    # updated in place rather than appended.
    print("\b\b\b\b\b\b\b\b\b" + "{:3d}s/{:3d}s".format(sec + 1, MAX_WAIT_SECONDS), end='', flush=True)
    time.sleep(1)
  else:
    # The loop ended without a break: check once more so that a server which
    # came up during the final sleep is not reported as missing.
    ready = is_ollama_ready()
  print()

  if not ready:
    print("Ollama did not become ready within {}s".format(MAX_WAIT_SECONDS))
  return ready

# Only spawn a server when nothing answers on the Ollama port yet, so that
# re-running this cell does not try to start a second one.
if not is_ollama_ready():
  start_ollama_thread()

Waiting for OLLama to be ready...         
  1s/ 60s


In [25]:
# Download the selected model; IPython substitutes the Python variable `MODEL`
# into the shell command.
!ollama pull "$MODEL"

[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling f2dc41fa964b: 100% ▕▏  26 GB                         [K
pulling 53d74de0d84c: 100% ▕▏   84 B                         [K
pulling 43070e2d4e53: 100% ▕▏  11 KB                         [K
pulling ed11eda7790d: 100% ▕▏   30 B                         [K
pulling deae14c19dac: 100% ▕▏  486 B                         [K
verifying sha256 digest [K
writing manifest [K
success [K[?25h[?2026l
[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling f2dc41fa964b: 100% ▕▏  26 GB                         [K
pulling 53d74de0d84c: 100

### Prepare code

In [26]:
from getpass import getpass

def get_secret(prompt, secret_name, secret_input=True):
  """Fetch a secret from Google Colab's secret store, or ask the user for it.

  Args:
    prompt: Text shown when falling back to interactive input.
    secret_name: Name of the secret looked up through ``google.colab.userdata``.
    secret_input: When True (default) the fallback uses ``getpass`` so the
      typed value is not echoed; when False it uses plain ``input``.

  Returns:
    The value returned by Colab when it is not None, otherwise whatever the
    user typed at the prompt.
  """
  result = None
  try:
    from google.colab import userdata
    result = userdata.get(secret_name)
  except Exception:
    # Not running on Colab (ImportError) or the lookup itself failed: fall
    # back to prompting. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit still propagate.
    result = None

  # Explicit check instead of `assert`, which is removed under `python -O`.
  if result is None:
    result = getpass(prompt) if secret_input else input(prompt)
  return result


In [27]:
import os

if not os.path.exists('src'):
  import urllib

  user = get_secret('User name: ', 'TFM_GH_USER')
  password = get_secret('Password: ', 'TFM_GH_TOKEN')
  password = urllib.parse.quote(password) # your password is converted into url format
  repopath = "tfm-smp-2025/fine-tuning"

  !git clone https://"$user":"$password"@github.com/"$repopath" src

  del password

Cloning into 'src'...
remote: Enumerating objects: 674, done.[K
remote: Counting objects: 100% (256/256), done.[K
remote: Compressing objects: 100% (158/158), done.[K
remote: Total 674 (delta 172), reused 166 (delta 95), pack-reused 418 (from 1)[K
Receiving objects: 100% (674/674), 142.30 KiB | 818.00 KiB/s, done.
Resolving deltas: 100% (429/429), done.


In [28]:
# Update code, if needed: pull the latest commits into the existing `src`
# checkout (prints "Already up to date." when there is nothing new).
!cd src && git pull

Already up to date.
Already up to date.


#### Dependencies

In [29]:
# Install the Python packages listed by the cloned project (`-q` keeps pip quiet).
# NOTE(review): `%pip` would tie the install to the kernel's own interpreter;
# consider switching if the kernel and the shell `pip` can differ.
!pip install -q -r src/requirements.txt

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/442.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.3/442.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/565.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.8/223.8 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m93.4 MB/s[0m eta [36m0:00:00[0m
[?25h

##### NLP model

In [30]:
try:
  import spacy
  spacy.load("en_core_web_md")
except:
  !python -m spacy download en_core_web_md

#### Credentials

##### Result pusher setup

In [31]:
import os

if (
    (not os.path.exists(os.path.expanduser("~/.ssh/id_rsa")))
    or (not os.path.exists(os.path.expanduser("~/.ssh/known_hosts")))
):
  SSHKEY = get_secret('Result pusher SSH key: ', 'TFM_SSH_PUSHER_KEY')

  !mkdir ~/.ssh

  # Read locally with `cat ~/.ssh/result-pusher|tr '\n' '$';echo`
  with open(os.path.expanduser("~/.ssh/id_rsa"), 'wt') as f:
    f.write(SSHKEY.replace('$', '\n'))

  !chmod 0600 ~/.ssh/id_rsa
  !ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
  !chmod 0600 ~/.ssh/id_rsa.pub

  # This won't copy the client key (not needed), but it will initialize the server's on the client
  !ssh-copy-id -i ~/.ssh/id_rsa -o StrictHostKeyChecking=accept-new result-pusher@kb.tfm.codigoparallevar.com

  del SSHKEY

##### Patch SPARQL wrapper to use KB_PASSWORD

In [32]:
import os
# Expose the knowledge-base admin password as the KB_PASSWORD environment
# variable, which the patched SPARQLWrapper reads when building its
# Authorization header. An already-set value is left untouched.
if not os.getenv('KB_PASSWORD'):
  os.environ['KB_PASSWORD'] = get_secret('Password: ', 'TFM_KB_ADMIN_PASS')

# No `!export KB_PASSWORD=...` line is needed: each `!` command runs in its own
# short-lived shell, so an export there is lost as soon as that shell exits,
# while values placed in `os.environ` are inherited by every process the kernel
# starts afterwards.

In [33]:
# Patch the installed SPARQLWrapper so every request carries HTTP Basic
# credentials ("admin" plus the KB_PASSWORD environment variable).
# The package is located through the import system instead of a hard-coded
# `python3.11/dist-packages` path, so the patch also works on other Python
# versions and on the Docker image.
import importlib.util
import os

# Text that is only present in an already patched Wrapper.py.
AUTH_PATCH_MARKER = 'base64string = base64.b64encode('


def patch_wrapper_source(source):
  """Return the text of SPARQLWrapper's Wrapper.py with Basic auth injected.

  Two textual substitutions are applied:
    * ``import base64, os`` is inserted before ``from urllib.request import``;
    * right after ``request = self._createRequest()`` two lines are added that
      build a Basic ``Authorization`` header from "admin" and
      ``os.environ["KB_PASSWORD"]``.

  Args:
    source: Current contents of Wrapper.py.

  Returns:
    The patched text. Input that already contains the patch is returned
    unchanged, so applying the function repeatedly is safe.
  """
  if AUTH_PATCH_MARKER in source:
    return source
  source = source.replace(
      'from urllib.request import',
      'import base64, os\nfrom urllib.request import',
  )
  source = source.replace(
      'request = self._createRequest()',
      'request = self._createRequest()\n'
      '        base64string = base64.b64encode("{}:{}".format("admin", os.environ["KB_PASSWORD"]).encode())\n'
      '        request.add_header("Authorization", "Basic " + base64string.decode())',
  )
  return source


# `find_spec` on the top-level package locates it without importing it.
sparql_spec = importlib.util.find_spec('SPARQLWrapper')
if sparql_spec is None or not sparql_spec.submodule_search_locations:
  print('SPARQLWrapper is not installed; nothing to patch')
else:
  wrapper_path = os.path.join(list(sparql_spec.submodule_search_locations)[0], 'Wrapper.py')
  with open(wrapper_path, 'rt', encoding='utf-8') as f:
    wrapper_source = f.read()
  wrapper_patched = patch_wrapper_source(wrapper_source)
  if AUTH_PATCH_MARKER not in wrapper_patched:
    # Neither anchor matched: the installed version probably differs from the
    # one this patch was written for.
    print('Could not find the expected code in {}; patch NOT applied'.format(wrapper_path))
  elif wrapper_patched != wrapper_source:
    with open(wrapper_path, 'wt', encoding='utf-8') as f:
      f.write(wrapper_patched)

##### Setup WEAVIATE API KEY

In [34]:
import os
# Expose the vector-database API key as the WEAVIATE_API_KEY environment
# variable (presumably read by the evaluation script; confirm in the project
# code). An already-set value is left untouched.
if not os.getenv('WEAVIATE_API_KEY'):
  # The prompt names the secret actually requested: an API key, not a password.
  os.environ['WEAVIATE_API_KEY'] = get_secret('Weaviate API key: ', 'TFM_VECTOR_DB_ADMIN_APIKEY')

# No `!export WEAVIATE_API_KEY=...` line is needed: each `!` command runs in its
# own short-lived shell, so an export there is lost as soon as that shell exits,
# while values placed in `os.environ` are inherited by every process the kernel
# starts afterwards.

### Pull datasets

In [35]:
# Fetch the benchmark datasets with the project's helper script. Judging by
# the recorded output, files that are already present are not downloaded again.
!python3 src/scripts/pull_datasets.py

qald-9 | Unified dataset...
  ↓  Downloading qald-9 dataset
  ✔  qald-9 dataset ready

beastiary | Unified dataset...
  ↓  Downloading beastiary dataset
  ✔  beastiary dataset ready

VQuAnDA | Split dataset...
  ↓  Downloading train file
  ✔  Train file ready
  ↓  Downloading test file
  ✔  Test file ready

LC-QuAD 1.0 | Split dataset...
  ↓  Downloading train file
  ✔  Train file ready
  ↓  Downloading test file
  ✔  Test file ready

LC-QuAD 2.0 | Split dataset...
  ↓  Downloading train file
  ✔  Train file ready
  ↓  Downloading test file
  ✔  Test file ready

WebQuestions SP | Unified dataset...
  ↓  Downloading WebQuestions SP dataset
  ✔  WebQuestions SP dataset ready

qald-9 | Unified dataset...
  ✔ qald-9 unified dataset present

beastiary | Unified dataset...
  ✔ beastiary unified dataset present

VQuAnDA | Split dataset...
  ✔  Train file present
  ✔  Test file present

LC-QuAD 1.0 | Split dataset...
  ✔  Train file present
  ✔  Test file present

LC-QuAD 2.0 | Split dataset..

## Run evaluation

In [36]:
# Prepare profiler: install `python-flamegraph` straight from GitHub; the
# evaluation cell runs it as `python3 -m flamegraph`.
# NOTE(review): the install is not pinned to a tag or commit, so different
# runs may pick up different profiler versions.
!pip install -q git+https://github.com/nickodell/python-flamegraph.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flamegraph (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [37]:
# Refuse to start a long run with an empty model selection.
assert MODEL != ''

# Make sure the Ollama server is reachable (it may have stopped since setup).
if not is_ollama_ready():
  start_ollama_thread()

# Run the project's evaluation script under the flamegraph profiler, which
# writes its samples to perf.log, and time the whole command with `time`.
# The command fixes the seed to 42, passes `--sample 100`, selects the
# 'LC-QuAD 1.0' dataset, evaluates the model in `MODEL`, and points both the
# SPARQL endpoint and WEAVIATE_HOST at kb.tfm.codigoparallevar.com.
!time WEAVIATE_HOST=kb.tfm.codigoparallevar.com \
    python3 \
    -m flamegraph -o perf.log \
    src/src/as_script.py \
    --seed 42 \
    test --models="$MODEL" \
    --sparql-server 'http://kb.tfm.codigoparallevar.com' \
    --sample 100 \
    --dataset 'LC-QuAD 1.0'


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

> Did Timm Gunn guest in Sunrise (HIMYM)?

Let's reason step by step. Identify the nouns on the query, skip the ones that can be solved by a SPARQL verb (ignore, for example, "count" or "number of"), and output a json list like this.

```json
[
    "entity1",
    "entity2",
    ...
    "entityN"
]
```[0m
---------- 8< ---------- DATA
{
    "type": "get_entities_in_query",
    "input": "Extract the nouns from this natural language query.\n\n> Did Timm Gunn guest in Sun...
}
---------- >8 ----------
2025-04-27T21:55:40.478538 [[Q-63](mixtral:8x7b on Ollama + prompt & search) DATASET: LC-QuAD 1.0]	INFO:	 [1mLLM response: 
[
    "Timm Gunn",
    "Sunrise",
    "HIMYM"
]

Here, "Timm Gunn" and "Sunrise" are proper nouns referring to specific entities, while "HIMYM" is an abbreviation for a TV show.[0m
---------- 8< ---------- DATA
{
    "type": "get_entities_in_query",
    "input": "Extract the nouns from this natural lang

## Upload results

### Time profile

In [38]:
import os
import time
# Upload the profiler output, if the evaluation produced one.
if os.path.exists('perf.log'):
  # Rename with the current Unix timestamp; presumably so that uploads from
  # different runs do not overwrite each other on the server.
  new_perf_path = 'perf-{}.log'.format(int(time.time()))
  os.rename('perf.log', new_perf_path)
  # rsync flags: -H keep hard links, -P show progress and keep partial files,
  # -r recurse, -z compress; --mkpath creates missing destination directories.
  !rsync -HPrz --mkpath \
    "$new_perf_path" \
    result-pusher@kb.tfm.codigoparallevar.com:experiment-viewer/perfs

sending incremental file list
perf-1745790371.log
      1,942,284 100%  910.53MB/s    0:00:00 (xfr#1, to-chk=0/1)
sending incremental file list
perf-1745791224.log
      1,665,320 100%    1.52GB/s    0:00:00 (xfr#1, to-chk=0/1)


### Logs

In [39]:
# Copy the contents of the local experiment-viewer log directory to the
# result server (the trailing slash on the source sends the directory's
# contents rather than the directory itself).
!rsync -HPrz --mkpath \
  src/experiment-viewer/logs/ \
  result-pusher@kb.tfm.codigoparallevar.com:experiment-viewer/logs

sending incremental file list
.gitignore
             13 100%    0.00kB/s    0:00:00               13 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=1/3)
log-2025-04-27 21:07:30.100512.jsonl
      2,044,339 100%  389.93MB/s    0:00:00 (xfr#2, to-chk=0/3)
sending incremental file list
.gitignore
             13 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=2/4)
log-2025-04-27 21:07:30.100512.jsonl
      2,044,339 100%    1.90GB/s    0:00:00 (xfr#2, to-chk=1/4)
log-2025-04-27 21:46:33.788215.jsonl
      1,404,420 100%  223.23MB/s    0:00:00 (xfr#3, to-chk=0/4)


## Cleanup

In [40]:
# Finish time: print the wall-clock time at the end of the run, to compare
# with the start time printed near the top of the notebook.
!date

Sun Apr 27 09:46:18 PM UTC 2025
Sun Apr 27 10:00:31 PM UTC 2025


### Stop kernel

In [41]:
# exit(0)