# Model evaluation notebook

In [1]:
# Jupyter version
import ipywidgets as widgets
from IPython.display import display

model_list = [
  "gemma2:2b",
  "gemma2:9b",
  "gemma3:12b",
  "mistral:7b",
  "mixtral:8x7b",
  "phi:2.7b",
  "phi4:14b",
  "deepseek-r1:7b",
]
model_picker = widgets.Dropdown(options=model_list)

# GColab version
MODEL = 'phi4:14b' # @param ["gemma2:2b", "gemma2:9b", "gemma3:12b", "mistral:7b", "mixtral:8x7b", "phi:2.7b", "phi4:14b", "deepseek-r1:7b"] {allow-input: true}
MODEL

'phi4:14b'

In [2]:
trainset = "1746474294"
checkpoint = "855"
quantization = "Q6_K"

In [3]:
# Start time
!date

Thu May  8 03:45:28 PM UTC 2025


## Setup

### Install Ollama

#### Google Colab

In [4]:
# Google Collab dependencies from: https://github.com/5aharsh/collama
!sudo apt update
!sudo apt install -y pciutils
!sh -c 'if which ollama; then echo Ollama already installed; else curl -fsSL https://ollama.com/install.sh | sh; fi'

[33m0% [Working][0m            Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,665 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,926 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,

#### Docker image
An ephemeral environment can be prepared with.

```shell
docker run -v "${PWD}":/home/jovyan/work -it --rm -p 10000:8888 quay.io/jupyter/scipy-notebook:2025-03-14
```

If on Jupyter Docker image, run
```shell
docker exec -it --user=0 <docker_container_name> sh -c 'apt update && apt install -y pciutils'
```

##### Install Ollama
```shell
docker exec -it --user=0 <docker_container_name> sh -c 'curl -fsSL https://ollama.com/install.sh | sh'
```

#### Prepare model

In [5]:
# Code taken from https://github.com/5aharsh/collama
import threading
import subprocess
import time
import socket

MAX_WAIT_SECONDS = 60
OLLAMA_PORT = 11434

def is_ollama_ready():
  s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  try:
    s.connect(('localhost', OLLAMA_PORT))
    s.shutdown(socket.SHUT_RDWR)
    return True
  except:
    return False


def start_ollama_thread():
  def run_ollama_serve():
    subprocess.Popen(["ollama", "serve"])

  thread = threading.Thread(target=run_ollama_serve)
  thread.start()

  print("Waiting for OLLama to be ready...         ")

  for sec in range(MAX_WAIT_SECONDS):
    if is_ollama_ready():
      break
    print("\b\b\b\b\b\b\b\b\b" + "{:3d}s/{:3d}s".format(sec + 1, MAX_WAIT_SECONDS), end='', flush=True)
    time.sleep(1)
  print()

if not is_ollama_ready():
  start_ollama_thread()

Waiting for OLLama to be ready...         
  1s/ 60s


### Prepare code

In [6]:
from getpass import getpass

def get_secret(prompt, secret_name, secret_input=True):
  try:
    from google.colab import userdata
    result = userdata.get(secret_name)
    assert result is not None
  except:
    if secret_input:
      result = getpass(prompt)
    else:
      result = input(prompt)
  return result


In [7]:
import os

if not os.path.exists('src'):
  import urllib

  user = get_secret('User name: ', 'TFM_GH_USER')
  password = get_secret('Password: ', 'TFM_GH_TOKEN')
  password = urllib.parse.quote(password) # your password is converted into url format
  repopath = "tfm-smp-2025/fine-tuning"

  !git clone https://"$user":"$password"@github.com/"$repopath" src

  del password

Cloning into 'src'...
remote: Enumerating objects: 804, done.[K
remote: Counting objects: 100% (144/144), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 804 (delta 110), reused 96 (delta 70), pack-reused 660 (from 1)[K
Receiving objects: 100% (804/804), 148.47 KiB | 18.56 MiB/s, done.
Resolving deltas: 100% (552/552), done.


In [8]:
# Update code, if needed
!cd src && git pull

Already up to date.


#### Dependencies

In [9]:
!pip install -q -r src/requirements.txt

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/442.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.3/442.3 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/565.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/223.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.8/223.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

##### NLP model

In [10]:
try:
  import spacy
  spacy.load("en_core_web_md")
except:
  !python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


#### Credentials

##### Result pusher setup

In [11]:
import os

if (
    (not os.path.exists(os.path.expanduser("~/.ssh/id_rsa")))
    or (not os.path.exists(os.path.expanduser("~/.ssh/known_hosts")))
):
  SSHKEY = get_secret('Result pusher SSH key: ', 'TFM_SSH_PUSHER_KEY')

  !mkdir ~/.ssh

  # Read locally with `cat ~/.ssh/result-pusher|tr '\n' '$';echo`
  with open(os.path.expanduser("~/.ssh/id_rsa"), 'wt') as f:
    f.write(SSHKEY.replace('$', '\n'))

  !chmod 0600 ~/.ssh/id_rsa
  !ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
  !chmod 0600 ~/.ssh/id_rsa.pub

  # This won't copy the client key (not needed), but it will initialize the server's on the client
  !ssh-copy-id -i ~/.ssh/id_rsa -o StrictHostKeyChecking=accept-new result-pusher@kb.tfm.codigoparallevar.com

  del SSHKEY

/usr/bin/ssh-copy-id: INFO: Source of key(s) to be installed: "/root/.ssh/id_rsa.pub"
/usr/bin/ssh-copy-id: INFO: attempting to log in with the new key(s), to filter out any that are already installed

		(if you think this is a mistake, you may want to use -f option)



##### Patch SPARQL wrapper to use KB_PASSWORD

In [12]:
import os
if not os.getenv('KB_PASSWORD'):
  os.environ['KB_PASSWORD'] = get_secret('Password: ', 'TFM_KB_ADMIN_PASS')

!export KB_PASSWORD=$KB_PASSWORD

In [13]:
!sed 's/from urllib.request import/import base64, os\nfrom urllib.request import/' \
   -i '/usr/local/lib/python3.11/dist-packages/SPARQLWrapper/Wrapper.py'
!sed 's/request = self._createRequest()/request = self._createRequest()\n        base64string = base64.b64encode("{}:{}".format("admin", os.environ["KB_PASSWORD"]).encode())\n        request.add_header("Authorization", "Basic " + base64string.decode())/' \
   -i '/usr/local/lib/python3.11/dist-packages/SPARQLWrapper/Wrapper.py'

##### Setup WEAVIATE API KEY

In [14]:
import os
if not os.getenv('WEAVIATE_API_KEY'):
  os.environ['WEAVIATE_API_KEY'] = get_secret('Password: ', 'TFM_VECTOR_DB_ADMIN_APIKEY')

!export WEAVIATE_API_KEY=$WEAVIATE_API_KEY

#### Pull fine-tuned model

In [15]:
# !ollama pull "$MODEL"

In [16]:
!rsync -HPrz --mkpath \
  result-pusher@kb.tfm.codigoparallevar.com:fine-tuning/fine-tuned/peft-kbs-summary-training-"$trainset"/checkpoint-"$checkpoint"/loadable/unsloth."$quantization".gguf \
    fine-tune/unsloth."$quantization".gguf


receiving incremental file list
created 1 directory for fine-tune
unsloth.Q6_K.gguf
 12,030,257,888 100%   10.29MB/s    0:18:35 (xfr#1, to-chk=0/1)


In [17]:
with open("fine-tune/Modelfile", "wt") as f:
  f.write('FROM ./unsloth.' + str(quantization) + '''.gguf

TEMPLATE {{ if .System }}<|im_start|>system<|im_sep|>{{ .System }}<|im_end|>{{ end }}{{ if .Prompt }}<|im_start|>user<|im_sep|>{{ .Prompt }}<|im_end|><|im_start|>assistant<|im_sep|>{{ end }}{{ .Response }}<|im_end|>
PARAMETER stop <|im_start|>
PARAMETER stop <|im_sep|>
PARAMETER stop <|im_end|>
PARAMETER stop <|im_file_sep|>
PARAMETER stop <|im_start|>user<|im_sep|>
''')

In [18]:
!ollama create phi4-ft -f fine-tune/Modelfile

[?2026h[?25l[1Ggathering model components ⠙ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠹ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠸ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠼ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠼ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠦ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠧ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠇ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠏ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠋ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠙ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠹ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠸ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠸ [K[?25h[?2026l[?2026h[?25l[1Ggathering model components ⠴ [K[?25h[?2026l[?2026h[?25l[1Ggathering model compon

### Pull datasets

In [19]:
!python3 src/scripts/pull_datasets.py

qald-9 | Unified dataset...
  ↓  Downloading qald-9 dataset
  ✔  qald-9 dataset ready

beastiary | Unified dataset...
  ↓  Downloading beastiary dataset
  ✔  beastiary dataset ready

VQuAnDA | Split dataset...
  ↓  Downloading train file
  ✔  Train file ready
  ↓  Downloading test file
  ✔  Test file ready

LC-QuAD 1.0 | Split dataset...
  ↓  Downloading train file
  ✔  Train file ready
  ↓  Downloading test file
  ✔  Test file ready

LC-QuAD 2.0 | Split dataset...
  ↓  Downloading train file
  ✔  Train file ready
  ↓  Downloading test file
  ✔  Test file ready

WebQuestions SP | Unified dataset...
  ↓  Downloading WebQuestions SP dataset
  ✔  WebQuestions SP dataset ready



## Run evaluation

In [20]:
with open("src/infra/models.txt", "rt") as f:
  model_ready = "phi4-ft" in f.read()

if not model_ready:
  with open("src/infra/models.txt", "at") as f:
    f.write("\nphi4-ft")

In [21]:
!cd src && git diff infra/models.txt

[1mdiff --git a/infra/models.txt b/infra/models.txt[m
[1mindex 4d24249..a14c25f 100644[m
[1m--- a/infra/models.txt[m
[1m+++ b/infra/models.txt[m
[36m@@ -6,3 +6,5 @@[m [mmixtral:8x7b[m
 phi:2.7b[m
 phi4:14b[m
 deepseek-r1:7b[m
[32m+[m
[32m+[m[32mphi4-ft[m
\ No newline at end of file[m


In [22]:
# Prepare profiler
!pip install -q git+https://github.com/nickodell/python-flamegraph.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flamegraph (setup.py) ... [?25l[?25hdone


In [28]:
assert MODEL != ''

if not is_ollama_ready():
  start_ollama_thread()

!time WEAVIATE_HOST=kb.tfm.codigoparallevar.com MAX_STEP_TOKENS=1000 \
    python3 \
    -m flamegraph -o perf.log \
    src/src/as_script.py \
    --seed 42 \
    test --models="phi4-ft" \
    --sparql-server 'http://kb.tfm.codigoparallevar.com' \
    --sample 30 \
    --dataset 'LC-QuAD 1.0'


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
### Subject: <http://dbpedia.org/resource/U.S._Route_191> ; Predicate: <http://dbpedia.org/ontology/routeEndDirection>

```sparql
SELECT DISTINCT ?object WHERE { <http://dbpedia.org/resource/U.S._Route_191> <http://dbpedia.org/ontology/routeEndDirection> ?object }
```

### Subject: <http://dbpedia.org/resource/U.S._Route_191> ; Predicate: <http://dbpedia.org/ontology/routeStart>

```sparql
SELECT DISTINCT ?object WHERE { <http://dbpedia.org/resource/U.S._Route_191> <http://dbpedia.org/ontology/routeStart> ?object }
```

### Subject: <http://dbpedia.org/resource/U.S._Route_191> ; Predicate: <http://dbpedia.org/ontology/routeJunction>

```sparql
SELECT DISTINCT ?object WHERE { <http://dbpedia.org/resource/U.S._Route_191> <http://dbpedia.org/ontology/routeJunction> ?object }
```

### Subject: <http://dbpedia.org/resource/U.S._Route_191> ; Predicate: <http://dbpedia.org/ontology/routeStartDirection>

```sparql
SELECT DISTINCT

## Upload results

In [30]:
import os
import time
if os.path.exists('perf.log'):
  new_perf_path = 'perf-{}.log'.format(int(time.time()))
  os.rename('perf.log', new_perf_path)
  !rsync -HPrz --mkpath \
    "$new_perf_path" \
    result-pusher@kb.tfm.codigoparallevar.com:experiment-viewer/perfs

sending incremental file list
perf-1746726715.log
      1,699,593 100%  794.80MB/s    0:00:00 (xfr#1, to-chk=0/1)


In [31]:
!rsync -HPrz --mkpath \
  src/experiment-viewer/logs/ \
  result-pusher@kb.tfm.codigoparallevar.com:experiment-viewer/logs

sending incremental file list
.gitignore
             13 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=2/4)
log-2025-05-08 16:07:23.101906.jsonl
        151,255 100%  144.25MB/s    0:00:00 (xfr#2, to-chk=1/4)
log-2025-05-08 16:40:19.190241.jsonl
      7,454,003 100%  710.87MB/s    0:00:00 (xfr#3, to-chk=0/4)


## Cleanup

In [32]:
# Finish time
!date

Thu May  8 05:52:04 PM UTC 2025


### Stop kernel

In [27]:
# exit(0)