<a href="https://colab.research.google.com/github/younes2808/Sci2XML/blob/main/evaluation/classifier/Code/ClassifierBenchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classifier Benchmarking

>[Classifier Benchmarking](#scrollTo=yttrH5Ep2aS8)

>>[Setup](#scrollTo=Xr6MR20x2mDT)

>>>[Models](#scrollTo=CaXMPmtBe-t8)

>>>>[InternVL](#scrollTo=xQDEVObsfP7u)

>>>>[Bunny](#scrollTo=FcrIadU8fdXY)

>>>>[PaliGemma2](#scrollTo=4VU5IBcZazcc)

>>>>[ML](#scrollTo=WnPx5UC3imcx)

>>>>[Moondream](#scrollTo=cjnQC0Ffz43B)

>>>[Create VLM instance](#scrollTo=ogwEM4kmavC8)

>>[Benchmark](#scrollTo=jXfP25jsSdCu)

>>>[Classify Figures](#scrollTo=0aOXwhso27rs)

>>>[Parse results](#scrollTo=wFj2rBIR3CC7)

>>>[Start benchmarking](#scrollTo=pA8BXO8U3HmC)

>>[Formula classification](#scrollTo=tui2X8-yA12A)

>>>[Formula regex evaluation](#scrollTo=wAdDJyplC8Qq)



Example:
1. Choose which model to benchmark
    * Ex: PaliGemma2
    * Run the cells under the sub header "PaliGemma2"
    * Uncomment the line in the cell "Create VLM instance" which loads the PaliGemma2 model:
    ```
    classifiermodel = load_vlm_pali()
    ```
    * Uncomment the lines in the cell "Classify Figures" which set the query and call the VLM for PaliGemma2:
    ```
    ## Pali:
    query = "answer no Which ... "

    result = call_vlm_pali(image, query)
    ```
    * In the cell "Start benchmarking", set testName to what you want, and set the last argument in the call to benchmark_figures() to "VLM"
2. Run the benchmarking
    * Run the cells "Classify Figures", "Parse results" and "Start benchmarking"
    * The result file will be saved at the given path.
3. Prerequisites
    * Must have google drive mounted, with path to dataset of figures.
    * Must be connected to T4 GPU
    * For PaliGemma2: must have access token.
    * For ML model: must have modelfile uploaded.

## Setup

### Models

#### InternVL

In [None]:
# Code from: https://huggingface.co/OpenGVLab/InternVL2_5-2B
!pip install lmdeploy
!pip install transformers==4.47.1
!pip install bitsandbytes

import nest_asyncio
nest_asyncio.apply()

from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

import os
import time
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

In [None]:
def load_vlm_intern():
  """Create the lmdeploy pipeline for the InternVL2.5-2B model.

  Returns:
    The object returned by lmdeploy's ``pipeline`` for
    'OpenGVLab/InternVL2_5-2B', configured with a Turbomind engine whose
    session length is 8192.
  """
  print("\n--- Loading VLM ---")
  engine_config = TurbomindEngineConfig(session_len=8192)
  return pipeline('OpenGVLab/InternVL2_5-2B', backend_config=engine_config)


In [None]:
def call_vlm_intern(pipe, image, query):
  """Ask the InternVL pipeline a question about one image.

  Args:
    pipe: Pipeline object as returned by load_vlm_intern().
    image: Image reference handed to lmdeploy's ``load_image``.
    query: Prompt text sent together with the image.

  Returns:
    The ``text`` attribute of the pipeline response.
  """
  print("\n- Calling VLM -")
  loaded_image = load_image(image)
  return pipe((query, loaded_image)).text

#### Bunny

In [None]:
# Code from: https://huggingface.co/BAAI/Bunny-v1_0-3B
!pip install transformers==4.47.1
!pip install bitsandbytes

import nest_asyncio
nest_asyncio.apply()

from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

import os
import time
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

In [None]:
def load_vlm_bunny():
  """Load the Bunny-v1.0-3B model and tokenizer into module-level globals.

  The loaded objects are stored in the globals ``model``, ``tokenizer`` and
  ``device``, which call_vlm_bunny() reads. Note that ``model`` is the same
  global name that load_vlm_pali() assigns, so loading one replaces the other.

  The device is 'cuda' when a GPU is available and 'cpu' otherwise; float16
  weights are requested on GPU and float32 on CPU.

  Returns:
    An empty string. The model itself is only reachable through the globals.
  """
  # disable some warnings
  transformers.logging.set_verbosity_error()
  transformers.logging.disable_progress_bar()
  warnings.filterwarnings('ignore')

  global model, tokenizer, device

  # set device: fall back to CPU instead of assuming a GPU is attached
  use_cuda = torch.cuda.is_available()
  device = 'cuda' if use_cuda else 'cpu'
  torch.set_default_device(device)

  # create model
  model = AutoModelForCausalLM.from_pretrained(
      'BAAI/Bunny-v1_0-3B',
      torch_dtype=torch.float16 if use_cuda else torch.float32,
      device_map='auto',
      trust_remote_code=True)
  tokenizer = AutoTokenizer.from_pretrained(
      'BAAI/Bunny-v1_0-3B',
      trust_remote_code=True)

  return ""

In [None]:
def call_vlm_bunny(image, query):
  """Query the Bunny model (loaded by load_vlm_bunny) about one image.

  Reads the module-level globals ``model``, ``tokenizer`` and ``device``.

  Args:
    image: Path (or file object) accepted by PIL's ``Image.open``.
    query: Question inserted into the chat template after the image marker.

  Returns:
    The decoded, stripped text of the newly generated tokens.
  """
  # Chat template with an <image> marker where the picture is spliced in.
  text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, concise, and one-word answers to the user's questions. USER: <image>\n{query} ASSISTANT:"

  # Tokenize the text on either side of the marker and join the two parts
  # with the id -200 in place of the marker.
  token_chunks = [tokenizer(piece).input_ids for piece in text.split('<image>')]
  joined_ids = token_chunks[0] + [-200] + token_chunks[1]
  input_ids = torch.tensor(joined_ids, dtype=torch.long).unsqueeze(0).to(device)

  # Preprocess the picture with the model's own image pipeline.
  picture = Image.open(image)
  image_tensor = model.process_images([picture], model.config).to(dtype=model.dtype, device=device)

  # generate
  generated = model.generate(
      input_ids,
      images=image_tensor,
      max_new_tokens=100,
      use_cache=True,
      repetition_penalty=1.0 # increase this to avoid chattering
  )
  output_ids = generated[0]

  # Drop the prompt tokens so that only the answer is decoded.
  answer_ids = output_ids[input_ids.shape[1]:]
  return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

#### PaliGemma2

In [None]:
!pip install transformers==4.47.1
!pip install bitsandbytes

Collecting transformers==4.47.1
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed transformers-4.47.1
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_

In [None]:
# Code from: https://huggingface.co/google/paligemma2-3b-pt-224
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig
from PIL import Image
import requests
import torch
def load_vlm_pali():
  """Load the 4-bit quantized PaliGemma2 model and its processor.

  The objects are stored in the module-level globals ``model`` and
  ``processor``, which call_vlm_pali() reads.

  Returns:
    An empty string. The model itself is only reachable through the globals.
  """
  global model, processor

  checkpoint_name = "google/paligemma2-3b-pt-224"

  # 4-bit NF4 quantization with bfloat16 compute.
  quantization = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )

  model = PaliGemmaForConditionalGeneration.from_pretrained(
      checkpoint_name,
      quantization_config=quantization,
      device_map={"": 0}
  )
  model = model.to("cuda")

  processor = AutoProcessor.from_pretrained(checkpoint_name)
  return ""


In [None]:
def call_vlm_pali(image, query):
  """Query the PaliGemma2 model (loaded by load_vlm_pali) about one image.

  Reads the module-level globals ``model`` and ``processor``.

  Args:
    image: Path (or file object) accepted by PIL's ``Image.open``.
    query: Question text; it is prefixed with the ``<image>`` marker.

  Returns:
    The decoded text of the generated tokens that follow the prompt.
  """
  prompt = f"<image> {query}"

  # Convert inside a context manager so the underlying file is closed as soon
  # as the RGB copy has been made.
  with Image.open(image) as opened:
    raw_image = opened.convert("RGB")

  # Pass text and image by keyword: relying on the positional order ties the
  # call to the parameter order of the installed transformers release.
  inputs = processor(text=prompt, images=raw_image, return_tensors="pt").to("cuda")
  output = model.generate(**inputs, max_new_tokens=200)

  # The generated sequence starts with the prompt tokens; decode only the rest.
  input_len = inputs["input_ids"].shape[-1]
  return processor.decode(output[0][input_len:], skip_special_tokens=True)

#### ML

In [None]:
!pip install -U skorch

Collecting skorch
  Downloading skorch-1.1.0-py3-none-any.whl.metadata (11 kB)
Downloading skorch-1.1.0-py3-none-any.whl (228 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.9/228.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: skorch
Successfully installed skorch-1.1.0


In [None]:
# Code from: https://www.kaggle.com/code/sunedition/classification-of-graphs

## Change n_classes, f_params, class_names

from skorch import NeuralNetClassifier
import torch.nn as nn
import torch
import multiprocessing as mp
from skorch.dataset import ValidSplit
from skorch.callbacks import LRScheduler, Checkpoint
from skorch.callbacks import Freezer, EarlyStopping
import torchvision

def load_ml(f_params='best_model_densenet169_sentence_epoch20.pkl', n_classes=9):
  """Build the DenseNet169-based skorch classifier and load saved parameters.

  Args:
    f_params: Path of the saved parameter file passed to ``load_params``.
      Defaults to the file name this notebook was written for.
    n_classes: Number of output classes of the classifier head. Must match
      the checkpoint and the ``class_names`` list used in call_ml().

  Returns:
    The initialized ``NeuralNetClassifier`` with the parameters loaded.
  """
  print("\n--- Loading ML ---")

  batch_size = 128
  num_workers = mp.cpu_count()

  # callback functions for models

  # DenseNet169
  # callback for Reduce on Plateau scheduler
  lr_scheduler = LRScheduler(policy='ReduceLROnPlateau',
                             factor=0.5, patience=1)
  # callback for saving the best on validation accuracy model
  checkpoint = Checkpoint(f_params='best_model_densenet169.pkl',
                          monitor='valid_acc_best')
  # callback for freezing all layer of the model except the last layer
  freezer = Freezer(lambda x: not x.startswith('model.classifier'))
  # callback for early stopping
  early_stopping = EarlyStopping(patience=5)

  class DenseNet169(nn.Module):
    """DenseNet169 backbone whose classifier is replaced by a 3-layer MLP head."""

    def __init__(self, output_features, num_units=512, drop=0.5,
                 num_units1=512, drop1=0.5):
      super().__init__()
      # NOTE(review): newer torchvision releases prefer the `weights=`
      # argument over `pretrained=` -- confirm against the installed version.
      model = torchvision.models.densenet169(pretrained=True)
      n_inputs = model.classifier.in_features
      model.classifier = nn.Sequential(
          nn.Linear(n_inputs, num_units),
          nn.ReLU(),
          nn.Dropout(p=drop),
          nn.Linear(num_units, num_units1),
          nn.ReLU(),
          nn.Dropout(p=drop1),
          nn.Linear(num_units1, output_features))
      self.model = model

    def forward(self, x):
      return self.model(x)

  # NeuralNetClassifier for based on DenseNet169 with custom parameters
  densenet = NeuralNetClassifier(
      # pretrained DenseNet169 + custom classifier
      module=DenseNet169,
      module__output_features=n_classes,
      # criterion
      criterion=nn.CrossEntropyLoss,
      # batch_size = 128
      batch_size=batch_size,
      # number of epochs to train
      max_epochs=5,
      # optimizer Adam used
      optimizer=torch.optim.Adam,
      optimizer__lr = 0.001,
      optimizer__weight_decay=1e-6,
      # shuffle dataset while loading
      iterator_train__shuffle=True,
      # load in parallel
      iterator_train__num_workers=num_workers,
      # stratified kfold split of loaded dataset
      train_split=ValidSplit(cv=5, stratified=True),
      # callbacks declared earlier
      callbacks=[lr_scheduler, checkpoint, freezer, early_stopping],
      # use GPU or CPU
      device="cuda:0" if torch.cuda.is_available() else "cpu"
  )

  densenet.initialize()  # Initialize the model before loading parameters
  # Load the saved model
  densenet.load_params(f_params=f_params)
  return densenet


In [None]:
from PIL import Image
import albumentations as A
import numpy as np
import time
import os

def call_ml(model, image):
  """Classify one image with the skorch DenseNet model.

  Args:
    model: Object with a ``predict`` method, as returned by load_ml().
    image: Path (or file object) accepted by PIL's ``Image.open``.

  Returns:
    The class name (one entry of the nine-element label list below) whose
    index is the first element of the model's prediction.
  """
  labels = ['just_image', 'bar_chart', 'diagram', 'flow_chart', 'graph',
            'growth_chart', 'pie_chart', 'table', 'text_sentence']
  side = 224

  # Read the file and force three colour channels.
  rgb_picture = Image.open(image).convert("RGB")

  # Resize, normalize and convert to a tensor.
  preprocess = A.Compose([
      A.Resize(side, side),
      A.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
      A.pytorch.transforms.ToTensorV2()
  ])
  tensor = preprocess(image=np.array(rgb_picture))["image"]

  # Add the batch dimension, then move to GPU when one is available.
  batch = tensor.unsqueeze(0)
  if torch.cuda.is_available():
    target_device = "cuda:0"
  else:
    target_device = "cpu"
  batch = batch.to(target_device)

  prediction = model.predict(batch)
  predicted_class_name = labels[prediction[0]]

  print(f"Predicted class: {predicted_class_name}")
  return predicted_class_name

#### Moondream

In [None]:
!apt-get install -y libvips
!pip install pyvips
!pip install --upgrade transformers

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'libvips42' instead of 'libvips'
The following additional packages will be installed:
  apparmor firefox fonts-droid-fallback fonts-noto-mono fonts-urw-base35 ghostscript gsfonts
  imagemagick-6-common libcgif0 libfftw3-double3 libfuse3-3 libgail-common libgail18 libgs9
  libgs9-common libgsf-1-114 libgsf-1-common libgsl27 libgslcblas0 libgtk2.0-0 libgtk2.0-bin
  libgtk2.0-common libidn12 libijs-0.35 libimagequant0 libjbig2dec0 liblqr-1-0 liblzo2-2
  libmagickcore-6.q16-6 libmatio11 libopenslide0 libpoppler-glib8 librsvg2-common nip2 poppler-data
  snapd squashfs-tools systemd-hwe-hwdb udev
Suggested packages:
  apparmor-profiles-extra apparmor-utils fonts-noto fonts-freefont-otf | fonts-freefont-ttf
  fonts-texgyre ghostscript-x libfftw3-bin libfftw3-dev fuse3 gsl-ref-psdoc | gsl-doc-pdf
  | gsl-doc-info | gsl-ref-html gvfs libmagickcore-6.q16-6-extra libvips-doc libvips-to

In [None]:
# Code from: https://github.com/vikhyat/moondream

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from io import BytesIO

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_vlm_moondream():
  """Load the Moondream2 model and tokenizer at one pinned revision.

  Both the model and the tokenizer are requested at the same revision so the
  two always come from the same snapshot of the repository. This also matters
  because ``trust_remote_code=True`` runs code from that repository.

  Returns:
    A ``(model, tokenizer)`` tuple; the model has been put in eval mode and is
    mapped to 'cuda' when a GPU is available, otherwise to 'cpu'.
  """
  model_id = "vikhyatk/moondream2"
  revision = "2025-01-09"

  # Load Moondream2 model
  print("Loading Moondream2 model...")
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      revision=revision,
      trust_remote_code=True,
      device_map={"": "cuda" if torch.cuda.is_available() else "cpu"}
  ).eval()

  tokenizer = AutoTokenizer.from_pretrained(
      model_id,
      revision=revision,
      trust_remote_code=True)
  print("Moondream2 model loaded successfully!")

  return model, tokenizer

In [None]:
from PIL import Image

def call_vlm_moondream(model, image, prompt):
  """Ask the Moondream model a question about one image.

  Args:
    model: Object exposing ``query(image, prompt)`` that returns a mapping
      with an "answer" entry (as returned by load_vlm_moondream()).
    image: Path (or file object) accepted by PIL's ``Image.open``.
    prompt: Question text.

  Returns:
    The model's answer, or the string "Invalid image file" when the image
    cannot be opened or converted, or "Model query failed" when the query
    raises.
  """
  try:
    # Open and convert inside the try so an unreadable file is reported
    # instead of aborting the caller; the context manager closes the file.
    with Image.open(image) as img:
      rgb_image = img.convert('RGB')
  except Exception as e:
    print(f"Invalid image file: {str(e)}")
    # Do not query the model without a decoded image.
    return "Invalid image file"

  try:
    answer = model.query(rgb_image, prompt)["answer"]
  except Exception as e:
    print(f"Model query failed: {str(e)}")
    answer = "Model query failed"

  return answer

### Create VLM instance

In [None]:
# Select and uncomment desired model:
#
# `classifiermodel` is the object that the "Start benchmarking" cell passes to
# benchmark_figures(). Keep three things in sync with the line chosen here:
#   * the `mode` argument ("VLM" or "ML") in the "Start benchmarking" cell,
#   * the `query = ...` line in the "Classify Figures" cell,
#   * the `result = call_vlm_...` line in the "Classify Figures" cell.
# load_vlm_bunny() and load_vlm_pali() return an empty string; they keep the
# loaded model in module-level globals that their call_vlm_* functions read.

#classifiermodel = load_vlm_intern()
#classifiermodel = load_vlm_bunny()
classifiermodel = load_vlm_pali()
#classifiermodel, tokenizer = load_vlm_moondream()
#classifiermodel = load_ml()


--- Loading ML ---


Downloading: "https://download.pytorch.org/models/densenet169-b2777c0a.pth" to /root/.cache/torch/hub/checkpoints/densenet169-b2777c0a.pth
100%|██████████| 54.7M/54.7M [00:00<00:00, 80.5MB/s]


## Benchmark

### Classify Figures

In [None]:
import nest_asyncio
nest_asyncio.apply()

# lmdeploy is only pip-installed by the InternVL setup cell. Import it only
# when it is available so this cell also runs when another model was chosen
# and the InternVL section was skipped.
try:
    from lmdeploy import pipeline, TurbomindEngineConfig
    from lmdeploy.vl import load_image
except ImportError:
    print("lmdeploy is not installed; call_vlm_intern will be unavailable.")

import os
import time
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

def _expected_class_from_filename(file):
    """Return the file name without extension and digits, e.g. 'chart12.png' -> 'chart'."""
    stem = os.path.splitext(file)[0]
    return "".join(char for char in stem if not char.isdigit())


def benchmark_figures(model, path, mode):
    """Classify every file below `path` and tally how many answers are correct.

    The expected class of a file is taken from its name ('chart3.png' ->
    'chart'). An answer counts as correct when the lower-cased model output
    contains one of the keywords accepted for that class. Files named 'table*'
    and 'other*' share the "Other" bucket.

    Args:
        model: Model object handed to the selected call_* function.
        path: Root directory that is walked recursively with os.walk.
        mode: "VLM" to use the call_vlm_* line selected below, "ML" to use
            call_ml(). Any other value leaves the result empty for every file.

    Returns:
        (metrics, wrongList). `metrics` holds "setting&query", "time" (seconds),
        "totalNR", "totalCorrect" and three "perTypeCorrect*" entries formatted
        as "correct/total". `wrongList` holds a [file, result] pair for every
        misclassified file.
    """
    print("--- Figures ---")
    print("--- Classifying using ", mode, " --")

    ## Class names that the ML use:
    class_names = ['just_image', 'bar_chart', 'diagram', 'flow_chart', 'graph',
                   'growth_chart', 'pie_chart', 'table', 'text_sentence']

    ## We roughly want to classify between Chart, Figure or Other:
    ## expected class -> (metrics key, substrings accepted as a correct answer)
    other_words = ["just_image", "just image", "table", "text_sentence", "text sentence", 'other']
    accepted_answers = {
        "chart": ("perTypeCorrectChart",
                  ['bar_chart', 'bar chart', 'graph', 'pie_chart', 'pie chart']),
        "figure": ("perTypeCorrectFigure",
                   ['flow_chart', 'flow chart', 'growth_chart', 'growth chart', 'figure', 'diagram']),
        "table": ("perTypeCorrectOther", other_words),
        "other": ("perTypeCorrectOther", other_words),
    }

    # The query does not depend on the file, so it is chosen once up front.
    # This also keeps it defined when the dataset folder contains no files.
    query = ""
    if (mode == "VLM"):
        # Select and uncomment desired query:

        ## Old:
        #query = "with one word, classify this as either a chart, figure or other"
        ## Intern and Bunny:
        #query = "with one word, classify this as one of these: [just_image - bar_chart - diagram - flow_chart - graph - growth_chart - pie_chart - table - text_sentence]"
        ## Pali:
        #query = "answer no Which of these classes [just_image, bar_chart, diagram, flow_chart, graph, growth_chart, pie_chart, table, text_sentence] could this image be classified as?\n"
        ## Moondream:
        query = "with one word, classify this as one of these: [just_image, bar_chart, diagram, flow_chart, graph, growth_chart, pie_chart, table, text_sentence]?"

    metrics = {}
    metrics["setting&query"] = f"{mode} / "
    metrics["time"] = 0
    metrics["totalNR"] = 0
    metrics["totalCorrect"] = 0
    metrics["perTypeCorrectChart"] = "0/0"
    metrics["perTypeCorrectFigure"] = "0/0"
    metrics["perTypeCorrectOther"] = "0/0"

    # metrics key -> [correct, total]; rendered into the "c/t" strings above.
    tallies = {
        "perTypeCorrectChart": [0, 0],
        "perTypeCorrectFigure": [0, 0],
        "perTypeCorrectOther": [0, 0],
    }

    wrongList = []

    startTime = time.time()
    for (dirpath, dirnames, filenames) in os.walk(path):
        print(dirpath, dirnames, filenames)
        for file in filenames:
            print(f"\n--------------- {file} ---------------")
            fileClass = _expected_class_from_filename(file)
            print(fileClass)

            print("Running classification on this image...")
            result = ""
            image = os.path.join(dirpath, file)
            if (mode == "VLM"):
                # Select and uncomment desired model:
                #result = call_vlm_intern(model, image, query)
                #result = call_vlm_bunny(image, query)
                #result = call_vlm_pali(image, query)
                result = call_vlm_moondream(model, image, query)
            elif (mode == "ML"):
                result = call_ml(model, image)

            result = result.lower()
            print(f"Classification done... Analyzing result -> {result} <- ...")

            ## Analyze result:
            rule = accepted_answers.get(fileClass)
            if rule is not None:
                metric_key, words = rule
                tally = tallies[metric_key]
                if any(word in result for word in words):
                    print("Hurra")
                    metrics["totalCorrect"] += 1
                    tally[0] += 1
                else:
                    print("Oh no")
                    wrongList.append([file, result])
                tally[1] += 1
                metrics[metric_key] = f"{tally[0]}/{tally[1]}"

            # Files with an unrecognized class name still count towards the total.
            metrics["totalNR"] += 1

    metrics["time"] = time.time() - startTime
    if (mode == "VLM"):
        metrics["setting&query"] += query
    elif (mode == "ML"):
        # Built from class_names so the recorded setting lists all nine classes.
        metrics["setting&query"] += " - ".join(class_names)

    return metrics, wrongList

### Parse results

In [None]:
def handle_results(figureMetrics, wrongList, pathToSave, testName):
    """Format the benchmark metrics as text and write them to a result file.

    The report is written to `pathToSave`/overallResults`testName`.txt. The
    directory is created when it does not exist.

    Args:
        figureMetrics: Metrics dict as returned by benchmark_figures(); must
            contain "totalNR", "totalCorrect" and "time".
        wrongList: Iterable of [file, result] pairs for misclassified files.
        pathToSave: Directory in which the report is stored.
        testName: Name used in the report header and in the file name.
    """
    import os  # imported locally: this cell has no import statement of its own

    print("--- Handle results ---")
    total = figureMetrics['totalNR']
    correct = figureMetrics['totalCorrect']
    elapsed = figureMetrics['time']

    resultString = f"--- Results: {testName}--- \n"

    resultString += f"\n Total elements tested: {total}\n"
    if (total != 0):
        resultString += f"\n Elapsed time: {round(elapsed, 5)} sec -> avg per element: {round(elapsed / total, 5)} sek"
        resultString += f"\n Total correct: {correct} / {total} = {round((correct / total)*100, 2)} %\n"
    else:
        # Nothing was tested: averages are undefined, so avoid dividing by zero.
        resultString += f"\n Elapsed time: {round(elapsed, 5)} sec -> avg per element: unknown"
        resultString += f"\n Total correct: {correct} / {total} = 0 %\n"

    resultString += "\n-- Figures --"
    for metric in figureMetrics.keys():
        print(metric, figureMetrics[metric])
        resultString += f"\n Metric: {metric}: {figureMetrics[metric]}"
    if (total != 0):
        resultString += f"\n -> avg time per image: {elapsed/total} sec"
        resultString += f"\n -> percentage correct: {correct}/{total} = {(correct/total)*100} %"
    else:
        resultString += f"\n -> avg time per image: unknown"
        resultString += f"\n -> percentage correct: 0 %"

    resultString += "\n\n\n-- List of wrongs: --"
    for wrong in wrongList:
        resultString += f"\n{wrong[0]} -> {wrong[1]}"

    # Make sure the target directory exists before writing the report.
    if pathToSave:
        os.makedirs(pathToSave, exist_ok=True)

    # utf-8 so that arbitrary model output in the wrong-list can be written.
    with open(pathToSave + "/" + "overallResults" + testName + ".txt", "w", encoding="utf-8") as file:
        file.write(resultString)
    # File is automatically closed after exiting the 'with' block

### Start benchmarking

In [None]:
import os
import time

def main():
    """Run the figure benchmark in "ML" mode and store the report.

    Uses the module-level `classifiermodel`, benchmarks the figures dataset
    folder on the mounted drive and passes the outcome to handle_results().
    """
    print("Starting...")
    testName = "Densenet-ML-2"
    dataset_dir = "./drive/MyDrive/classifierBenchmarkVLM/dataset/figures"
    results_dir = "./drive/MyDrive/classifierBenchmarkVLM/results/"

    metricResultsFigures, wrongListFigures = benchmark_figures(classifiermodel, dataset_dir, "ML")

    print("HACK: ", metricResultsFigures, wrongListFigures)
    handle_results(metricResultsFigures, wrongListFigures, results_dir, testName)
# Run the benchmark when this cell is executed.
main()

Starter...
--- Figures ---
--- Classifying using  ML  --
./drive/MyDrive/classifierBenchmarkVLM/dataset/figures [] ['figure2.png', 'figure5.png', 'other3.png', 'figure4.png', 'figure7.png', 'other2.png', 'figure6.png', 'figure3.png', 'other1.png', 'chart1.png', 'chart2.png', 'figure1.png', 'table3.png', 'other5.png', 'table2.png', 'figure9.png', 'other6.png', 'table1.png', 'other4.png', 'table4.png', 'figure8.png', 'other9.png', 'other10.png', 'table7.png', 'table9.png', 'other11.png', 'table6.png', 'other7.png', 'table5.png', 'other8.png', 'table10.png', 'other13.png', 'other14.png', 'other12.png', 'chart3.png', 'chart9.png', 'other16.png', 'chart4.png', 'chart7.png', 'figure12.png', 'chart6.png', 'chart8.png', 'figure10.png', 'table12.png', 'table11.png', 'chart5.png', 'figure11.png', 'other15.png', 'chart12.png', 'chart11.png', 'figure13.png', 'table15.png', 'table13.png', 'table14.png', 'chart10.png', 'other17.png', 'chart18.png', 'chart15.png', 'chart16.png', 'table16.png', 'table

## Formula classification

This is not a comparative assessment; it only evaluates whether a regex pattern is able to classify formulas (given as strings).

### Formula regex evaluation

In [None]:
import time
import re
import os

def check_formula(regex):
  """Check whether a string is accepted as a formula by the regex pattern.

  Args:
    regex: The candidate formula string. Despite its name this is the text
      being tested, not a pattern.

  Returns:
    True when the string matches the pattern, False otherwise.
  """
  pattern = r"^(?!\(+$)(?!\)+$).{3,}$"
  ## ^ and $ ensures that the whole string matches.
  ## (?!\(+$) is a negative lookahead that rejects strings consisting only of "(".
  ## (?!\)+$) does the same for strings consisting only of ")".
  ## .{3,} matches any character at least three times, and ensures the string is longer than 2 characters.
  if re.match(pattern, regex):
    print("YES: ->", regex)
    return True

  print("NO: ", "Formula: ", "->", regex)
  print("The formula is NOT identified as an actual formula. Aborting...")
  return False

def itereate(path):
    """Run check_formula() on every file below `path`.

    The expected class of a file is its name without extension and digits.

    Args:
        path: Root directory that is walked recursively with os.walk.

    Returns:
        A list of (file, fileClass, result) tuples, where `result` is whatever
        check_formula() returned for that file's content.
    """
    results = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        print("1: ", dirpath, dirnames, filenames)
        if (len(filenames) != 0):
            print("Her")
        for file in filenames:
            print(f"\n-- {file} --")
            stem = os.path.splitext(file)[0]
            fileClass = "".join([char for char in stem if not char.isdigit()])
            print("Testing. This sould be a: ", fileClass)

            print("Running classification on this image...")

            # NOTE(review): assumes each dataset file stores one formula as
            # plain text -- confirm against the dataset layout.
            with open(os.path.join(dirpath, file), "r", encoding="utf-8", errors="replace") as handle:
                formula = handle.read().strip()

            results.append((file, fileClass, check_formula(formula)))
    return results


# Evaluate the formula regex on the formulas dataset folder of the mounted drive.
itereate("./drive/MyDrive/classifierBenchmarkVLM/dataset/formulas")