<a href="https://colab.research.google.com/github/twhool02/ptm-quantization/blob/main/Evaluation_of_original_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Evaluation

This notebook runs evaluation benchmarks on models that will be quantized during later evaluation runs. This is to allow for accurate comparisons.

Evaluation of models is carried out using the [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) from [EleutherAI](https://www.eleuther.ai/).

Models are evaluated on:

* MMLU (5-shot)
* HellaSwag (0-shot)
* BoolQ (0-shot)
* BBH (3-shot)

The number of shots used for MMLU, HellaSwag and BBH is the same as in the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288).




## Setup

### Map Google Drive

In [1]:
import os
import shutil
import subprocess

from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


### Log into HuggingFace Hub

In [2]:
# Required when quantizing models/data that are gated on HuggingFace and required for pushing models to HuggingFace
!pip install -q --upgrade huggingface_hub

import huggingface_hub

print(f"Hugging Face Version is: {huggingface_hub.__version__}")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/388.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m358.4/388.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hHugging Face Version is: 0.22.2


In [3]:
from google.colab import userdata

# Read the Hugging Face access token from the Colab secret named HF_TOKEN
# (per the original author's note, this token has write permissions on Hugging Face)
hftoken = userdata.get('HF_TOKEN')

In [4]:
from huggingface_hub import login

# Log into Hugging Face using the HF_TOKEN secret;
# add_to_git_credential=True also hands the token to the configured git credential helper
login(hftoken, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Install Required Libraries

In [5]:
# The Transformers library provides APIs and tools to easily download and train pretrained model.
!pip install -q -U transformers -q

# Accelerate enables the same PyTorch code to be run across any distributed configuration
!pip install -q -U accelerate -q

# an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems
!pip install sentencepiece -q

# 'bitsandbytes' includes quantization primitives for 8-bit & 4-bit operations
!pip install bitsandbytes -q

# PEFT (Parameter-Efficient Fine-Tuning) is a library for efficiently adapting large pretrained
# models to various downstream applications without fine-tuning all of a model’s parameters
!pip install peft -q

# trl is short for Transformers Reinforcement Learning, it's used for fine-tuning transformer models using Proximal Policy Optimization.
!pip install trl -q

# an extension of Transformers that provides a set of performance optimization tools to train and run models
!pip install -q -U optimum

# used for monitoring the training process.
!pip install -q -U wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [6]:
import transformers
import accelerate

# Report the installed transformers version
print(f"version of transformers: {transformers.__version__}")

# Report the installed accelerate version
print(f"version of accelerate: {accelerate.__version__}")

version of transformers: 4.39.3
version of accelerate: 0.28.0


### Log into Weights and Biases

In [7]:
import wandb

# Read the Weights & Biases API key from the Colab secret named wandb_api and log in with it
wandb_token = userdata.get('wandb_api')
wandb.login(key=wandb_token)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

### Create folder for results

In [8]:
# Root folder on the mounted Google Drive under which evaluation results are kept;
# exist_ok=True makes re-running this cell harmless
results_dir = "/content/drive/MyDrive/Evaluation"
os.makedirs(results_dir, exist_ok=True)

In [9]:
# Install LM-Eval
!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor

Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor
  Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision big-refactor) to /tmp/pip-req-build-5ucrhn25
  Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-5ucrhn25
  Running command git checkout -b big-refactor --track origin/big-refactor
  Switched to a new branch 'big-refactor'
  Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.
  Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 967eb4fa90b80ba4e8cc7a2fd171f44f0e384808
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting evaluate (from lm_eval==1.0.0)
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.

### Import libraries

In [10]:
# Ask TensorFlow, in a separate Python process, which GPU devices it can see
!python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"

2024-04-04 10:53:04.084573: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 10:53:04.084624: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 10:53:04.086748: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [11]:
# Show the metadata pip holds for the installed tensorflow package
!pip show tensorflow

Name: tensorflow
Version: 2.15.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, ml-dtypes, numpy, opt-einsum, packaging, protobuf, setuptools, six, tensorboard, tensorflow-estimator, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: dopamine-rl, tf_keras


In [12]:
# NOTE(review): apart from os, the names imported in this cell are not referenced by the
# cells visible in this notebook - confirm before pruning.

# os is a standard Python library that provides functions for interacting with the operating system.
import os

# torch is the main package of PyTorch, an open-source machine learning library for Python.
import torch

# load_dataset is a function from the datasets library by Hugging Face. It allows you to load and preprocess datasets for machine learning models.
from datasets import load_dataset

# The transformers library is a popular library for Natural Language Processing (NLP). It provides thousands of pre-trained models to perform tasks on texts such as classification, information extraction, summarization, translation, and more.
from transformers import (
    # AutoModelForCausalLM is a class in the transformers library. It represents a model for causal language modeling.
    AutoModelForCausalLM,

    # AutoTokenizer is a class in the transformers library. It is used for converting input data into a format that can be used by the model.
    AutoTokenizer,

    # BitsAndBytesConfig is a configuration class in the transformers library. It is used to configure bitsandbytes quantization.
    BitsAndBytesConfig,

    # HfArgumentParser is a class in the transformers library. It is used for parsing command-line arguments.
    HfArgumentParser,

    # TrainingArguments is a class in the transformers library. It defines the arguments used during training.
    TrainingArguments,

    # pipeline is a high-level function in the transformers library. It creates a pipeline that applies a model to some input data.
    pipeline,

    # logging is a module in the transformers library. It is used for logging events during training and evaluation.
    logging,

    # A generic model class that will be instantiated as one of the model classes of the library
    # (with a question answering head) when created with the from_pretrained() class method or the from_config() class method
    AutoModelForQuestionAnswering
)

# used for Parameter-Efficient Fine-Tuning
from peft import LoraConfig, PeftModel

# import the SFTTrainer class from trl
from trl import SFTTrainer

# allows addition of progress bars to loops and iterable objects
from tqdm import tqdm

### Install lm-eval

In [13]:
!git clone https://github.com/EleutherAI/lm-evaluation-harness
!cd lm-evaluation-harness

Cloning into 'lm-evaluation-harness'...
remote: Enumerating objects: 32695, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 32695 (delta 31), reused 53 (delta 17), pack-reused 32620[K
Receiving objects: 100% (32695/32695), 22.81 MiB | 8.65 MiB/s, done.
Resolving deltas: 100% (22840/22840), done.


In [14]:
import os

# Make the cloned repository the kernel's working directory.
# The path is relative, so re-running this cell from inside the repository will fail.
os.chdir("lm-evaluation-harness")

In [15]:
import os
import glob

# Show where the kernel is currently working
print(f"current directory is: {os.getcwd()}\n")

# Collect every entry of the current directory
files = glob.glob('./*')

# Pair each entry with its last-modification time and order the pairs oldest first
files_with_times = sorted(
    ((entry, os.path.getmtime(entry)) for entry in files),
    key=lambda pair: pair[1],
)

# Print one "name: mtime" line per entry
print("Files in current directory:")
for name, stamp in files_with_times:
    print(f'{name}: {stamp}')

current directory is: /content/lm-evaluation-harness

Files in current directory:
./CITATION.bib: 1712228001.2430425
./README.md: 1712228001.2430425
./CODEOWNERS: 1712228001.2430425
./LICENSE.md: 1712228001.2430425
./docs: 1712228001.2460427
./examples: 1712228001.2460427
./ignore.txt: 1712228001.2460427
./lm_eval: 1712228001.3570511
./pyproject.toml: 1712228001.3580513
./pile_statistics.json: 1712228001.3580513
./requirements.txt: 1712228001.3580513
./mypy.ini: 1712228001.3580513
./setup.py: 1712228001.3590512
./scripts: 1712228001.3590512
./templates: 1712228001.3590512
./tests: 1712228001.3880534


In [16]:
!pip install -r requirements.txt

Obtaining file:///content/lm-evaluation-harness (from -r requirements.txt (line 1))
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting word2number (from lm_eval==0.4.2->-r requirements.txt (line 1))
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lm_eval, word2number
  Building editable for lm_eval (pyproject.toml) ... [?25l[?25hdone
  Created wheel for lm_eval: filename=lm_eval-0.4.2-0.editable-py3-none-any.whl size=16122 sha256=d71f34563e05ce0a663d9b69e679be4a626d9c673dae3c2ca73cbbbd61a08bb3
  Stored in directory: /tmp/pip-ephem-wheel-cache-av5xbxk3/wheels/dc/8d/a0/ce1a137b6a29fcf5007da91566ee423695e01d20703991091d
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Cr

In [17]:
# Import check: fails here if the lm_eval package is not importable in this kernel
from lm_eval import api

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

### lm_eval Help

In [18]:
# Print the command-line options of the installed lm_eval entry point
!lm_eval --help

2024-04-04 10:53:43.449568: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 10:53:43.449619: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 10:53:43.451355: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
usage: lm_eval [-h] [--model MODEL] [--tasks task1,task2] [--model_args MODEL_ARGS]
               [--num_fewshot N] [--batch_size auto|auto:N|N] [--max_batch_size N]
               [--device DEVICE] [--output_path DIR|DIR/file.json] [--limit N|0<N<1]
               [--use_cache DIR] [--cache_requests {true,refresh,delete}] [--check_integrity]
               [--w

## Non Quantized Model Evaluation

### Evaluate Llama2-7b-chat-HF

#### MMLU

5-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [19]:
# Model under evaluation in this section
eval_model = "meta-llama/Llama-2-7b-chat-hf"

# Per-model results folder on Google Drive. The model id contains a "/", so this
# creates a nested <organisation>/<model> folder.
# NOTE(review): the lm_eval commands in this section pass literal relative paths
# (e.g. results_dir_mmlu) to --output_path instead of this variable - confirm where
# results are meant to be stored.
results_dir = os.path.join("/content/drive/MyDrive/Evaluation", eval_model)
os.makedirs(results_dir, exist_ok=True)

In [35]:
# MMLU (5-shot) for Llama-2-7b-chat-hf over the mmlu_stem, mmlu_social_sciences,
# mmlu_humanities and mmlu_other task groups; samples are logged and the run is sent to W&B.
# NOTE(review): do_sample=True is passed in --model_args here but not in every other run of
# this notebook - confirm it is intended, since these runs serve as comparison baselines.
!lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,do_sample=True \
    --tasks mmlu_stem,mmlu_social_sciences,mmlu_humanities,mmlu_other \
    --num_fewshot 5 \
    --device cuda:0 \
    --batch_size 4 \
    --verbosity INFO \
    --output_path results_dir_mmlu \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Llama2-7b-chat-HF-MMLU

2024-04-04 15:40:19.874917: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 15:40:19.874968: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 15:40:19.876911: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_154026-lgvaw8sr[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### HellaSwag

0-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [21]:
!lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True, \
    --tasks hellaswag \
    --num_fewshot 0 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_hellaswag \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Llama2-7b-chat-HF-Hellaswag

2024-04-04 11:07:20.618950: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 11:07:20.618997: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 11:07:20.620496: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_110727-f1p9vo3o[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### BoolQ

0-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [22]:
# BoolQ (0-shot) for Llama-2-7b-chat-hf; samples are logged, the run is sent to W&B and
# request caching is enabled under the results_dir_boolq prefix.
# NOTE(review): do_sample=True is passed in --model_args here but not in every other run of
# this notebook - confirm it is intended, since these runs serve as comparison baselines.
!lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,do_sample=True \
    --tasks boolq \
    --num_fewshot 0 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_boolq \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Llama2-7b-chat-HF-boolq \
    --use_cache results_dir_boolq \
    --cache_requests true \
    --show_config

2024-04-04 11:14:28.760606: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 11:14:28.760660: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 11:14:28.762209: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_111435-wf044pey[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### BBH

3-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [23]:
# BBH (3-shot, bbh_fewshot task group) for Llama-2-7b-chat-hf; samples are logged, the run
# is sent to W&B and request caching is enabled under the results_dir_bbh prefix.
# NOTE(review): do_sample=True is passed in --model_args here but not in every other run of
# this notebook - confirm it is intended, since these runs serve as comparison baselines.
!lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,do_sample=True \
    --tasks bbh_fewshot \
    --num_fewshot 3 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_bbh \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Llama2-7b-chat-HF-bbh \
    --use_cache results_dir_bbh \
    --cache_requests true

2024-04-04 11:16:46.228178: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 11:16:46.228233: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 11:16:46.229720: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_111652-igecq10y[0m
[34m[1mwandb[0m: Run [1m`wandb 

### Evaluate Falcon 7B Instruct

In [24]:
# Model under evaluation in this section
eval_model = "tiiuae/falcon-7b-instruct"

# Per-model results folder on Google Drive. The model id contains a "/", so this
# creates a nested <organisation>/<model> folder.
# NOTE(review): the lm_eval commands in this section pass literal relative paths
# (e.g. results_dir_falcon_mmlu) to --output_path instead of this variable - confirm
# where results are meant to be stored.
results_dir = os.path.join("/content/drive/MyDrive/Evaluation", eval_model)
os.makedirs(results_dir, exist_ok=True)

#### MMLU

5-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

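A revision number should be pinned in the code to ensure a newer version of Falcon is not downloaded automatically; note that the command below does not currently pass a `revision=` value in `--model_args`.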

In [25]:
# MMLU (5-shot) for falcon-7b-instruct over the mmlu_stem, mmlu_social_sciences,
# mmlu_humanities and mmlu_other task groups; samples are logged, the run is sent to W&B
# and request caching is enabled under the results_dir_falcon_mmlu prefix.
# NOTE(review): no revision= value is passed in --model_args, so the model revision is not pinned.
!lm_eval --model hf \
    --model_args pretrained=tiiuae/falcon-7b-instruct,trust_remote_code=True \
    --tasks mmlu_stem,mmlu_social_sciences,mmlu_humanities,mmlu_other \
    --num_fewshot 5 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_falcon_mmlu \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Falcon-7B-instruct-MMLU \
    --use_cache results_dir_falcon_mmlu \
    --cache_requests true \
    --show_config

2024-04-04 12:07:08.704294: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 12:07:08.704353: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 12:07:08.705814: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_120715-yytitm03[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### HellaSwag

0-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [26]:
!lm_eval --model hf \
    --model_args pretrained=tiiuae/falcon-7b-instruct,trust_remote_code=True \
    --tasks hellaswag \
    --num_fewshot 0 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_hellaswag \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Falcon-7B-instruct-Hellaswag \
    --use_cache results_dir_hellaswag \
    --cache_requests true \
    --show_config

2024-04-04 12:45:36.498362: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 12:45:36.498410: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 12:45:36.499883: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_124542-h51frpov[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### BoolQ

0-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [27]:
!lm_eval --model hf \
    --model_args pretrained=tiiuae/falcon-7b-instruct,trust_remote_code=True \
    --tasks boolq \
    --num_fewshot 0 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_boolq \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Falcon-7B-instruct-BoolQ \
    --use_cache results_dir_boolq \
    --cache_requests true \
    --show_config

2024-04-04 12:53:01.782631: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 12:53:01.782685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 12:53:01.784153: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_125308-6seeegad[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### BBH - no score returned

3-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [28]:
# BBH (3-shot, bbh_fewshot task group) for falcon-7b-instruct with a fixed batch size of 2;
# samples are logged, the run is sent to W&B and request caching is enabled under the
# results_dir_falcon_bbh prefix. --write_out is only enabled for this run.
# NOTE(review): do_sample=True is passed in --model_args here but not in the other Falcon
# runs - confirm it is intended (the section heading records that no score was returned).
!lm_eval --model hf \
    --model_args pretrained=tiiuae/falcon-7b-instruct,trust_remote_code=True,do_sample=True \
    --tasks bbh_fewshot \
    --num_fewshot 3 \
    --device cuda:0 \
    --batch_size 2 \
    --verbosity INFO \
    --output_path results_dir_falcon_bbh \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Falcon-7B-instruct-BBH \
    --use_cache results_dir_falcon_bbh \
    --cache_requests true \
    --write_out \
    --show_config

2024-04-04 12:53:53.374903: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 12:53:53.374955: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 12:53:53.376471: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_125359-gpuiznyr[0m
[34m[1mwandb[0m: Run [1m`wandb 

### Evaluate Mistral-7B-Instruct

In [29]:
# Model under evaluation in this section
eval_model = "mistralai/Mistral-7B-Instruct-v0.2"

# Per-model results folder on Google Drive. The model id contains a "/", so this
# creates a nested <organisation>/<model> folder.
# NOTE(review): the lm_eval commands in this section pass literal relative paths
# (e.g. results_dir_mistral_mmlu) to --output_path instead of this variable - confirm
# where results are meant to be stored.
results_dir = os.path.join("/content/drive/MyDrive/Evaluation", eval_model)
os.makedirs(results_dir, exist_ok=True)

#### MMLU

5-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

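A revision number should be pinned in the code to ensure a newer version of Mistral is not downloaded automatically; note that the command below does not currently pass a `revision=` value in `--model_args`.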

In [30]:
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.2,trust_remote_code=True \
    --tasks mmlu_stem,mmlu_social_sciences,mmlu_humanities,mmlu_other\
    --num_fewshot 5 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_mistral_mmlu \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Mistral-7B-Instruct-MMLU \
    --use_cache results_dir_mistral_mmlu \
    --cache_requests true \
    --show_config

2024-04-04 14:16:30.290687: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 14:16:30.290740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 14:16:30.292249: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_141636-ddcp3fwd[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### HellaSwag

0-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [31]:
# HellaSwag (0-shot) for Mistral-7B-Instruct-v0.2; samples are logged, the run is sent to
# W&B and request caching is enabled under the results_dir_mistral_hellaswag prefix.
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.2,trust_remote_code=True \
    --tasks hellaswag \
    --num_fewshot 0 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_mistral_hellaswag \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Mistral-7B-Instruct-hellaswag \
    --use_cache results_dir_mistral_hellaswag \
    --cache_requests true \
    --show_config

2024-04-04 14:44:36.365134: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 14:44:36.365184: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 14:44:36.366676: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_144442-57hckogd[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### BoolQ

0-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [32]:
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.2,trust_remote_code=True \
    --tasks boolq \
    --num_fewshot 0 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_mistral_boolq \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Mistral-7B-Instruct-boolq \
    --use_cache results_dir_mistral_boolq\
    --cache_requests true \
    --show_config

2024-04-04 14:51:21.626203: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 14:51:21.626265: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 14:51:21.627751: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_145127-k4cf672l[0m
[34m[1mwandb[0m: Run [1m`wandb 

#### BBH

3-Shot is used when running this evaluation to match the values used in the document [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)

In [34]:
!lm_eval --model hf \
    --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.2,trust_remote_code=True \
    --tasks bbh_fewshot \
    --num_fewshot 3 \
    --device cuda:0 \
    --batch_size auto:4 \
    --verbosity INFO \
    --output_path results_dir_mistral_bbh \
    --log_samples \
    --wandb_args project=quantized_model_evaluation,name=Mistral-7B-Instruct-bbh\
    --use_cache results_dir_mistral_bbh\
    --cache_requests true \
    --show_config

2024-04-04 15:06:50.514777: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 15:06:50.514836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 15:06:50.516605: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mted-whooley[0m ([33matu-twhool02[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/lm-evaluation-harness/wandb/run-20240404_150656-az7txizj[0m
[34m[1mwandb[0m: Run [1m`wandb 