# 99. Case Study

In [2]:
from master_thesis.experiments import Experiment

2023-09-30 18:32:00.829472: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512_VNNI
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-30 18:32:01.038348: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## 99.1 Create Custom Resources

In [3]:
from typing import Any, Callable, Dict, Optional

from datasets import Dataset

from master_thesis.datasets import SQUADv1Dataset
from master_thesis.defaults import RANDOM_SEED


class NoContextDataset(SQUADv1Dataset):
    _percent_no_context: float

    def __init__(
        self,
        percent_no_context: int = 0.10,
        train_limit: Optional[int] = None,
        test_limit: Optional[int] = None,
    ) -> None:
        super().__init__(train_limit=train_limit, test_limit=test_limit)

        self._percent_no_context = percent_no_context

    @property
    def train(self) -> Dataset:
        if not self._train:
            self._train = (
                self._dataset_dict["train"]
                .map(
                    self._format,
                    batched=True,
                    remove_columns=self._dataset_dict["train"].column_names,
                )
                .shuffle(seed=RANDOM_SEED)
            )

            # Add no context samples
            self._train = self._train.map(
                self._introduce_no_context_samples(
                    0,
                    self._train.num_rows * self._percent_no_context,
                ),
                batched=False,
                with_indices=True,
            ).shuffle(seed=RANDOM_SEED)

            if self._train_limit:
                self._train = self._train.select(range(self._train_limit))

        return self._train

    @property
    def test(self) -> Dataset:
        if not self._test:
            self._test = (
                self._dataset_dict["validation"]
                .map(
                    self._format,
                    batched=True,
                    remove_columns=self._dataset_dict["validation"].column_names,
                )
                .shuffle(seed=RANDOM_SEED)
            )

            # Add no context samples
            self._test = self._test.map(
                self._introduce_no_context_samples(
                    0,
                    self._test.num_rows * self._percent_no_context,
                ),
                batched=False,
                with_indices=True,
            ).shuffle(seed=RANDOM_SEED)

            if self._test_limit:
                self._test = self._test.select(range(self._test_limit))

        return self._test

    def _introduce_no_context_samples(
        self,
        start_idx: int,
        end_idx: int,
    ) -> Callable[[Dict[str, Any], int], Dict[str, Any]]:
        def map_function(sample: Dict[str, Any], idx: int) -> Dict[str, Any]:
            if idx >= start_idx and idx <= end_idx:
                return {**sample, "references": [""]}
            return sample

        return map_function

In [4]:
from langchain.chains.base import Chain
from langchain.evaluation import EmbeddingDistance, EvaluatorType, load_evaluator

from master_thesis.base import BaseMetric


class CosineSimilarityMetric(BaseMetric):
    _evaluator: Chain

    def __init__(self) -> None:
        super().__init__()

        self._evaluator = load_evaluator(
            EvaluatorType.EMBEDDING_DISTANCE,
            distance_metric=EmbeddingDistance.COSINE,
        )

    def test(self, question: str, answer: str, prediction: str) -> float:
        return self._evaluator.evaluate_strings(
            prediction=prediction,
            reference=answer,
        )["score"]

In [5]:
from typing import List

from master_thesis.base import BasePrompt


class SimpleLLaMAPrompt(BasePrompt):
    def run(self, references: Optional[List[str]], question: str) -> str:
        prompt = f"""<s>[INST] <<SYS>>
        {self._format_references(references)}
        <</SYS>>

        {question} [/INST]
        """
        
        print(self._format_prompt(prompt))

        return self._model.generate(self._format_prompt(prompt))

## 99.2 Test Base Model + Simple Prompt

* Model: `meta-llama/llama-2-7b-chat-hf`
* Finetune: `None`
* Prompt: `SimpleLLaMAPrompt`
* Dataset: `NoContextDataset`
* Metric: `CosineSimilarityMetric`

In [6]:
base_model_experiment = Experiment(
    experiment_name="case_study_base_model_simple_prompt",
    model="meta-llama/Llama-2-7b-chat-hf",
    model_config={
        "max_tokens": 256,
        "stop_sequences": ["\n"],
        "temperature": 0.7,
        "top_p": 1.0,
    },
    dataset=NoContextDataset,
    dataset_config={"train_limit": 10, "test_limit": 10},
    metric=CosineSimilarityMetric,
    metric_config={},
    prompt=SimpleLLaMAPrompt,
    prompt_config={},
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [14]:
base_model_experiment.run()



Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/34726 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]

<s>[INST] <<SYS>>
V
i
c
t
o
r
i
a
 
h
a
s
 
a
 
w
r
i
t
t
e
n
 
c
o
n
s
t
i
t
u
t
i
o
n
 
e
n
a
c
t
e
d
 
i
n
 
1
9
7
5
,
 
b
u
t
 
b
a
s
e
d
 
o
n
 
t
h
e
 
1
8
5
5
 
c
o
l
o
n
i
a
l
 
c
o
n
s
t
i
t
u
t
i
o
n
,
 
p
a
s
s
e
d
 
b
y
 
t
h
e
 
U
n
i
t
e
d
 
K
i
n
g
d
o
m
 
P
a
r
l
i
a
m
e
n
t
 
a
s
 
t
h
e
 
V
i
c
t
o
r
i
a
 
C
o
n
s
t
i
t
u
t
i
o
n
 
A
c
t
 
1
8
5
5
,
 
w
h
i
c
h
 
e
s
t
a
b
l
i
s
h
e
s
 
t
h
e
 
P
a
r
l
i
a
m
e
n
t
 
a
s
 
t
h
e
 
s
t
a
t
e
'
s
 
l
a
w
-
m
a
k
i
n
g
 
b
o
d
y
 
f
o
r
 
m
a
t
t
e
r
s
 
c
o
m
i
n
g
 
u
n
d
e
r
 
s
t
a
t
e
 
r
e
s
p
o
n
s
i
b
i
l
i
t
y
.
 
T
h
e
 
V
i
c
t
o
r
i
a
n
 
C
o
n
s
t
i
t
u
t
i
o
n
 
c
a
n
 
b
e
 
a
m
e
n
d
e
d
 
b
y
 
t
h
e
 
P
a
r
l
i
a
m
e
n
t
 
o
f
 
V
i
c
t
o
r
i
a
,
 
e
x
c
e
p
t
 
f
o
r
 
c
e
r
t
a
i
n
 
"
e
n
t
r
e
n
c
h
e
d
"
 
p
r
o
v
i
s
i
o
n
s
 
t
h
a
t
 
r
e
q
u
i
r
e
 
e
i
t
h
e
r
 
a
n
 
a
b
s
o
l
u
t
e
 
m
a
j
o
r
i
t
y
 
i
n
 
b
o
t
h
 
h
o
u
s
e
s
,
 
a
 
t
h
r
e
e
-
f
i
f
t
h
s
 
m
a
j
o
r
i
t
y
 
i
n
 
b
o
t


  1%|          | 1/100 [00:18<30:35, 18.54s/it]

<s>[INST] <<SYS>>
T
h
e
 
M
o
n
g
o
l
s
 
l
e
a
r
n
e
d
 
f
r
o
m
 
c
a
p
t
i
v
e
s
 
o
f
 
t
h
e
 
a
b
u
n
d
a
n
t
 
g
r
e
e
n
 
p
a
s
t
u
r
e
s
 
b
e
y
o
n
d
 
t
h
e
 
B
u
l
g
a
r
 
t
e
r
r
i
t
o
r
y
,
 
a
l
l
o
w
i
n
g
 
f
o
r
 
t
h
e
 
p
l
a
n
n
i
n
g
 
f
o
r
 
c
o
n
q
u
e
s
t
 
o
f
 
H
u
n
g
a
r
y
 
a
n
d
 
E
u
r
o
p
e
.
 
G
e
n
g
h
i
s
 
K
h
a
n
 
r
e
c
a
l
l
e
d
 
S
u
b
u
t
a
i
 
b
a
c
k
 
t
o
 
M
o
n
g
o
l
i
a
 
s
o
o
n
 
a
f
t
e
r
w
a
r
d
s
,
 
a
n
d
 
J
e
b
e
 
d
i
e
d
 
o
n
 
t
h
e
 
r
o
a
d
 
b
a
c
k
 
t
o
 
S
a
m
a
r
k
a
n
d
.
 
T
h
e
 
f
a
m
o
u
s
 
c
a
v
a
l
r
y
 
e
x
p
e
d
i
t
i
o
n
 
l
e
d
 
b
y
 
S
u
b
u
t
a
i
 
a
n
d
 
J
e
b
e
,
 
i
n
 
w
h
i
c
h
 
t
h
e
y
 
e
n
c
i
r
c
l
e
d
 
t
h
e
 
e
n
t
i
r
e
 
C
a
s
p
i
a
n
 
S
e
a
 
d
e
f
e
a
t
i
n
g
 
a
l
l
 
a
r
m
i
e
s
 
i
n
 
t
h
e
i
r
 
p
a
t
h
,
 
r
e
m
a
i
n
s
 
u
n
p
a
r
a
l
l
e
l
e
d
 
t
o
 
t
h
i
s
 
d
a
y
,
 
a
n
d
 
w
o
r
d
 
o
f
 
t
h
e
 
M
o
n
g
o
l
 
t
r
i
u
m
p
h
s
 
b
e
g
a
n
 
t
o
 
t
r
i
c
k
l
e
 
t
o
 
o
t


  2%|▏         | 2/100 [00:34<27:55, 17.09s/it]

<s>[INST] <<SYS>>
T
h
e
 
r
e
v
i
v
e
d
 
s
e
r
i
e
s
 
h
a
s
 
r
e
c
e
i
v
e
d
 
r
e
c
o
g
n
i
t
i
o
n
 
f
r
o
m
 
c
r
i
t
i
c
s
 
a
n
d
 
t
h
e
 
p
u
b
l
i
c
,
 
a
c
r
o
s
s
 
v
a
r
i
o
u
s
 
a
w
a
r
d
s
 
c
e
r
e
m
o
n
i
e
s
.
 
I
t
 
w
o
n
 
f
i
v
e
 
B
A
F
T
A
 
T
V
 
A
w
a
r
d
s
,
 
i
n
c
l
u
d
i
n
g
 
B
e
s
t
 
D
r
a
m
a
 
S
e
r
i
e
s
,
 
t
h
e
 
h
i
g
h
e
s
t
-
p
r
o
f
i
l
e
 
a
n
d
 
m
o
s
t
 
p
r
e
s
t
i
g
i
o
u
s
 
B
r
i
t
i
s
h
 
t
e
l
e
v
i
s
i
o
n
 
a
w
a
r
d
 
f
o
r
 
w
h
i
c
h
 
t
h
e
 
s
e
r
i
e
s
 
h
a
s
 
e
v
e
r
 
b
e
e
n
 
n
o
m
i
n
a
t
e
d
.
 
I
t
 
w
a
s
 
v
e
r
y
 
p
o
p
u
l
a
r
 
a
t
 
t
h
e
 
B
A
F
T
A
 
C
y
m
r
u
 
A
w
a
r
d
s
,
 
w
i
t
h
 
2
5
 
w
i
n
s
 
o
v
e
r
a
l
l
 
i
n
c
l
u
d
i
n
g
 
B
e
s
t
 
D
r
a
m
a
 
S
e
r
i
e
s
 
(
t
w
i
c
e
)
,
 
B
e
s
t
 
S
c
r
e
e
n
p
l
a
y
/
S
c
r
e
e
n
w
r
i
t
e
r
 
(
t
h
r
i
c
e
)
 
a
n
d
 
B
e
s
t
 
A
c
t
o
r
.
 
I
t
 
w
a
s
 
a
l
s
o
 
n
o
m
i
n
a
t
e
d
 
f
o
r
 
7
 
S
a
t
u
r
n
 
A
w
a
r
d
s
,
 
w
i
n
n
i
n
g
 
t
h
e
 


  2%|▏         | 2/100 [00:40<32:52, 20.13s/it]


KeyboardInterrupt: 

## 99.3 Test Finetuned Model + Simple Prompt

* Model: `meta-llama/llama-2-7b-chat-hf`
* Finetune: `NoContextDataset`
* Prompt: `SimpleLLaMAPrompt`
* Dataset: `NoContextDataset`
* Metric: `CosineSimilarityMetric`

In [8]:
# TODO: create finetune format

In [9]:
# TODO: create experiment (case_study_finetuned_model_simple_prompt)

In [10]:
# TODO: run experiment

## 99.4  Test Base Model + Advanced Prompt

* Model: `meta-llama/llama-2-7b-chat-hf`
* Finetune: `None`
* Prompt: `AdvancedLLaMAPrompt`
* Dataset: `NoContextDataset`
* Metric: `CosineSimilarityMetric`

In [11]:
# TODO: create advanced llama prompt

In [12]:
# TODO: create experiment (case_study_base_model_advanced_prompt)

In [13]:
# TODO: run experiment

## 99.5 Compare Results