# Fine tuning

### Load pretrained model

In [40]:
from sentence_transformers import SentenceTransformer
import os 

In [41]:
model_id = "BAAI/bge-small-en"
model = SentenceTransformer(model_id)

In [42]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### Define dataloader

In [48]:
import json

from torch.utils.data import DataLoader
from sentence_transformers import InputExample


In [104]:
math_example_path = os.getcwd() + '/data/What_Is_Mathematics_An_Elementary_Approach_to_Ideas_and_Methods.txt'

with open(math_example_path, "r",  encoding="utf8") as f:
     math_example_text = f.read()
math_sentences = math_example_text.split("\n")

with open(TRAIN_DATASET_FPATH, 'r+') as f:
    train_dataset = json.load(f)
    
TRAIN_DATASET_FPATH = os.getcwd() + '/finetune_data/train_dataset.json'
VAL_DATASET_FPATH = os.getcwd() + '/finetune_data/val_dataset.json'

# We use a very small batchsize to run this toy example on a local machine. 
# This should typically be much larger. 
BATCH_SIZE = 10

In [85]:
with open(TRAIN_DATASET_FPATH, 'r+') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r+') as f:
    val_dataset = json.load(f)

In [108]:
def generate_db_dataset(train_dataset):
    dataset = train_dataset
    
    corpus = dataset['corpus']
    queries = dataset['queries']
    relevant_docs = dataset['relevant_docs']
    
    
    
    examples = []
    for query_id, query in queries.items():
        node_id = relevant_docs[query_id][0]
        text = corpus[node_id]
        example = InputExample(texts=[query, text])
        examples.append(example)

    return examples

def generate_math_dataset(math_sentences):
    examples = []
    for sentence in math_sentences:
        example = InputExample(texts=[sentence, sentence])
        examples.append(example)

    return examples

In [128]:

math_data = generate_math_dataset(math_sentences)[0:100]
loader = DataLoader(
    #generate_db_dataset(train_dataset), 
    math_data,
    batch_size=BATCH_SIZE
)

### Define loss

In [119]:
# https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss
from sentence_transformers import losses

In [120]:
loss = losses.MultipleNegativesRankingLoss(model)

### Define evaluator 

In [121]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

In [122]:
dataset = val_dataset

corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

# evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

### Run training 

In [124]:
EPOCHS = 10

In [125]:
import mlflow

# Definition of of callbak should be after model init
class MLFlowCallback:
    def __init__(self, model):
        self.model = model
    
    def __call__(self, score, epoch, steps) -> None:
        print(self.model)
        print(score, epoch, steps)
        mlflow.log_metric('score', score)
        # https://mlflow.org/docs/latest/tracking/artifacts-stores.html

mlflow_callback = MLFlowCallback(model)
           
warmup_steps = int(len(loader) * EPOCHS * 0.1)

with mlflow.start_run():
    model.fit(
        train_objectives=[(loader, loss)],
        epochs=EPOCHS,
        warmup_steps=warmup_steps,
        output_path='exp_finetune',
        show_progress_bar=True,
        #evaluator=evaluator, 
        evaluation_steps=50,
        callback=mlflow_callback 
    )

Epoch:   0%|                                                                                    | 0/10 [00:00<?, ?it/s]
Iteration:   0%|                                                                                | 0/10 [00:00<?, ?it/s][A
Iteration:  10%|███████▏                                                                | 1/10 [00:00<00:07,  1.22it/s][A
Iteration:  20%|██████████████▍                                                         | 2/10 [00:01<00:06,  1.33it/s][A
Iteration:  30%|█████████████████████▌                                                  | 3/10 [00:02<00:05,  1.29it/s][A
Iteration:  40%|████████████████████████████▊                                           | 4/10 [00:02<00:04,  1.38it/s][A
Iteration:  50%|████████████████████████████████████                                    | 5/10 [00:03<00:03,  1.41it/s][A
Iteration:  60%|███████████████████████████████████████████▏                            | 6/10 [00:04<00:02,  1.35it/s][A
Iteration:  70%|███

In [60]:
### llamaindex

In [126]:
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from llama_index.embeddings import HuggingFaceEmbedding# OpenAIEmbedding

MODEL_PATH = os.getcwd() + r'\exp_finetune'
embed_model = HuggingFaceEmbedding(MODEL_PATH)

print(embed_model)

model_name='C:\\Users\\tempdelta\\Desktop\\temp_l\\exp_finetune' embed_batch_size=10 callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x0000012F44FDB810> tokenizer_name='C:\\Users\\tempdelta\\Desktop\\temp_l\\exp_finetune' max_length=512 pooling=<Pooling.CLS: 'cls'> normalize=True query_instruction=None text_instruction=None cache_folder=None


In [135]:
# https://github.com/run-llama/llama_index/issues/10051
top_k = 5
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)

math_nodes = [TextNode(id_=example.texts[0], text=example.texts[0]) for example in math_data] 

nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()] 
index = VectorStoreIndex(
    nodes, 
    service_context=service_context, 
    show_progress=True
)

retriever = index.as_retriever(similarity_top_k=top_k)

LLM is explicitly disabled. Using MockLLM.


Generating embeddings: 100%|█████████████████████████████████████████████████████████| 395/395 [04:37<00:00,  1.42it/s]


In [136]:
index.storage_context.persist(persist_dir="./storage")

In [142]:
from llama_index import load_index_from_storage
from llama_index.storage.storage_context import StorageContext
loaded_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./storage"), llm=None)

ValueError: 
******
Could not load OpenAI model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

To disable the LLM entirely, set llm=None.
******

 
retriever = index.as_retriever(similarity_top_k=top_k)))

loaded_index = load_index_from_disk(StorageContext.from_defaults(persist_dir="./storage"))

In [133]:
math_data[0].texts

['Skip to main content', 'Skip to main content']