# Fine tuning

### Load pretrained model

In [6]:
import os 
from sentence_transformers import SentenceTransformer
import json

from torch.utils.data import DataLoader
from sentence_transformers import InputExample


In [7]:
# Identifier of the base embedding checkpoint that will be fine-tuned.
model_id = "BAAI/bge-small-en"
# Load that checkpoint as a SentenceTransformer pipeline.
model = SentenceTransformer(model_name_or_path=model_id)

In [8]:
# Bare expression: the notebook renders the repr of the loaded pipeline as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### Define dataloader

In [46]:
# Raw text of the book, expected under ./data relative to the current working directory.
math_example_path = os.getcwd() + '/data/What_Is_Mathematics_An_Elementary_Approach_to_Ideas_and_Methods.txt'

# Read the whole file at once, then treat each newline-delimited line as one "sentence".
with open(math_example_path, mode="r", encoding="utf8") as f:
    math_example_text = f.read()
math_sentences = math_example_text.split("\n")

def generate_math_dataset(math_sentences):
    """Wrap every non-blank line in a single-text ``InputExample``.

    Args:
        math_sentences: Iterable of strings, one candidate sentence per item.

    Returns:
        list: One ``InputExample(texts=[sentence])`` per kept sentence, in
        input order. The stored text is the sentence exactly as given.
    """
    # Lines that are empty or contain only whitespace carry nothing to embed,
    # so both are skipped (the previous length check let whitespace-only
    # lines through).
    # NOTE(review): each example carries a single text (flagged "temp" in the
    # original), while MultipleNegativesRankingLoss is documented to expect
    # (anchor, positive) pairs -- confirm before relying on the training run.
    return [
        InputExample(texts=[sentence])
        for sentence in math_sentences
        if sentence.strip()
    ]

# Keep only the 100 examples at positions 1000-1099 of the generated list
# (presumably to keep this toy run small -- the window itself is arbitrary).
math_data = generate_math_dataset(math_sentences)[1000:1100]

In [14]:

# TRAIN_DATASET_FPATH = os.getcwd() + '/finetune_data/train_dataset.json'
# VAL_DATASET_FPATH = os.getcwd() + '/finetune_data/val_dataset.json'

# # We use a very small batch size so this toy example can run on a local machine.
# # For real training this should typically be much larger.
# BATCH_SIZE = 10

# with open(TRAIN_DATASET_FPATH, 'r+') as f:
#     train_dataset = json.load(f)

# with open(VAL_DATASET_FPATH, 'r+') as f:
#     val_dataset = json.load(f)

# dataset = val_dataset

# from sentence_transformers.evaluation import InformationRetrievalEvaluator

# corpus = dataset['corpus']
# queries = dataset['queries']
# relevant_docs = dataset['relevant_docs']
# evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [18]:
# Number of examples per batch handed to the training DataLoader.
BATCH_SIZE = 10

In [19]:

# Batch the math examples for training. No shuffling is requested, so batches
# follow the order of math_data.
# Alternative data source kept from the original: generate_db_dataset(train_dataset)
loader = DataLoader(dataset=math_data, batch_size=BATCH_SIZE)

### Define loss

In [119]:
# https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss
from sentence_transformers import losses

In [120]:
# Ranking loss bound to the model that is about to be fine-tuned.
loss = losses.MultipleNegativesRankingLoss(model=model)

### Define evaluator 

### Run training 

In [124]:
# Number of epochs passed to model.fit; also used to size the warmup schedule.
EPOCHS = 10

In [125]:
import mlflow

# The callback must be defined after the model has been initialised,
# because it keeps a reference to that model.
class MLFlowCallback:
    """Callable that forwards evaluation scores to the active MLflow run.

    Meant to be passed as the ``callback`` argument of ``model.fit``, which
    calls it with ``(score, epoch, steps)``.
    """

    def __init__(self, model):
        """Store the model being trained and reset the logging counter.

        Args:
            model: The model under training; it is printed on every call.
        """
        self.model = model
        # Number of scores logged so far. It is used as the MLflow ``step`` so
        # successive scores form a series instead of all landing on the
        # default step.
        self._logged_scores = 0

    def __call__(self, score, epoch, steps) -> None:
        """Print the current state and log ``score`` as the 'score' metric.

        Args:
            score: Evaluation score to record.
            epoch: Epoch value supplied by the trainer (printed only).
            steps: Step value supplied by the trainer (printed only).
        """
        print(self.model)
        print(score, epoch, steps)
        mlflow.log_metric('score', score, step=self._logged_scores)
        self._logged_scores += 1
        # https://mlflow.org/docs/latest/tracking/artifacts-stores.html

mlflow_callback = MLFlowCallback(model)

# Warmup covers the first 10% of all training steps (batches per epoch * epochs).
total_train_steps = len(loader) * EPOCHS
warmup_steps = int(total_train_steps * 0.1)

# Everything logged during training is attached to a single MLflow run.
# NOTE(review): per the sentence-transformers docs the callback is invoked
# after each evaluation; with the evaluator commented out it presumably never
# fires -- confirm.
with mlflow.start_run():
    model.fit(
        train_objectives=[(loader, loss)],
        epochs=EPOCHS,
        warmup_steps=warmup_steps,
        # evaluator=evaluator,
        evaluation_steps=50,
        callback=mlflow_callback,
        output_path='exp_finetune',
        show_progress_bar=True,
    )

Epoch:   0%|                                                                                    | 0/10 [00:00<?, ?it/s]
Iteration:   0%|                                                                                | 0/10 [00:00<?, ?it/s][A
Iteration:  10%|███████▏                                                                | 1/10 [00:00<00:07,  1.22it/s][A
Iteration:  20%|██████████████▍                                                         | 2/10 [00:01<00:06,  1.33it/s][A
Iteration:  30%|█████████████████████▌                                                  | 3/10 [00:02<00:05,  1.29it/s][A
Iteration:  40%|████████████████████████████▊                                           | 4/10 [00:02<00:04,  1.38it/s][A
Iteration:  50%|████████████████████████████████████                                    | 5/10 [00:03<00:03,  1.41it/s][A
Iteration:  60%|███████████████████████████████████████████▏                            | 6/10 [00:04<00:02,  1.35it/s][A
Iteration:  70%|███

In [60]:
### llamaindex

In [64]:
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from llama_index.embeddings import HuggingFaceEmbedding# OpenAIEmbedding
import os

# Checkpoint used to embed nodes and queries for retrieval.
# Alternative kept from the original (local fine-tuned output): os.getcwd() + r'\exp_finetune'
MODEL_PATH = "BAAI/bge-small-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=MODEL_PATH)

print(embed_model)

model.safetensors: 100%|█████████████████████████████████████████████████████████████| 133M/133M [02:29<00:00, 895kB/s]
tokenizer_config.json: 100%|███████████████████████████████████████████████████████████| 366/366 [00:00<00:00, 366kB/s]
vocab.txt: 100%|█████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 983kB/s]
tokenizer.json: 100%|███████████████████████████████████████████████████████████████| 711k/711k [00:00<00:00, 1.40MB/s]
special_tokens_map.json: 100%|█████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 125kB/s]

model_name='BAAI/bge-small-en-v1.5' embed_batch_size=10 callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001D86B0DBD50> tokenizer_name='BAAI/bge-small-en-v1.5' max_length=512 pooling=<Pooling.CLS: 'cls'> normalize=True query_instruction=None text_instruction=None cache_folder=None





In [65]:
# https://github.com/run-llama/llama_index/issues/10051
top_k = 5
service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)

# Variant keyed by the sentence text itself (not passed to the index built below).
math_nodes = [
    TextNode(id_=math_example.texts[0], text=math_example.texts[0])
    for math_example in math_data
]

# Variant keyed by position in math_data; these are the nodes that get indexed.
nodes = [
    TextNode(id_=position, text=math_example.texts[0])
    for position, math_example in enumerate(math_data)
]
index = VectorStoreIndex(nodes, service_context=service_context, show_progress=True)

retriever = index.as_retriever(similarity_top_k=top_k)

LLM is explicitly disabled. Using MockLLM.


Generating embeddings: 100%|████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 215.46it/s]


In [68]:
# TEST: smoke-check the freshly built in-memory index with an arbitrary query.

#retriever = loaded_index.as_retriever(similarity_top_k=top_k)
# Query engine with default settings over `index`; the service context above was created with llm=None.
new_query_engine = index.as_query_engine()
response = new_query_engine.query("Who am I?")
print(response)

Context information is below.
---------------------
_m

as anything can be, but the character of this statement is not the same
---------------------
Given the context information and not prior knowledge, answer the query.
Query: Who am I?
Answer: 


In [54]:
# index.storage_context.persist(persist_dir="./storage")

In [29]:
# Wrap seq 2 seq model into custom llm model

from typing import Optional, List, Mapping, Any
from transformers import GPT2Tokenizer, GPT2LMHeadModel


from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback

class OurLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` wrapper around a GPT-2 tokenizer/model pair.

    ``complete`` generates text with the wrapped model. ``stream_complete``
    is still a placeholder that streams ``dummy_response``.
    """

    # Values reported to LlamaIndex through ``metadata``.
    # NOTE(review): 3900 looks larger than the context size of stock GPT-2
    # checkpoints -- confirm it suits the wrapped model.
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "custom"
    # Placeholder text emitted by ``stream_complete``.
    dummy_response: str = "My response"
    # Both are filled in by ``__init__``.
    tokenizer: GPT2Tokenizer = None
    model: GPT2LMHeadModel = None

    def __init__(self, tokenizer, model):
        """Initialise the base LLM and attach the tokenizer/model pair.

        Args:
            tokenizer: Tokenizer used to encode prompts and decode outputs.
            model: Language model whose ``generate`` produces completions.
        """
        # Plain super() walks the full MRO; naming CustomLLM explicitly
        # would start the lookup after it and skip that class.
        super().__init__()

        self.tokenizer = tokenizer
        self.model = model

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a completion for ``prompt`` with the wrapped model.

        Args:
            prompt: Text to encode and feed to the model.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            CompletionResponse: ``text`` holds the decoded first generated
            sequence.
        """
        # Use this instance's tokenizer/model. The bare names would resolve
        # to notebook globals, where ``model`` is a different object.
        input_ids = self.tokenizer.encode(
            prompt, add_special_tokens=True, return_tensors='pt'
        )
        output = self.model.generate(input_ids)

        # Wrap the decoded string so the return value matches the declared
        # type and exposes ``.text``.
        return CompletionResponse(text=self.tokenizer.decode(output[0]))

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Stream ``dummy_response`` one character at a time.

        The prompt is ignored; each yielded response carries the text
        accumulated so far plus the newest character as ``delta``.
        """
        response = ""
        for token in self.dummy_response:
            response += token
            yield CompletionResponse(text=response, delta=token)


# Directory the GPT-2 tokenizer and weights are restored from.
output_dir = "./finetuned_llm"

llm_model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

# Smoke test: run one generation outside LlamaIndex and show the decoded text.
input_ids = tokenizer.encode("Djjas", return_tensors='pt', add_special_tokens=True)
output = llm_model.generate(input_ids)
print(tokenizer.decode(output[0]))

# define our LLM
llm = OurLLM(tokenizer=tokenizer, model=llm_model)



Djjas, who is also a member of the National Council of the Muslim Brotherhood, said the


In [47]:
from llama_index import load_index_from_storage
from llama_index.storage.storage_context import StorageContext
from llama_index import ServiceContext


# Same embedding model as before; llm=None leaves the LLM disabled (the original marks this slot with "#LLM").
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)

# Rebuild the index from whatever was previously persisted under ./storage.
# NOTE(review): the persist call earlier in this notebook is commented out, so
# this depends on files written by some earlier run -- confirm they match the
# current data.
loaded_index = load_index_from_storage(
    storage_context=StorageContext.from_defaults(persist_dir="./storage"),
    service_context=service_context,
)

LLM is explicitly disabled. Using MockLLM.


In [48]:
# Retriever over the reloaded index, asking for the top_k most similar nodes.
retriever = loaded_index.as_retriever(similarity_top_k=top_k)
# NOTE(review): the query engine is created with defaults and is not given `retriever` or top_k -- confirm that is intended.
new_query_engine = loaded_index.as_query_engine()

In [49]:
# Run a sample question through the query engine built on the reloaded index and show the result.
response = new_query_engine.query("What is math?")
print(response)

Context information is below.
---------------------
requirements in effect on such day (including basic, supplemental, marginal and emergency reserves under any regulations of the Board orother Governmental Authority having jurisdiction with respect thereto) dealing with reserve requirements prescribed for eurocurrencyfunding (currently referred to as “Eurocurrency Liabilities” in Regulation D of the Board) maintained by a member bank of the FederalReserve System.“Application” means a Letter of Credit application or agreement in the form approved by the applicable Issuing Bank, executed anddelivered by the Borrower to the Administrative Agent and the applicable Issuing Bank requesting such Issuing Bank to issue a Letter ofCredit.“Approved Fund” means any Person (other than a natural person) that is engaged in making, purchasing, holding or investing inbank loans and similar extensions of credit in the ordinary course of its activities and that is administered or managed by (a) a Lender