# Fine tuning

### Load pretrained model

In [10]:
import os 
from sentence_transformers import SentenceTransformer
import json

from torch.utils.data import DataLoader
from sentence_transformers import InputExample


In [11]:
model_id = "BAAI/bge-small-en"
model = SentenceTransformer(model_id)

In [12]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### Define dataloader

In [21]:
math_example_path = os.getcwd() + '/data/What_Is_Mathematics_An_Elementary_Approach_to_Ideas_and_Methods.txt'

with open(math_example_path, "r",  encoding="utf8") as f:
     math_example_text = f.read()

math_sentences = math_example_text.split("\n")

def generate_math_dataset(math_sentences):
    examples = []
    for sentence in math_sentences:
        example = InputExample(texts=[sentence]) # temp
        examples.append(example)

    return examples

math_data = generate_math_dataset(math_sentences)[0:100]

In [14]:

# TRAIN_DATASET_FPATH = os.getcwd() + '/finetune_data/train_dataset.json'
# VAL_DATASET_FPATH = os.getcwd() + '/finetune_data/val_dataset.json'

# # We use a very small batchsize to run this toy example on a local machine. 
# # This should typically be much larger. 
# BATCH_SIZE = 10

# with open(TRAIN_DATASET_FPATH, 'r+') as f:
#     train_dataset = json.load(f)

# with open(VAL_DATASET_FPATH, 'r+') as f:
#     val_dataset = json.load(f)

# dataset = val_dataset

# from sentence_transformers.evaluation import InformationRetrievalEvaluator

# corpus = dataset['corpus']
# queries = dataset['queries']
# relevant_docs = dataset['relevant_docs']
# evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [18]:
BATCH_SIZE = 10

In [19]:

loader = DataLoader(
    #generate_db_dataset(train_dataset), 
    math_data,
    batch_size=BATCH_SIZE
)

### Define loss

In [119]:
# https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss
from sentence_transformers import losses

In [120]:
loss = losses.MultipleNegativesRankingLoss(model)

### Define evaluator 

### Run training 

In [124]:
EPOCHS = 10

In [125]:
import mlflow

# Definition of of callbak should be after model init
class MLFlowCallback:
    def __init__(self, model):
        self.model = model
    
    def __call__(self, score, epoch, steps) -> None:
        print(self.model)
        print(score, epoch, steps)
        mlflow.log_metric('score', score)
        # https://mlflow.org/docs/latest/tracking/artifacts-stores.html

mlflow_callback = MLFlowCallback(model)
           
warmup_steps = int(len(loader) * EPOCHS * 0.1)

with mlflow.start_run():
    model.fit(
        train_objectives=[(loader, loss)],
        epochs=EPOCHS,
        warmup_steps=warmup_steps,
        output_path='exp_finetune',
        show_progress_bar=True,
        #evaluator=evaluator, 
        evaluation_steps=50,
        callback=mlflow_callback 
    )

Epoch:   0%|                                                                                    | 0/10 [00:00<?, ?it/s]
Iteration:   0%|                                                                                | 0/10 [00:00<?, ?it/s][A
Iteration:  10%|███████▏                                                                | 1/10 [00:00<00:07,  1.22it/s][A
Iteration:  20%|██████████████▍                                                         | 2/10 [00:01<00:06,  1.33it/s][A
Iteration:  30%|█████████████████████▌                                                  | 3/10 [00:02<00:05,  1.29it/s][A
Iteration:  40%|████████████████████████████▊                                           | 4/10 [00:02<00:04,  1.38it/s][A
Iteration:  50%|████████████████████████████████████                                    | 5/10 [00:03<00:03,  1.41it/s][A
Iteration:  60%|███████████████████████████████████████████▏                            | 6/10 [00:04<00:02,  1.35it/s][A
Iteration:  70%|███

In [60]:
### llamaindex

In [20]:
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from llama_index.embeddings import HuggingFaceEmbedding# OpenAIEmbedding
import os

MODEL_PATH = os.getcwd() + r'\exp_finetune'
embed_model = HuggingFaceEmbedding(MODEL_PATH)

print(embed_model)

model_name='C:\\Users\\tempdelta\\Desktop\\temp_l\\exp_finetune' embed_batch_size=10 callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x0000026D001A91D0> tokenizer_name='C:\\Users\\tempdelta\\Desktop\\temp_l\\exp_finetune' max_length=512 pooling=<Pooling.CLS: 'cls'> normalize=True query_instruction=None text_instruction=None cache_folder=None


In [23]:
# https://github.com/run-llama/llama_index/issues/10051
top_k = 5
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)

math_nodes = [TextNode(id_=example.texts[0], text=example.texts[0]) for example in math_data] 

nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()] 
index = VectorStoreIndex(
    nodes, 
    service_context=service_context, 
    show_progress=True
)

retriever = index.as_retriever(similarity_top_k=top_k)

LLM is explicitly disabled. Using MockLLM.


NameError: name 'corpus' is not defined

In [None]:
# index.storage_context.persist(persist_dir="./storage")

In [7]:
# Wrap seq 2 seq model into custom llm model

from typing import Optional, List, Mapping, Any
from transformers import GPT2Tokenizer, GPT2LMHeadModel


from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback

class OurLLM(CustomLLM):
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "custom"
    dummy_response: str = "My response"
    tokenizer: GPT2Tokenizer = None
    model: GPT2LMHeadModel = None

    def __init__(self, tokenizer, model):
        super(CustomLLM, self).__init__()
        
        self.tokenizer = tokenizer
        self.model = model

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors='pt')
        output = model.generate(input_ids)

        return tokenizer.decode(output[0])

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        response = ""
        for token in self.dummy_response:
            response += token
            yield CompletionResponse(text=response, delta=token)


output_dir = "./finetuned_llm"

tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
llm_model = GPT2LMHeadModel.from_pretrained(output_dir) 
input_ids = tokenizer.encode("Djjas", add_special_tokens=True, return_tensors='pt')
output = llm_model.generate(input_ids)

print(tokenizer.decode(output[0]))


# define our LLM
llm = OurLLM(tokenizer, llm_model)

Djjas, who is also a member of the National Council of the Muslim Brotherhood, said the


In [95]:
from llama_index import load_index_from_storage
from llama_index.storage.storage_context import StorageContext
from llama_index import ServiceContext


service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

loaded_index = load_index_from_storage(storage_context=StorageContext.from_defaults(persist_dir="./storage"), service_context=service_context)

loaded_index = load_index_from_disk(StorageContext.from_defaults(persist_dir="./storage"))

In [96]:
retriever = loaded_index.as_retriever(similarity_top_k=top_k)

In [98]:
nodes

[NodeWithScore(node=TextNode(id_='79f581ef-80dd-4d04-9e8e-ec128f95e6b1', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='190905e6b7e3d6391293c392785096ddd4a66fc5553b42668936fbd624a55c83', text='Beingclassified  as  a  transportation  provider  would  result  in  a  VAT  (20%)  on  Gross  Bookings  or  on  the  service  fee  that  we  charge  Drivers,  both  retroactively  andprospectively.HMRC is considering a number of factors including our contractual Driver, Rider and intercompany arrangements, and HMRC is also expected toconsider the U.K. Supreme Court’s February 19, 2021 ruling on Drivers’ worker classification, in determining whether we should be classified as a provider oftransportation services.HMRC may update its assessment, which we would then review and discuss with HMRC.If we do not reach a satisfactory resolution afterexhausting HMRC’s review and appeals process, we would still be able to argue our case a