# Fine tuning

### Load pretrained model

In [2]:
from sentence_transformers import SentenceTransformer
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Identifier of the pretrained embedding model that the rest of the notebook fine-tunes.
model_id = "BAAI/bge-small-en"
model = SentenceTransformer(model_id)

In [42]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### Define dataloader

In [4]:
import json

from torch.utils.data import DataLoader
from sentence_transformers import InputExample


In [3]:
# Plain-text copy of the book, expected under ./data relative to the working directory.
# os.path.join keeps the separator correct on every platform.
math_example_path = os.path.join(
    os.getcwd(),
    'data',
    'What_Is_Mathematics_An_Elementary_Approach_to_Ideas_and_Methods.txt',
)

# Read the whole book into memory as a single string.
with open(math_example_path, "r", encoding="utf8") as f:
    math_example_text = f.read()

def generate_math_dataset(math_sentences):
    """Build one ``InputExample`` per sentence.

    Each example pairs the sentence with itself (a temporary placeholder
    pairing), and the examples are returned in input order.
    """
    # temp: both sides of the pair are the same sentence
    return [InputExample(texts=[line, line]) for line in math_sentences]

# One entry per line of the book; blank lines are kept as empty strings.
math_sentences = math_example_text.split("\n")
# Slice before building examples so only the 100 examples that are kept get
# constructed (one example is produced per sentence, so the result is unchanged).
math_data = generate_math_dataset(math_sentences[0:100])


NameError: name 'InputExample' is not defined

In [8]:


# Locations of the train/validation splits. They are defined before their first
# use so this cell also runs on a fresh kernel.
TRAIN_DATASET_FPATH = os.getcwd() + '/finetune_data/train_dataset.json'
VAL_DATASET_FPATH = os.getcwd() + '/finetune_data/val_dataset.json'

# We use a very small batch size to run this toy example on a local machine.
# This should typically be much larger.
BATCH_SIZE = 10

# The file is only read, so open it read-only ('r+' would also require write access).
with open(TRAIN_DATASET_FPATH, 'r', encoding='utf8') as f:
    train_dataset = json.load(f)

NameError: name 'generate_math_dataset' is not defined

In [85]:
# Load both dataset splits. The files are only read, so they are opened
# read-only ('r+' would also require write access) with an explicit encoding.
with open(TRAIN_DATASET_FPATH, 'r', encoding='utf8') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r', encoding='utf8') as f:
    val_dataset = json.load(f)

In [108]:
def generate_db_dataset(train_dataset):
    """Turn a dataset dict into a list of (query, document) ``InputExample`` pairs.

    ``train_dataset`` must provide the keys ``'corpus'``, ``'queries'`` and
    ``'relevant_docs'``. Each query is paired with the corpus text of the
    first id listed for it in ``relevant_docs``.
    """
    corpus = train_dataset['corpus']
    queries = train_dataset['queries']
    relevant_docs = train_dataset['relevant_docs']

    # Only the first relevant document of each query is used.
    return [
        InputExample(texts=[query_text, corpus[relevant_docs[qid][0]]])
        for qid, query_text in queries.items()
    ]


In [7]:

# Training batches for model.fit. shuffle=True re-draws the batch composition
# every epoch instead of always grouping the same consecutive sentences.
loader = DataLoader(
    #generate_db_dataset(train_dataset), 
    math_data,
    shuffle=True,
    batch_size=BATCH_SIZE
)

NameError: name 'generate_math_dataset' is not defined

### Define loss

In [119]:
# https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss
from sentence_transformers import losses

In [120]:
# Training objective handed to model.fit together with the loader.
# NOTE(review): the GPT-2 training loop further down reassigns the name `loss`
# to a tensor; re-run this cell before calling model.fit again after that loop.
loss = losses.MultipleNegativesRankingLoss(model)

### Define evaluator 

In [121]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

In [122]:
# Evaluation inputs are taken from the validation split loaded earlier.
dataset = val_dataset

# Pull out the three top-level entries of the split; `corpus` is reused later
# when the vector index is built.
corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

# NOTE(review): evaluator construction is disabled, and the evaluator argument
# of model.fit is commented out as well.
# evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

### Run training 

In [124]:
# Number of epochs passed to model.fit; also used to derive warmup_steps.
# NOTE(review): EPOCHS is reassigned to 5 by the GPT-2 training cell further down.
EPOCHS = 10

In [125]:
import mlflow

# The callback must be defined after the model has been initialised.
class MLFlowCallback:
    """Callable that reports evaluation scores to MLflow.

    An instance is called as ``callback(score, epoch, steps)``. Every call
    prints the three values and logs ``score`` under the metric name
    ``'score'``, using a per-instance call counter as the MLflow step so that
    successive scores form a history instead of all landing on step 0.
    """

    def __init__(self, model):
        """Store ``model`` and reset the logging step counter."""
        self.model = model
        # Monotonic step for mlflow.log_metric; `steps` itself is not used
        # because it is not guaranteed to increase across epochs.
        self._log_step = 0

    def __call__(self, score, epoch, steps) -> None:
        """Print the evaluation result and log ``score`` to MLflow."""
        # The model repr is no longer printed here: it is identical on every
        # call and only buries the scores in the cell output.
        print(score, epoch, steps)
        mlflow.log_metric('score', score, step=self._log_step)
        self._log_step += 1
        # https://mlflow.org/docs/latest/tracking/artifacts-stores.html

# Callback instance bound to the embedding model; passed to model.fit below.
mlflow_callback = MLFlowCallback(model)
           
# 10% of the total number of training batches (batches per epoch * epochs).
warmup_steps = int(len(loader) * EPOCHS * 0.1)

# Train inside an MLflow run; the fine-tuned model is written to ./exp_finetune.
with mlflow.start_run():
    model.fit(
        train_objectives=[(loader, loss)],
        epochs=EPOCHS,
        warmup_steps=warmup_steps,
        output_path='exp_finetune',
        show_progress_bar=True,
        # NOTE(review): the evaluator is disabled; confirm whether
        # evaluation_steps and callback have any effect without one.
        #evaluator=evaluator, 
        evaluation_steps=50,
        callback=mlflow_callback 
    )

Epoch:   0%|                                                                                    | 0/10 [00:00<?, ?it/s]
Iteration:   0%|                                                                                | 0/10 [00:00<?, ?it/s][A
Iteration:  10%|███████▏                                                                | 1/10 [00:00<00:07,  1.22it/s][A
Iteration:  20%|██████████████▍                                                         | 2/10 [00:01<00:06,  1.33it/s][A
Iteration:  30%|█████████████████████▌                                                  | 3/10 [00:02<00:05,  1.29it/s][A
Iteration:  40%|████████████████████████████▊                                           | 4/10 [00:02<00:04,  1.38it/s][A
Iteration:  50%|████████████████████████████████████                                    | 5/10 [00:03<00:03,  1.41it/s][A
Iteration:  60%|███████████████████████████████████████████▏                            | 6/10 [00:04<00:02,  1.35it/s][A
Iteration:  70%|███

In [60]:
### llamaindex

In [1]:
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from llama_index.embeddings import HuggingFaceEmbedding# OpenAIEmbedding
import os

# Directory that model.fit(output_path='exp_finetune') wrote the fine-tuned
# embedding model to. os.path.join picks the right separator for the host OS;
# a hard-coded backslash only resolves correctly on Windows.
MODEL_PATH = os.path.join(os.getcwd(), 'exp_finetune')
embed_model = HuggingFaceEmbedding(MODEL_PATH)

print(embed_model)

  from .autonotebook import tqdm as notebook_tqdm


model_name='C:\\Users\\tempdelta\\Desktop\\temp_l\\exp_finetune' embed_batch_size=10 callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x0000015CB0C00410> tokenizer_name='C:\\Users\\tempdelta\\Desktop\\temp_l\\exp_finetune' max_length=512 pooling=<Pooling.CLS: 'cls'> normalize=True query_instruction=None text_instruction=None cache_folder=None


In [2]:
# https://github.com/run-llama/llama_index/issues/10051
# Passed to the retrievers below as similarity_top_k.
top_k = 5
# llm=None disables the LLM for this context (the recorded output reports that a
# MockLLM is used instead); only the fine-tuned embedding model is supplied.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)

# NOTE(review): math_nodes is not used anywhere in this cell and needs math_data
# from the dataloader section to exist in the current kernel; confirm it is still wanted.
math_nodes = [TextNode(id_=example.texts[0], text=example.texts[0]) for example in math_data] 

# One node per entry of `corpus` (taken from the validation split), keyed by its corpus id.
nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()] 
index = VectorStoreIndex(
    nodes, 
    service_context=service_context, 
    show_progress=True
)

retriever = index.as_retriever(similarity_top_k=top_k)

LLM is explicitly disabled. Using MockLLM.


NameError: name 'math_data' is not defined

In [None]:
# Write the index to ./storage; a later cell reloads it from the same directory.
index.storage_context.persist(persist_dir="./storage")

In [18]:
import nest_asyncio

nest_asyncio.apply()

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Pretrained 'gpt2' tokenizer and language model; both are used as module-level
# globals by the generation, training and OurLLM cells below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
llm_model = GPT2LMHeadModel.from_pretrained('gpt2') # 548 mb

In [66]:

# Quick generation check: encode the prompt as a PyTorch tensor, generate with
# the default settings, and print the decoded first sequence.
input_ids = tokenizer.encode("Do you love me", add_special_tokens=True, return_tensors='pt')
output = llm_model.generate(input_ids)
print(tokenizer.decode(output[0]))

Do you love me?"

"I love you," she said. "I love you."


In [9]:
llm_model.train()

# Materialise the parameter list once instead of rebuilding it on every iteration.
llm_params = list(llm_model.parameters())
# Parameters with index > len - 5 (i.e. the final four tensors) stay trainable;
# everything before them is frozen.
first_trainable = len(llm_params) - 4

for i, params in enumerate(llm_params):
    print(params.shape)
    params.requires_grad = i >= first_trainable
    print(params.requires_grad)

torch.Size([50257, 768])
False
torch.Size([1024, 768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768, 2304])
False
torch.Size([2304])
False
torch.Size([768, 768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768, 3072])
False
torch.Size([3072])
False
torch.Size([3072, 768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768, 2304])
False
torch.Size([2304])
False
torch.Size([768, 768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768, 3072])
False
torch.Size([3072])
False
torch.Size([3072, 768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768, 2304])
False
torch.Size([2304])
False
torch.Size([768, 768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768])
False
torch.Size([768, 3072])
False
torch.Size([3072])
False
torch.Size([3072, 768])
False
torch.Size([768])
False


In [67]:
# Use the EOS token as the padding token so that padding=True below can pad.
tokenizer.pad_token = tokenizer.eos_token
# Tokenise all sentences in one call and return PyTorch tensors.
encoded_data = tokenizer.batch_encode_plus(math_sentences, add_special_tokens=True, return_tensors='pt', padding=True)
# NOTE(review): "BATH_SIZE" looks like a typo for BATCH_SIZE; the name is kept
# because the batching line below refers to it.
BATH_SIZE = 10

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items."""
    offsets = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in offsets)
        
# Materialise the batches: chunks() returns a one-shot generator, which would be
# exhausted after the first pass and leave every later epoch with no data.
batch_data = list(chunks(encoded_data["input_ids"], BATH_SIZE))

In [68]:
from transformers import TrainingArguments, Trainer
import torch
# Directory that receives the fine-tuned GPT-2 weights and tokenizer after every epoch.
output_dir = "./finetuned_llm"
EPOCHS=5

#encoded_data = tokenizer.encode(math_sentences, add_special_tokens=True, return_tensors='pt')

# Keep the model config in sync with the tokenizer (pad token == EOS token).
llm_model.config.pad_token_id = tokenizer.eos_token_id
llm_model.config.eos_token_id = tokenizer.eos_token_id
llm_model.config.vocab_size = llm_model.config.vocab_size + len(tokenizer.get_added_vocab())
llm_model.resize_token_embeddings(len(tokenizer))
optimizer = torch.optim.AdamW(llm_model.parameters(), lr=1e-5)

# batch_data may be a one-shot generator. Snapshot it once so that every epoch
# iterates over the same batches instead of only the first epoch seeing data.
train_batches = list(batch_data)
if not train_batches:
    # Happens when batch_data was a generator that has already been consumed.
    raise ValueError("batch_data is empty; re-run the cell that builds it before training")

for _ in range(EPOCHS):
    for batch in train_batches:
        # NOTE(review): labels include the padded positions (pad == EOS), so
        # padding contributes to the loss; confirm whether that is intended.
        outputs = llm_model(batch, labels=batch)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        # .item() logs the plain number instead of the tensor repr.
        print("batch loss ", loss.item())
        optimizer.step()

    # Save the fine-tuned model every epoch, then report the last batch loss.
    llm_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("MODEL saved loss ", loss.item())

batch loss  tensor(9.2864, grad_fn=<NllLossBackward0>)
batch loss  tensor(9.3437, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.9947, grad_fn=<NllLossBackward0>)
batch loss  tensor(9.3165, grad_fn=<NllLossBackward0>)
batch loss  tensor(9.1783, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.9976, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.9499, grad_fn=<NllLossBackward0>)
batch loss  tensor(9.0616, grad_fn=<NllLossBackward0>)
batch loss  tensor(9.1298, grad_fn=<NllLossBackward0>)
batch loss  tensor(9.0342, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.9833, grad_fn=<NllLossBackward0>)
batch loss  tensor(9.0470, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.8640, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.8077, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.8197, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.8974, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.5935, grad_fn=<NllLossBackward0>)
batch loss  tensor(8.8767, grad_fn=<NllLossBackward0>)
batch loss

KeyboardInterrupt: 

In [73]:
# Generation check placed after the training loop: encode the prompt, generate
# with the default settings, and print the decoded first sequence.
input_ids = tokenizer.encode("Do you love me ?", add_special_tokens=True, return_tensors='pt')
output = llm_model.generate(input_ids)
print(tokenizer.decode(output[0]))

Do you love me?
I love you.


I love you.
I love


In [90]:
from typing import Optional, List, Mapping, Any

from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback

class OurLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` backed by the notebook's GPT-2 model.

    Relies on the module-level ``tokenizer`` and ``llm_model`` objects created
    in the GPT-2 cells of this notebook.
    """

    # Values reported to LlamaIndex through `metadata`.
    # NOTE(review): confirm that context_window does not exceed the position
    # limit of the underlying GPT-2 model.
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "custom"
    # Fixed text that `stream_complete` emits character by character.
    dummy_response: str = "My response"

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a completion for ``prompt`` with the GPT-2 model.

        Returns a ``CompletionResponse`` whose ``text`` holds only the newly
        generated tokens (the prompt is not echoed back).
        """
        input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors='pt')
        # Generate with the GPT-2 model (`llm_model`); the global `model` is the
        # sentence-embedding model. Cap the output at the advertised num_output
        # rather than the library's short default total length.
        output = llm_model.generate(
            input_ids,
            max_new_tokens=self.num_output,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Drop the prompt tokens so only the continuation is returned.
        completion_ids = output[0][input_ids.shape[1]:]
        text = tokenizer.decode(completion_ids, skip_special_tokens=True)
        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Stream ``dummy_response`` one character at a time.

        NOTE(review): this ignores ``prompt`` and never calls the model;
        confirm whether real streaming is needed.
        """
        response = ""
        for token in self.dummy_response:
            response += token
            yield CompletionResponse(text=response, delta=token)


# define our LLM
llm = OurLLM()

In [95]:
from llama_index import load_index_from_storage
from llama_index.storage.storage_context import StorageContext
from llama_index import ServiceContext


# Combine the custom GPT-2 LLM with the fine-tuned embedding model.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

# Reload the index that was persisted to ./storage. The former second assignment
# through `load_index_from_disk` was removed: that name is never imported or
# defined, so it raised NameError and would have replaced the index loaded here.
loaded_index = load_index_from_storage(
    storage_context=StorageContext.from_defaults(persist_dir="./storage"),
    service_context=service_context,
)

In [96]:
# Retriever over the index reloaded from ./storage; top_k is passed as similarity_top_k.
retriever = loaded_index.as_retriever(similarity_top_k=top_k)

In [98]:
nodes

[NodeWithScore(node=TextNode(id_='79f581ef-80dd-4d04-9e8e-ec128f95e6b1', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='190905e6b7e3d6391293c392785096ddd4a66fc5553b42668936fbd624a55c83', text='Beingclassified  as  a  transportation  provider  would  result  in  a  VAT  (20%)  on  Gross  Bookings  or  on  the  service  fee  that  we  charge  Drivers,  both  retroactively  andprospectively.HMRC is considering a number of factors including our contractual Driver, Rider and intercompany arrangements, and HMRC is also expected toconsider the U.K. Supreme Court’s February 19, 2021 ruling on Drivers’ worker classification, in determining whether we should be classified as a provider oftransportation services.HMRC may update its assessment, which we would then review and discuss with HMRC.If we do not reach a satisfactory resolution afterexhausting HMRC’s review and appeals process, we would still be able to argue our case a