In [5]:
import json
from dataclasses import asdict, dataclass, field
from datasets import Dataset
from pathlib import Path

# 1. Load data

In [10]:
# Locate the training data relative to the notebook's working directory:
# two levels up, then dataset/training_data.json.
cwd = Path.cwd()
path_json = cwd.parent.parent.joinpath("dataset", "training_data.json")
path_json

WindowsPath('c:/Users/su.osmond/Documents/LLM/llm_sft/dataset/training_data.json')

In [11]:
# Load the raw training samples from the JSON file.
# The encoding is passed explicitly: JSON text is UTF-8, but without it open()
# falls back to the platform locale encoding (e.g. cp1252 on Windows), which
# can fail or silently mis-decode non-ASCII characters.
with open(path_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# Quick sanity check: container type, number of samples, and the first sample
type(data), len(data), data[0]

(list,
 125,
 {'about_me': "I'm a 26 year old investor.\nI want to explore opportunities in emerging markets.",
  'context': 'Eastern European economies attract investments with improving infrastructure and skilled workforce.',
  'question': '\nHow do you view the growth potential of Eastern European markets?',
  'response': '\nEastern European markets have great potential for growth. The improving infrastructure and skilled workforce make them attractive to investors. The region is also home to some of the fastest-growing economies in the world, such as Poland and Romania. Investing in stocks and cryptocurrencies in these markets can be a great way to diversify your portfolio and benefit from the potential for high returns. Additionally, investing in emerging markets can help to reduce risk and provide a hedge against global economic downturns.'})

In [13]:
# Deserialize data
@dataclass(frozen=True)
class DataSample:
    """
    An immutable data sample for a question answering model.

    Only ``user_context`` is required; every other field defaults to an
    empty string. ``user_context`` is declared with ``repr=False``, so it is
    left out of the string shown when an instance is printed.

    Attributes:
        user_context (str): The user's context for the question.
        news_context (str): The news context for the question.
        chat_history (str): The chat history for the question.
        question (str): The question to be answered.
        answer (str): The answer to the question.
    """

    user_context: str = field(repr=False)
    news_context: str = ""
    chat_history: str = ""
    question: str = ""
    answer: str = ""

def _to_data_sample(record: dict) -> DataSample:
    """Map one raw JSON record onto the DataSample field names."""

    return DataSample(
        user_context=record["about_me"],
        news_context=record["context"],
        # "chat_history" is optional in the raw records
        chat_history=record.get("chat_history", ""),
        question=record["question"],
        answer=record["response"],
    )


raw_data = list(map(_to_data_sample, data))

print(raw_data[0])

DataSample(news_context='Eastern European economies attract investments with improving infrastructure and skilled workforce.', chat_history='', question='\nHow do you view the growth potential of Eastern European markets?', answer='\nEastern European markets have great potential for growth. The improving infrastructure and skilled workforce make them attractive to investors. The region is also home to some of the fastest-growing economies in the world, such as Poland and Romania. Investing in stocks and cryptocurrencies in these markets can be a great way to diversify your portfolio and benefit from the potential for high returns. Additionally, investing in emerging markets can help to reduce risk and provide a hedge against global economic downturns.')


In [17]:
## Get llm template
from typing import Dict, List, Union

@dataclass
class PromptTemplate:
    """
    A class that manages prompt templates.

    A prompt is assembled from five sections (system, context, chat history,
    question, answer). Each section has its own ``str.format`` template;
    ``sep`` is inserted before every section except the first, and ``eos``
    is appended at the very end.

    Attributes:
        name (str): Key under which the template is identified.
        system_template (str): Template with a ``{system_message}`` placeholder.
        context_template (str): Template with ``{user_context}`` and
            ``{news_context}`` placeholders.
        chat_history_template (str): Template with a ``{chat_history}`` placeholder.
        question_template (str): Template with a ``{question}`` placeholder.
        answer_template (str): Template with an ``{answer}`` placeholder.
        system_message (str): Text substituted into ``system_template``.
        sep (str): Separator placed before each section after the first.
        eos (str): End-of-sequence marker appended to the prompt.
    """

    name: str
    system_template: str = "{system_message}"
    context_template: str = "{user_context}\n{news_context}"
    chat_history_template: str = "{chat_history}"
    question_template: str = "{question}"
    answer_template: str = "{answer}"
    system_message: str = ""
    sep: str = "\n"
    eos: str = ""

    @property
    def train_raw_template(self) -> str:
        """
        Returns the training prompt template format.

        The system section is already rendered in the returned string; the
        remaining placeholders (``{user_context}``, ``{news_context}``,
        ``{chat_history}``, ``{question}``, ``{answer}``) are left for a later
        ``str.format`` call.
        """

        system = self.system_template.format(system_message=self.system_message)
        # The rendered system section is literal text from here on. Escape its
        # braces so the later str.format pass does not mistake them for
        # placeholders (e.g. a system message that contains a JSON example).
        system = system.replace("{", "{{").replace("}", "}}")
        context = f"{self.sep}{self.context_template}"
        chat_history = f"{self.sep}{self.chat_history_template}"
        question = f"{self.sep}{self.question_template}"
        answer = f"{self.sep}{self.answer_template}"

        return f"{system}{context}{chat_history}{question}{answer}{self.eos}"


    def format_train(self, sample: Dict[str, str]) -> Dict[str, Union[str, Dict]]:
        """
        Formats the data sample to a training sample.

        Args:
            sample (Dict[str, str]): Must contain ``user_context``,
                ``news_context``, ``question`` and ``answer``;
                ``chat_history`` is optional and defaults to an empty string.

        Returns:
            Dict[str, Union[str, Dict]]: ``{"prompt": <rendered prompt>,
            "payload": <the sample passed in>}``.

        Raises:
            KeyError: If a required key is missing from ``sample``.
        """

        prompt = self.train_raw_template.format(
            user_context=sample["user_context"],
            news_context=sample["news_context"],
            chat_history=sample.get("chat_history", ""),
            question=sample["question"],
            answer=sample["answer"],
        )

        return {"prompt": prompt, "payload": sample}

# Global registry: template name -> PromptTemplate
templates: Dict[str, PromptTemplate] = dict()

def register_llm_template(template: PromptTemplate):
    """
    Store ``template`` in the global registry under ``template.name``.

    An entry already registered under the same name is replaced.
    """

    templates.update({template.name: template})


def get_llm_template(name: str) -> PromptTemplate:
    """
    Returns the template assigned to the given name.

    Args:
        name (str): Name the template was registered under.

    Returns:
        PromptTemplate: The registered template.

    Raises:
        KeyError: If no template is registered under ``name``; the message
            lists the names that are currently registered.
    """

    try:
        return templates[name]
    except KeyError:
        # Still a KeyError for callers, but with a message that says what is available
        available = ", ".join(sorted(map(str, templates))) or "<none>"
        raise KeyError(
            f"Unknown LLM template {name!r}; registered templates: {available}"
        ) from None

#get_llm_template("falcon")

# Falcon prompt layout: every section is introduced by a >>TAG<< marker
falcon_template = PromptTemplate(
    name="falcon",
    system_message="You are a helpful assistant, with financial expertise.",
    system_template=">>INTRODUCTION<< {system_message}",
    context_template=">>DOMAIN<< {user_context}\n{news_context}",
    chat_history_template=">>SUMMARY<< {chat_history}",
    question_template=">>QUESTION<< {question}",
    answer_template=">>ANSWER<< {answer}",
    sep="\n",
    eos="<|endoftext|>",
)
register_llm_template(falcon_template)

# Display the registered template to confirm the registration worked
get_llm_template("falcon")

PromptTemplate(name='falcon', system_template='>>INTRODUCTION<< {system_message}', context_template='>>DOMAIN<< {user_context}\n{news_context}', chat_history_template='>>SUMMARY<< {chat_history}', question_template='>>QUESTION<< {question}', answer_template='>>ANSWER<< {answer}', system_message='You are a helpful assistant, with financial expertise.', sep='\n', eos='<|endoftext|>')

In [18]:
# Inspect the bound format_train method of the registered "falcon" template
get_llm_template("falcon").format_train

<bound method PromptTemplate.format_train of PromptTemplate(name='falcon', system_template='>>INTRODUCTION<< {system_message}', context_template='>>DOMAIN<< {user_context}\n{news_context}', chat_history_template='>>SUMMARY<< {chat_history}', question_template='>>QUESTION<< {question}', answer_template='>>ANSWER<< {answer}', system_message='You are a helpful assistant, with financial expertise.', sep='\n', eos='<|endoftext|>')>

In [16]:
# Show the current contents of the global template registry
templates

{'falcon': PromptTemplate(name='falcon', system_template='>>INTRODUCTION<< {system_message}', context_template='>>DOMAIN<< {user_context}\n{news_context}', chat_history_template='>>SUMMARY<< {chat_history}', question_template='>>QUESTION<< {question}', answer_template='>>ANSWER<< {answer}', system_message='You are a helpful assistant, with financial expertise.', sep='\n', eos='<|endoftext|>')}

In [19]:
from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs

def clean(samples: Dict[str, str]) -> Dict[str, str]:
    """
    Normalizes the text of every field in a sample.

    Each value is passed through ``clean_extra_whitespace`` and then through
    ``group_broken_paragraphs``. The mapping is updated in place and the same
    object is returned.

    Args:
        samples (Dict[str, str]): Field name -> text to clean.

    Returns:
        Dict[str, str]: ``samples`` itself, holding the cleaned text.
    """

    for field_name, text in samples.items():
        # Only existing keys are reassigned, so the mapping's size never changes mid-iteration
        samples[field_name] = group_broken_paragraphs(clean_extra_whitespace(text))

    return samples


In [21]:
# Build a HuggingFace dataset from the samples, clean every field,
# then render each sample into a training prompt.

data_as_dict = list(map(asdict, raw_data))
template_mapping_func = get_llm_template("falcon").format_train

dataset = Dataset.from_list(data_as_dict).map(clean)
# Drop the original columns so that only what the template function returns remains
dataset = dataset.map(template_mapping_func, remove_columns=dataset.column_names)
dataset[0]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map: 100%|██████████| 125/125 [00:00<00:00, 3983.10 examples/s]
Map: 100%|██████████| 125/125 [00:00<00:00, 6934.57 examples/s]


{'user_context': "I'm a 26 year old investor. I want to explore opportunities in emerging markets.",
 'news_context': 'Eastern European economies attract investments with improving infrastructure and skilled workforce.',
 'chat_history': '',
 'question': 'How do you view the growth potential of Eastern European markets?',
 'answer': 'Eastern European markets have great potential for growth. The improving infrastructure and skilled workforce make them attractive to investors. The region is also home to some of the fastest-growing economies in the world, such as Poland and Romania. Investing in stocks and cryptocurrencies in these markets can be a great way to diversify your portfolio and benefit from the potential for high returns. Additionally, investing in emerging markets can help to reduce risk and provide a hedge against global economic downturns.',
 'prompt': ">>INTRODUCTION<< You are a helpful assistant, with financial expertise.\n>>DOMAIN<< I'm a 26 year old investor. I want to 