In [22]:
from tqdm.notebook import tqdm

def augment_data_with_retrieval(dataset, retriever, separate_context=False):
    """Rewrite each query of ``dataset`` as a context-augmented QA prompt.

    For every ``(query, response)`` pair in ``dataset.qr_pairs`` the retriever
    is asked for nodes matching the query, and the query is formatted through
    the module-level ``qa_prompt_tmpl`` together with the retrieved text.

    Args:
        dataset: object exposing ``qr_pairs``, an iterable of
            ``(query_str, response)`` pairs.
        retriever: object whose ``retrieve(query_str)`` returns nodes that
            provide ``get_content()``.
        separate_context: when True, emit one prompt per retrieved node;
            when False, emit a single prompt whose context is all retrieved
            texts joined by a blank line.

    Returns:
        A list of ``(formatted_prompt, response)`` tuples.
    """
    augmented_pairs = []
    for query_str, response in tqdm(dataset.qr_pairs):
        context_chunks = [
            node.get_content() for node in retriever.retrieve(query_str)
        ]
        if not separate_context:
            # Collapse everything into one context so exactly one prompt is
            # produced per query (even when nothing was retrieved).
            context_chunks = ["\n\n".join(context_chunks)]
        for chunk in context_chunks:
            prompt = qa_prompt_tmpl.format(
                query_str=query_str, context_str=chunk
            )
            augmented_pairs.append((prompt, response))
    return augmented_pairs
    
def scrape_stripe_treasury_marketing_policy(url, timeout=30):
    """Download a web page and return the text of its headings, paragraphs and list items.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would hang the notebook cell.

    Returns:
        The text of every ``h1``/``h2``/``h3``/``p``/``li`` element, one per
        line (each followed by a newline), or ``None`` if the request or the
        parsing failed. Failures are reported with ``print`` rather than
        raised, so callers must check for ``None``.
    """
    try:
        # Fetch HTML content; a timeout surfaces as a RequestException below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from specific elements (headings, paragraphs, list
        # items); join once instead of growing a string inside the loop.
        return "".join(
            tag.get_text(strip=True) + '\n'
            for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'li'])
        )

    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort: any parsing problem is reported, not raised.
        print(f"Error: {e}")
        return None

In [23]:
import os
import openai
import requests
from bs4 import BeautifulSoup
from llama_index import ServiceContext
from llama_index.llms import OpenAI
url = "https://stripe.com/docs/treasury/marketing-treasury"

In [24]:
from llama_index import Document

# Scrape the policy page. The scraper signals failure by returning None
# (after printing the error) instead of raising.
doc_text = scrape_stripe_treasury_marketing_policy(url)
if not doc_text:
    # Fail fast with a clear message: building a Document from None/empty text
    # would otherwise surface later as a confusing downstream error.
    raise RuntimeError(
        f"No text could be scraped from {url}; cannot build the policy document."
    )

# Metadata attached to the single document built from the scraped page.
metadata = {
    "policy_document": "Stripe: Treasury and Issuing product marketing, design, and compliance"
}
docs = [Document(text=doc_text, metadata=metadata)]

In [21]:
from llama_index.callbacks import CallbackManager

# Shared callback manager, constructed with an empty list
# (presumably: no handlers registered — confirm against llama_index docs).
callback_manager = CallbackManager([])

# Service context backed by gpt-3.5-turbo-0613 at temperature 0.3.
gpt_35_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo-0613", temperature=0.3),
    callback_manager=callback_manager,
)
# NOTE: despite its name this context is configured identically to
# gpt_35_context; it does NOT use a GPT-4 model.
gpt_4_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo-0613", temperature=0.3), # stand-in: should ideally be "gpt-4-0613", which the author had no access to
    callback_manager=callback_manager,
)

In [26]:
# get nodes, setup VectorIndex
from llama_index.node_parser import SentenceSplitter
from llama_index import VectorStoreIndex

# Split the scraped document(s) into nodes with a default-configured
# SentenceSplitter, then build a vector index over those nodes.
node_parser = SentenceSplitter()
nodes = node_parser.get_nodes_from_documents(docs)
# NOTE(review): building the index presumably computes embeddings via the
# default embedding model (network/API call) — confirm.
vector_index = VectorStoreIndex(nodes)

In [27]:
# generate dataset
from llama_index.evaluation import (
    DatasetGenerator,
    QueryResponseDataset,
)

In [44]:
# Service context used for dataset generation: gpt-3.5-turbo-0613 at
# temperature 0, sharing the notebook-wide callback manager.
eval_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo-0613", temperature=0), callback_manager=callback_manager
)
# Question generator restricted to the first node only (`nodes[:1]`),
# asking for 20 questions per chunk and showing a progress bar.
dataset_generator = DatasetGenerator(
    nodes[:1],
    service_context=eval_context,
    show_progress=True,
    num_questions_per_chunk=20,
)

  dataset_generator = DatasetGenerator(


In [45]:
# Generate the query/response evaluation dataset (top-level `await`, so this
# must run inside the notebook's event loop).
# NOTE: in the recorded run this call failed with an OpenAI 429 rate-limit
# error, so `eval_dataset` was never assigned and later cells depending on it
# were not executed.
eval_dataset = await dataset_generator.agenerate_dataset_from_nodes(num=1) # rate limit error is a problem here

  0%|                                                     | 0/1 [00:00<?, ?it/s]

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-0613 in organization org-REDACTED on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [46]:
from llama_index import VectorStoreIndex
from llama_index.prompts import PromptTemplate

# Prompt used to wrap a query with retrieved context. It exposes two
# placeholders: {context_str} (retrieved text) and {query_str} (the question).
qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
# `augment_data_with_retrieval` reads this module-level name at call time, so
# this cell must run before that function is called.
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

# Retriever over the vector index, configured with similarity_top_k=1.
vector_retriever = vector_index.as_retriever(similarity_top_k=1)

In [None]:
# Wrap every generated query with its retrieved context (all retrieved text in
# a single prompt, since separate_context=False).
new_qr_pairs = augment_data_with_retrieval(
    eval_dataset, vector_retriever, separate_context=False
)
new_eval_dataset = QueryResponseDataset.from_qr_pairs(new_qr_pairs)

# Ensure the output directory exists before writing, so a fresh checkout
# without a `data/` folder does not fail on the save below.
os.makedirs("data", exist_ok=True)
new_eval_dataset.save_json("data/qa_pairs_ra.json") # saving the dataset
# Reload from disk so later cells work from exactly what was persisted.
new_eval_dataset = QueryResponseDataset.from_json("data/qa_pairs_ra.json") # loading the dataset

In [None]:
from copy import deepcopy
import random


def split_train_val(dataset, train_split=0.7, seed=None):
    """Randomly split a dataset's query/response pairs into train and validation lists.

    Args:
        dataset: object exposing ``qr_pairs``, a list of pairs.
        train_split: fraction of the pairs placed in the training list; the
            cut index is ``int(train_split * len(pairs))``.
        seed: optional seed for a reproducible split. When ``None`` (the
            default) the module-level ``random`` state is used, exactly as
            before, so results differ between runs unless the caller seeded
            ``random`` globally.

    Returns:
        ``(train_lines, val_lines)`` — two lists that together contain every
        pair exactly once. ``dataset.qr_pairs`` itself is not modified.
    """
    # Shuffle a copy so the "train questions" cover most of the context and
    # the caller's dataset keeps its original order.
    shuffled_lines = deepcopy(dataset.qr_pairs)
    if seed is None:
        random.shuffle(shuffled_lines)
    else:
        # A private generator keeps the seeded split independent of (and
        # without disturbing) the global random state.
        random.Random(seed).shuffle(shuffled_lines)

    split_idx = int(train_split * len(shuffled_lines))
    train_lines = shuffled_lines[:split_idx]
    val_lines = shuffled_lines[split_idx:]

    return train_lines, val_lines

In [None]:
# 70/30 random split of the retrieval-augmented pairs.
train_lines, val_lines = split_train_val(new_eval_dataset, train_split=0.7)

# Wrap each list of pairs back into a QueryResponseDataset.
train_dataset = QueryResponseDataset.from_qr_pairs(train_lines)
val_dataset = QueryResponseDataset.from_qr_pairs(val_lines)

# Persist both splits so they can be reloaded without regenerating them.
train_dataset.save_json("data/qa_pairs_train.json")
val_dataset.save_json("data/qa_pairs_val.json")

In [None]:
# Reload the persisted splits from disk (rebinds train_dataset / val_dataset).
train_dataset = QueryResponseDataset.from_json("data/qa_pairs_train.json")
val_dataset = QueryResponseDataset.from_json("data/qa_pairs_val.json")

In [None]:
def save_openai_data(dataset, out_path):
    """Write a dataset as JSON Lines chat examples, one conversation per line.

    Each ``(query, response)`` pair in ``dataset.qr_pairs`` becomes one line of
    the form ``{"messages": [system, user, assistant]}``, where the system
    message is a fixed compliance-review instruction, the user message is the
    query and the assistant message is the response.

    Args:
        dataset: object exposing ``qr_pairs``, an iterable of
            ``(query, response)`` pairs.
        out_path: path of the file to create or overwrite.
    """
    # Imported locally so the function works even when no earlier cell has
    # imported json.
    import json

    # TODO: try with different system prompts
    system_prompt = {
        "role": "system",
        "content": (
            "As the Marketing Compliance Specialist, review the following text and identify any phrases that do not comply with the provided compliance guidelines."
        ),
    }
    # The context manager guarantees the file is flushed and closed, even if
    # serialising one of the pairs raises.
    with open(out_path, "w") as out_fp:
        for query, response in dataset.qr_pairs:
            out_dict = {
                "messages": [
                    system_prompt,
                    {"role": "user", "content": query},
                    {"role": "assistant", "content": response},
                ],
            }
            out_fp.write(json.dumps(out_dict) + "\n")

### Fine tuning RAG

In [None]:
from llama_index.finetuning import OpenAIFinetuneEngine
# Write the training split in OpenAI chat JSONL form first: no other visible
# cell produces this file, so without this call the fine-tune job would be
# pointed at a path that was never written.
openai_data_path = "data/qa_pairs_openai.jsonl"
save_openai_data(train_dataset, openai_data_path)

# Fine-tuning engine for the "gpt-3.5-turbo" base model, reading the training
# examples from the JSONL file written above.
finetune_engine = OpenAIFinetuneEngine(
    "gpt-3.5-turbo",
    openai_data_path,
)

In [None]:
# Start the fine-tuning run, then display the current job as the cell output.
# NOTE(review): the job presumably runs remotely and may still be in progress
# when this cell returns — confirm before requesting the fine-tuned model.
finetune_engine.finetune()
finetune_engine.get_current_job()

In [None]:
# Obtain the fine-tuned model from the engine; it is used as the LLM of the
# service context built in a later cell.
# NOTE(review): presumably requires the fine-tune job to have completed — confirm.
ft_model = finetune_engine.get_finetuned_model()

In [None]:
# Use fine-tuned model in RAG system
from llama_index import ServiceContext

# Service context that routes LLM calls to the fine-tuned model. The system
# prompt is the same text that save_openai_data writes into every training
# example, so inference matches the fine-tuning setup.
ft_context = ServiceContext.from_defaults(
    llm=ft_model,
    callback_manager=callback_manager,
    system_prompt=(
        "As the Marketing Compliance Specialist, review the following text and identify any phrases that do not comply with the provided compliance guidelines."    ),
)

# fine-tuned RAG system: query engine over the same vector index, retrieving
# with similarity_top_k=1 and answering with the fine-tuned model.
ft_query_engine = vector_index.as_query_engine(
    similarity_top_k=1, service_context=ft_context
)

In [None]:
# Ask the fine-tuned RAG engine whether a sample marketing phrase complies
# with the policy, and print the answer as plain text.
response = ft_query_engine.query(
    "Does this 'Discovery of advanced financial solutions for effective planning' comply with the Stripe policy?"
)
print(str(response))