In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm

# Load the tokenizer and the encoder from the HuggingFace Hub.
# The checkpoint id is kept in a single constant so the tokenizer and the model
# can never be loaded from two different checkpoints by accident.
MODEL_NAME = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


# Mean pooling - average the token embeddings, weighted by the attention mask
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the attention mask.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor of token embeddings.
        attention_mask: tensor that, after a trailing axis is added, can be expanded
            to the size of ``last_hidden_state``; positions with value 0 contribute
            nothing to the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total over
        dimension 1. The divisor is clamped to at least 1e-9, so a fully masked
        row yields zeros instead of a division by zero.
    """
    hidden_states = model_output.last_hidden_state
    # Give the mask a trailing axis and stretch it to the embedding size so it
    # can weight every component of every token vector.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / weight_total


#Encode text
def encode(texts):
    """Embed ``texts`` with the notebook-level ``tokenizer`` and ``model``.

    Args:
        texts: forwarded unchanged to ``tokenizer`` (the notebook passes a
            single string per call).

    Returns:
        Mean-pooled embeddings, L2-normalised along dimension 1.
    """
    # Tokenise with padding and truncation enabled, returning PyTorch tensors.
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Forward pass only - gradient tracking is switched off.
    with torch.no_grad():
        output = model(**batch, return_dict=True)

    # Collapse the per-token vectors into one vector per input.
    pooled = mean_pooling(output, batch['attention_mask'])

    # Scale each vector to unit L2 norm.
    return F.normalize(pooled, p=2, dim=1)



In [3]:
import pandas as pd
df_olympics = pd.read_csv('docs/olympics_sections_text.csv')

In [4]:
docs = df_olympics['content'].tolist()

In [8]:
# Embed every document on its own (one encode() call per document, with a
# progress bar) and keep each result as a plain nested list.
doc_emb_list = [encode(d).tolist() for d in tqdm(docs)]

  0%|          | 0/3964 [00:00<?, ?it/s]

In [13]:
# Stack the per-document embeddings into one tensor. Each entry of doc_emb_list
# comes from a single-text encode() call, so it carries a size-1 axis at
# dimension 1. Only that axis is squeezed: a bare .squeeze() would also drop the
# document axis when there is exactly one document, leaving a 1-D tensor that
# breaks the later doc_emb.transpose(0, 1).
doc_emb = torch.tensor(doc_emb_list).squeeze(1)

In [14]:
doc_emb.shape

torch.Size([3964, 384])

In [15]:
torch.save(doc_emb, 'olympics_embeddings.pt')

In [23]:
import tiktoken

# Cap on the accumulated (section tokens + separator tokens) that
# construct_prompt allows before it stops adding context sections.
MAX_SECTION_LEN = 500
# Text placed between / in front of context sections when they are concatenated.
SEPARATOR = "\n* "
ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002

# Tokeniser used to measure how many tokens the separator itself costs.
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# Last expression of the cell: displayed as the cell output.
f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [35]:
# Question we want to answer from the document collection
query = "Who won the women's long jump?"
# Encode the query only; the document embeddings were already computed into doc_emb
query_emb = encode(query)

In [36]:
top_n = 5

# Dot product between the query embedding and every document embedding
scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()

# Pair each document with its score and rank the pairs, best score first
doc_score_pairs = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

# Show the top_n passages with their scores and collect them as context
contexts = []
for doc, score in doc_score_pairs[:top_n]:
    print(round(score, 3), doc)
    contexts.append(doc)

# Single context string handed to the prompt
joint_context = SEPARATOR.join(contexts)

0.66 The women's long jump event at the 2020 Summer Olympics took place on 1 and 3 August 2021 at the Japan National Stadium. 30 athletes from 23 nations competed. Germany's 2019 world champion Malaika Mihambo moved up from third  to first with her final round jump of 7.00 metres, to win the gold medal. 2012 Olympic champion Brittney Reese of the USA won the silver and Nigeria's Ese Brume the bronze.
0.615 The women's triple jump event at the 2020 Summer Olympics took place between 30 July and 1 August 2021 at the Japan National Stadium.The event was won by Yulimar Rojas of Venezuela: Her winning jump of 15.67 meters also broke the 26-year-old world record.
0.587 The men's long jump event at the 2020 Summer Olympics took place between 31 July and 2 August 2021 at the Japan National Stadium. Approximately 35 athletes were expected to compete; the exact number was dependent on how many nations use universality places to enter athletes in addition to the 32 qualifying through time or rank

In [37]:
# Instruction preamble placed in front of the retrieved context.
header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

# Prompt layout: instructions, retrieved context, the question, then an open answer slot.
prompt = f"{header}{joint_context}\n\n Q: {query}\n A:"

In [28]:
import openai
import toml

# The API key is read from the local Streamlit secrets file rather than being
# written into the notebook.
secrets = toml.load('.streamlit/secrets.toml')

openai_api_key = secrets['openai_api_key']
openai.api_key = openai_api_key

# Model used for every completion request in this notebook.
COMPLETIONS_MODEL = "text-davinci-003"

# Keyword arguments unpacked into each openai.Completion.create(...) call.
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

In [38]:
# Send the assembled prompt to the completions endpoint with the shared parameters.
response = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)

# Display the first choice's text with surrounding spaces and newlines removed.
response["choices"][0]["text"].strip(" \n")

"Malaika Mihambo won the women's long jump."

In [39]:
response

<OpenAIObject text_completion id=cmpl-6g3luFyMUQzvZcpR5gKoWxoMIZoJM at 0x7f1b7c722680> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " Malaika Mihambo won the women's long jump."
    }
  ],
  "created": 1675482294,
  "id": "cmpl-6g3luFyMUQzvZcpR5gKoWxoMIZoJM",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 13,
    "prompt_tokens": 449,
    "total_tokens": 462
  }
}

In [25]:
joint_context

'The men\'s high jump event at the 2020 Summer Olympics took place between 30 July and 1 August 2021 at the Olympic Stadium. 33 athletes from 24 nations competed; the total possible number depended on how many nations would use universality places to enter athletes in addition to the 32 qualifying through mark or ranking (no universality places were used in 2021). Italian athlete Gianmarco Tamberi along with Qatari athlete Mutaz Essa Barshim emerged as joint winners of the event following a tie between both of them as they cleared 2.37m. Both Tamberi and Barshim agreed to share the gold medal in a rare instance where the athletes of different nations had agreed to share the same medal in the history of Olympics. Barshim in particular was heard to ask a competition official "Can we have two golds?" in response to being offered a \'jump off\'. Maksim Nedasekau of Belarus took bronze. The medals were the first ever in the men\'s high jump for Italy and Belarus, the first gold in the men\'

## Use OpenAI ada embeddings

In [41]:
EMBEDDING_MODEL = "text-embedding-ada-002"


def get_embedding(text: str, model: str = EMBEDDING_MODEL):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: input sent as the ``input`` field of the request.
        model: embedding model name; defaults to ``EMBEDDING_MODEL``.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"`` list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [43]:
query

"Who won the women's long jump?"

In [45]:
query_ada_emb = get_embedding(query)

In [47]:
import numpy as np
query_ada_emb_np = np.array(query_ada_emb)

In [48]:
query_ada_emb_np.shape

(1536,)

In [49]:
from bs4 import BeautifulSoup

In [57]:
# with open('docs/govtext/md/summarisation.md', 'r') as f:
# Read the whole document into memory. The encoding is given explicitly so the
# text is decoded the same way on every platform instead of depending on the
# locale's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('docs/govtext/raw_text/summarisation.txt', 'r', encoding='utf-8') as f:
    data = f.read()

In [58]:
soup = BeautifulSoup(data, 'html.parser')

In [59]:
import re

# Extract the text content from the parsed document.
text = soup.get_text()
# Collapse every run of consecutive newlines into a single newline
# (a lone newline is left as it is).
cleaned_text = re.sub(r'\n{2,}', '\n', text)

In [60]:
cleaned_text

'---\nsidebar_position: 7\n---\n# Summarisation\n## CONCEPT\nGovText offers two summarisation options:\n1. **Normal**: using abstractive summarisation, the main points of a document are consolidated and paraphrased into a short paragraph which reads like a human written one  \n2. **Quick**: using extractive summarisation, the most important sentences of a document are "lifted" and highlighted\nDue to the complexity involved in performing an abstractive summarisation (normal summary), it takes up more computing resources and time. Therefore, users need to enter the maximum length (number of words) of the summary they want, and only one summary which is shorter or equal to this length will be returned.  \nExtractive summarisation (quick summary) is very much faster than abstractive summarisation. When this option is activated, summaries of the following lengths will be returned for each document: \n1. Short (around 15% of original document length) \n2. Medium (around 30% of original docu

In [68]:
summarization_query = "Why are the results not close to my maximum length?"

# Same prompt layout as before, but the whole cleaned document is the context.
summarization_prompt = f"{header}{cleaned_text}\n\n Q: {summarization_query}\n A:"

response = openai.Completion.create(prompt=summarization_prompt, **COMPLETIONS_API_PARAMS)

# Display the first choice's text with surrounding spaces and newlines removed.
response["choices"][0]["text"].strip(" \n")

'The model scores the words in the summary and decides on the optimum length which provides the most coherent one. From our experiments, the model produces the most coherent summaries with lengths around 200 words.'

In [None]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build a question-answering prompt from the document sections ranked for ``question``.

    Sections are taken in the order returned by
    ``order_document_sections_by_query_similarity`` (presumably most similar
    first - verify against that helper). Each section adds its ``tokens`` value
    plus ``separator_len`` to a running total, and selection stops at the first
    section that pushes the total above ``MAX_SECTION_LEN``.

    Args:
        question: the user question, appended after the context.
        context_embeddings: passed through to the ranking helper.
        df: frame indexed by section index; rows must expose ``tokens`` and ``content``.

    Returns:
        The instruction header, the chosen sections (each prefixed with
        ``SEPARATOR`` and with newlines replaced by spaces), then the question
        and an empty answer slot.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_indexes = []
    tokens_used = 0

    for _, section_index in ranked_sections:
        row = df.loc[section_index]

        # Count the section first, then stop as soon as the budget is exceeded.
        tokens_used += row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        flattened = row.content.replace("\n", " ")
        chosen_sections.append(SEPARATOR + flattened)
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    context_block = "".join(chosen_sections)
    return header + context_block + "\n\n Q: " + question + "\n A:"