In [1]:
from bs4 import BeautifulSoup
import requests
import json

from sentence_transformers import SentenceTransformer
import faiss

from langchain import PromptTemplate, LLMChain
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import GPT4All

### Step-1: Download the HTML Content and find all headers. The headers will be used for the prompts. 
### Step-2: Convert the Headers and the appropriates HTML content to receive the following structure

In [2]:
urls = ['https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2023',]
result = []

In [3]:
# Send HTTP request to the specified URL and save the response from server in a response object called r
for url in urls:
  
    response = requests.get(url)
    
    # Create a BeautifulSoup object and specify the parser library at the same time
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the h and p tags on the page
    headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
    
    current_prompt = ""
    current_response = "" 
    
    for tag in headers:
        if tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            if current_prompt and current_response:  # ensuring both prompt and response are not empty
                result.append({"prompt": current_prompt, "response": current_response.strip(), "source": url})
            current_prompt = tag.text
            current_response = ""
        elif tag.name == 'p':
            current_response += ' ' + tag.text

    # Don't forget the last one
    if current_prompt and current_response:
        result.append({"prompt": current_prompt, "response": current_response.strip(), "source": url})

# Convert the list to JSON
json_result = json.dumps(result, indent=4)

In [6]:
#print(json_result)

### Step-3: Load SentenceTransformer and create appropriate embeddings

In [86]:
# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Convert all the entries into embeddings, based on the prompt.

entries = [{'prompt': entry['prompt'], 'response': entry['response']} for entry in result]

# Generate the embeddings for the prompts
prompt_embeddings = model.encode([entry['prompt'] for entry in entries])

[2023-05-22 13:12:00,926] {SentenceTransformer.py:66} INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
[2023-05-22 13:12:01,084] {SentenceTransformer.py:105} INFO - Use pytorch device: cpu


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

### Step-4: Build the FAISS index based on the embeddings

In [87]:
# Step 4: Build the FAISS index, just like before.

# Dimension of the embeddings
dimension = prompt_embeddings.shape[1]

# Configure the FAISS index
index = faiss.IndexFlatL2(dimension)

# Add vectors to the index
index.add(prompt_embeddings)

### Step-5: Create a prompt so the user can enter his question. Convert the question into an embedding and perform a search on the index.

In [88]:
def find_best_matching_prompt(question, index):
    # Convert the question into an embedding
    question_embedding = model.encode([user_question])

    # Perform a search
    D, I = index.search(question_embedding, 1)

    # Get the best matching entry
    best_match_index = I[0][0]
    best_matching_entry = entries[best_match_index]
    return best_matching_entry

### Step-6: Use LLMCHain to create your response based on your question and the appropriate context

In [90]:
gpt4all_path = './models/gpt4all-converted.bin' 

# Calback manager for handling the calls with  the model
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = GPT4All(model=gpt4all_path, callback_manager=callback_manager, verbose=True)

In [None]:
# User question
#user_question = "Which mechanisms does Dataiku DSS provides for python code?"
user_question = input()

best_matching_entry = find_best_matching_prompt(user_question, index)

In [91]:
template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
Respond in English.

QUESTION: {question}
=========
{context}
=========
FINAL ANSWER IN ENGLISH:"""

# Creating the context
context = best_matching_entry['response']

PROMPT = PromptTemplate(template=template, input_variables=["context", "question"]).partial(context=context)
llm_chain = LLMChain(prompt=PROMPT, llm=llm)
# Print the result
print(llm_chain.run(user_question))

 Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
Respond in English.

QUESTION: What was the location of the Eurovision Song Contest in 2023?
The Eurovision Song Contest 2023 was the 67th edition of the Eurovision Song Contest. It took place in Liverpool, United Kingdom, after Ukraine, winner of the 2022 contest with the song "Stefania" by Kalush Orchestra, was unable to meet the demands of hosting the event due to security concerns caused by the Russian invasion of Ukraine. Organised by the European Broadcasting Union (EBU) and host broadcaster the British Broadcasting Corporation (BBC) on behalf of the Public Broadcasting Company of Ukraine (UA: PBC), the contest was held at the Liverpool Arena, and consisted of two semi-finals on 9 and 11 May, and the final on 13 May 2023.