In [1]:
!pip install -q requests
!pip install -q bs4
!pip install -q google-generativeai
!pip install -q sentence_transformers
!pip install -q langchain langchain_community


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.8.2 requires cubinlinker, which is not installed.
cudf 24.8.2 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.8.2 requires ptxcompiler, which is not installed.
cuml 24.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.8.2 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.8.2 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.6.0 which is incompatible.
distributed 2024.7.1 requires dask==2024.7.1, but you have dask 2024.8.1 which is incompatible.
google-cloud-bigquery 2.34.4 requires packaging<22.0dev,>=14.3, but you have packaging 24.1 which is incompatible.
jupyterlab 4.2.4 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
jupyterlab-lsp 5.1.0 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 w

In [2]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import torch
import langchain
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from typing import List
import json


  from tqdm.autonotebook import tqdm, trange


## Insert Gemini Api
If you are using kaggle, make a user secret client for the gemini api

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("gemini_api")

In [4]:
import google.generativeai as genai
genai.configure(api_key = secret_value_0)

**List of gemini models**

In [5]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash
models/gemini-1.5-flash-001-tuning


## Web scraping function

In [6]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import json

def extract_internal_links(url):
    # Send a GET request to the website
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Get the base URL
        base_url = urlparse(url).netloc
        
        # Find all links in the HTML
        links = set()
        for anchor in soup.find_all('a', href=True):
            # Join the base URL with the href to handle relative links
            full_url = urljoin(url, anchor['href'])
            
            # Check if the link is internal (belongs to the same domain)
            parsed_url = urlparse(full_url)
            if parsed_url.netloc == base_url:
                links.add(full_url)
                
        return links
    else:
        print(f"Failed to retrieve {url}: {response.status_code}")
        return set()

def extract_text_without_links(url):
    # Send a GET request to the website
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract all text except for link text
        text_data = []
        for element in soup.find_all(text=True):
            # Skip script and style elements, and links (<a> tags)
            if element.parent.name not in ['a', 'script', 'style']:
                clean_text = element.strip()
                if clean_text:
                    text_data.append(clean_text)
                    
        return text_data
    else:
        print(f"Failed to retrieve {url}: {response.status_code}")
        return []


## Using langchain and pydantic
The one we will cover here is an implementation provided by Langchain that relies on the well-known Pydantic models to define the data structure. For the Langchain library, there are two main components an output parser must implement:

- Format instructions: A method that returns a string containing instructions for how the output of a language model should be formatted.
- Parser: A method that takes in a string (assumed to be the response from a language model) and parses it into some structure.

In [7]:
# Define your data structure for the first prompt
class QnA(BaseModel):
    question: str = Field(description="A question from the given text")
    answer: str = Field(description="Answer to the question in 2 or 3 lines")

class QnAList(BaseModel):
    qna_pairs: List[QnA] = Field(description="A list of 10 questions and answers")

# Set up a parser for the first prompt
parser = PydanticOutputParser(pydantic_object=QnAList)
format_instructions = parser.get_format_instructions()
print(format_instructions)

prompt_template = """
Generate 10 questions and answers based on the given text. Maximum 80 characters in one question. Dont give line breaks in output. Avoid any special characters other than json.

Given text:
{query}

Output format:
{format_instructions}
"""

# Define your data structure for the second prompt
class QnA2(BaseModel):
    question: str = Field(description="A given question")
    answer: str = Field(description="Answer to the given question in 2 or 3 lines")

class QnAList2(BaseModel):
    qna_pairs: List[QnA2] = Field(description="A list of 10 questions and their answers")

# Set up a parser for the second prompt
parser2 = PydanticOutputParser(pydantic_object=QnAList2)
format_instructions2 = parser2.get_format_instructions()
print(format_instructions2)

prompt_template2 = """
Given the following text, provide answers to the 10 questions listed below. Each answer should be 2 or 3 lines long and based on the provided text. Dont give line breaks in output. Avoid any special characters other than json.

Given text:
{text}

Questions:
{questions}

Output format:
{format_instructions}
"""

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"QnA": {"properties": {"question": {"description": "A question from the given text", "title": "Question", "type": "string"}, "answer": {"description": "Answer to the question in 2 or 3 lines", "title": "Answer", "type": "string"}}, "required": ["question", "answer"], "title": "QnA", "type": "object"}}, "properties": {"qna_pairs": {"description": "A list of 10 questions and answers", "items": {"$ref": "#/$defs/QnA"}, "title": "Qna Pairs", "type": "array"}}, "required": ["qna_pairs"]}
```
The output should be formatted as a JSON insta

In [17]:
def extract_qna(json_string):
    # Parse the JSON string into a Python dictionary
    
    print(json_string)    
#     print(type(json_string))
    if len(json_string) <= 100:
        return None, None  #"Not enough information in the webpage or Unable to retrieve webpage"
    prefix = "```json"    
    prefix2 = "```"


    if json_string.startswith(prefix):
        # Strip the prefix and any trailing newlines or whitespace
        json_string = json_string[len(prefix):].lstrip()
    if json_string.startswith(prefix2):
        # Strip the prefix and any trailing newlines or whitespace
        json_string = json_string[len(prefix2):].lstrip()
        
    suffix = "```"
    if json_string.endswith(suffix):
        # Strip the suffix
        json_string = json_string[:-len(suffix)]

    data = json.loads(json_string)
    
    # Initialize lists to store questions and answers
    questions = []
    answers = []
    
    # Loop through each Q&A pair and extract questions and answers
    for pair in data['qna_pairs']:
        questions.append(pair['question'])
        answers.append(pair['answer'])
    
    return questions, answers
def generate_qna(text):
    prompt = prompt_template.format(
       query=f"Given text: {text}",
       format_instructions=parser.get_format_instructions()
    )
    output = model.generate_content(prompt)
    q,a = extract_qna(output.text)
    return q,a

def generateCandidates(text,questions):
    prompt2 = prompt_template2.format(
       text=text,
       questions=questions,
       format_instructions=parser2.get_format_instructions()
    )

    output = model.generate_content(prompt2)
    _,ans =  extract_qna(output.text)
    return ans

In [9]:
# model gemini-pro
model = genai.GenerativeModel('gemini-pro')

In [10]:
# model for calculating the semantic similarity between 2 answers
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Proposed Solution
- **fun1()** is responsible for scraping the current webpage, generating 10 questions and then exploring the links and finding 5 relevant links pertinent to the questions and content. We are doing so by instead of generating just 10 questions we are generating 10 question answer pair using the gemini-pro LLM model. These answers are treated as ground truth answers and will be used for comparing the relevance of child links.
- **fun2()** is getting the 10 questions as well as the contents of the child link to generate answers from that website. Then these candidate answers are then compared with the ground truth by computing semantic similarity using a all-MiniLM-L6-v2 language model.

In [11]:
def append_to_json(url, questions, relevant_links):
    # Convert the list of relevant links to the required format
    formatted_links = [{"url": link} for link in relevant_links]
    
    # Update existing entry or add a new one
    data[url] = {
        "url": url,
        "questions": questions,
        "relevant_links": formatted_links
    }



#generate 
def fun1(link):
    #calling scrap function
    child_links = extract_internal_links(link)
    child_links = list(child_links)
    links_text = extract_text_without_links(link)
    text = ''.join(links_text)
    
#     print(link)
#     print(child_links)
#     print(links_text)

#     push all the child links into queue
    for lnk in child_links:
        if lnk not in hashlinks:
            queue.append(lnk)
    
    # API generate 10 ques and answer pairs
    que10, ans10 = generate_qna(text)
#     print(que10,ans10)
    if que10 is None:
        return
    
    result = []
    for i in child_links:
        result.append((i, fun2(que10, ans10, i)))
    result.sort(key=lambda x: x[1], reverse=True)

    # the top 5 relevant links
    relevant_links = []
    if len(result) < 5:
        for link, rel_sco in result:
            if rel_sco > 0.3:
                relevant_links.append(link)
    else:
        relevant_links = [x[0] for x in result[:5]]
#     print(que10)
#     print(relevant_links)
    append_to_json(link, que10, relevant_links)
    counter[0]+=1
    

def fun2(que, ans_gt, link):
#     print(link)
    text = extract_text_without_links(link)
    link = extract_internal_links(link) # calling scrap func which returns text and link in list format
    text = ' '.join(text)
    ans_gen = generateCandidates(text, que)     # calling candidate it will return the just answers of the question we passed
    
#     print(len(ans_gt))
#     print(len(ans_gen))
    
    # Compute embedding for both lists
    embeddings1 = embed_model.encode(ans_gt, convert_to_tensor=True,show_progress_bar=False)
    embeddings2 = embed_model.encode(ans_gen, convert_to_tensor=True,show_progress_bar=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embeddings1 = embeddings1.to(device)
    embeddings2 = embeddings2.to(device)
    
    # Compute cosine similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    
    # Compute the mean of the diagonal elements
    diagonal_elements = torch.diag(cosine_scores)
    mean_diagonal = torch.mean(diagonal_elements)
    
    # Print only the numeric value
    return mean_diagonal.item()


def gen(url):
    print("Main function is starting.")
    queue.append(url)
    
    while len(queue) != 0 and counter[0]<=10:
        ele = queue.popleft() # the link we are checking currently
        fun1(ele)
        hashlinks.add(url)
        
    # fn1()  # Calling fn1 inside the main function
    print("Main function is ending.")


    

In [12]:
from collections import deque

# Global variables
queue = deque()  # Global queue (double-ended queue)
counter = [0]  # Global counter
hashlinks = set()
data = {}  # to store the data link as key and value as their url, questions, relevant_links

## Main function, put the input URL here

In [19]:
input_url = "https://www.ibm.com/products/watsonx-code-assistant"  # Example input
queue.clear()
gen(input_url)

Main function is starting.


  for element in soup.find_all(text=True):


{"qna_pairs": [{"question": "What does IBM watsonx Code Assistant leverage?", "answer": "IBM watsonx Code Assistant leverages generative AI to accelerate development while maintaining trust, security, and compliance at its core."}, {"question": "What is IBM watsonx Code Assistant powered by?", "answer": "IBM watsonx Code Assistant is powered by the IBM Granite that include state-of-the-art large language models designed for code."}, {"question": "What is the benefit of using IBM watsonx Code Assistant?", "answer": "IBM watsonx Code Assistant can help developers minimize the learning curve, reduce errors, and increase productivity, build quality code, unlock development potential, and accelerate time to value."}, {"question": "What are the key features of IBM watsonx Code Assistant?", "answer": "The key features of IBM watsonx Code Assistant include code generation, code matching, and code modernization."}, {"question": "What are the two IBM watsonx Code Assistant products?", "answer": 

KeyboardInterrupt: 

### **The json file contains here contains url, each url will have its 10 questions and its 5 relevant links**

In [20]:
data

{'https://www.ibm.com/products/watsonx-ai/foundation-models': {'url': 'https://www.ibm.com/products/watsonx-ai/foundation-models',
  'questions': ['What is the purpose of the foundation models library on Watsonx platform?',
   'Name the four models in the granite series.',
   "Can you elaborate on the Trusted aspect of IBM's approach to delivering enterprise-grade foundation models?",
   'What are the key features of the granite models?',
   'What is the full form of LLM?',
   'What is the input and output context length of the slate-125m-english-rtrvr model?',
   'What is the price per million tokens for the llama-3-405b-instruct model?',
   'What are the new strategic partnerships announced by IBM?',
   'How does IBM ensure intellectual property protection for AI models?',
   'What are the benefits of using Granite models?'],
  'relevant_links': [{'url': 'https://www.ibm.com/watsonx'},
   {'url': 'https://www.ibm.com/watsonx/pricing'},
   {'url': 'https://www.ibm.com/watsonx/resource

In [21]:
type(data)

dict

## Download the Json file

In [22]:
filename = "/kaggle/working/my_dict.json"

# Save the dictionary as a JSON file
with open(filename, 'w') as file:
    json.dump(data, file, indent=4)