In [24]:
from llama_index.llms.cleanlab import CleanlabTLM

In [26]:
options = {
    "model": "claude-3.5-sonnet-v2",
    "max_tokens": 128,
    "log": ["explanation"]
}
llm = CleanlabTLM(api_key="226eeab91e944b23bd817a46dbe3c8ae", options=options)

In [27]:
resp = llm.complete("which one of 9.11 and 9.9 is greater?")


In [28]:
print(resp)

9.11 is greater than 9.9

To compare decimal numbers, we can look at each decimal place from left to right. In this case:
- Both numbers have the same whole number (9)
- Looking at the tenths place (first decimal place):
  * 9.11 has 1 tenth
  * 9.9 has 9 tenths
  * 9.9 can also be written as 9.90
- Looking at the hundredths place (second decimal place):
  * 9.11 has 1


In [29]:
print(resp.additional_kwargs["explanation"])


The answer starts correctly but is incomplete and has a major error. It states 9.11 is greater but then shows 9.9 has 9 tenths while 9.11 has 1 tenth, which would actually make 9.9 greater. The explanation contradicts itself and cuts off mid-sentence. A nswer would explain that 9.9 = 9.90, and 9.90 < 9.11. 
This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): 
9.11.


In [151]:
response = query_engine.query("How many campuses are there?Answer by including citations of document name and page numbers in the format [<document_name>, page <page_number>]")
display_response(response)

Response: Haaga-Helia operates on five campuses.
Trustworthiness score: 0.99


In [272]:
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex
import requests
from bs4 import BeautifulSoup
import pdfkit
from llama_index.readers.docling import DoclingReader
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.cleanlab import CleanlabTLM
from typing import Dict, List, ClassVar
from llama_index.core.instrumentation.events import BaseEvent
from llama_index.core.instrumentation.event_handlers import BaseEventHandler
from llama_index.core.instrumentation import get_dispatcher
from llama_index.core.instrumentation.events.llm import LLMCompletionEndEvent
import nest_asyncio
import os
from llama_index.core import PromptTemplate
nest_asyncio.apply()

#############################
# Event Handler for Trustworthiness Score
#############################

class GetTrustworthinessScore(BaseEventHandler):
    events: ClassVar[List[BaseEvent]] = []
    trustworthiness_score: float = 0.0

    @classmethod
    def class_name(cls) -> str:
        return "GetTrustworthinessScore"

    def handle(self, event: BaseEvent) -> Dict:
        if isinstance(event, LLMCompletionEndEvent):
            self.trustworthiness_score = event.response.additional_kwargs.get("trustworthiness_score", 0.0)
            self.events.append(event)
        return {}

def display_response(response):
    response_str = response.response
    trustworthiness_score = event_handler.trustworthiness_score
    rsp = f"""
    Response = {response_str}
    
    Trustworthiness score = {round(trustworthiness_score, 2)}
    
    """
    display(Markdown(rsp))

#############################
# LLM and Embedding Settings
#############################

options = {
    "model": "gpt-4o",
    "max_tokens": 512,
    "log": ["explanation"]
}

llm = CleanlabTLM(api_key="CLEANLAB_API_KEY", options=options)
Settings.llm = llm

Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# Set up the dispatcher and register the event handler.
root_dispatcher = get_dispatcher()
event_handler = GetTrustworthinessScore()
root_dispatcher.add_event_handler(event_handler)

##########################################
# PDF Generation from Multiple URLs
##########################################

# Configure wkhtmltopdf path
wkhtml_path = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=wkhtml_path)

# Define URLs and assign document names
urls = {
    "LLMs": "https://en.wikipedia.org/wiki/Large_language_model"
}

# Directory to save PDFs
pdf_directory = "PDFs"
os.makedirs(pdf_directory, exist_ok=True)

pdf_paths = {}
for doc_name, url in urls.items():
    try:
        print(f"Processing {doc_name} from {url} ...")
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        main_content = soup.find("div", {"id": "mw-content-text"})
        if main_content is None:
            raise ValueError("Main content not found")
        # Replace protocol-relative URLs with absolute URLs
        html_string = str(main_content).replace('src="//', 'src="https://').replace('href="//', 'href="https://')
        pdf_file_path = os.path.join(pdf_directory, f"{doc_name}.pdf")
        pdfkit.from_string(
            html_string,
            pdf_file_path,
            options={'encoding': 'UTF-8', 'quiet': ''},
            configuration=config
        )
        pdf_paths[doc_name] = pdf_file_path
        print(f"Saved PDF for {doc_name} at {pdf_file_path}")
    except Exception as e:
        print(f"Error processing {doc_name}: {e}")

##########################################
# Parse PDFs with LlamaParse and Inject Metadata
##########################################

# Define parsing instructions (if your parser supports it)
parsing_instructions = """Extract the document content in markdown.
Split the document into nodes (for example, by page).
Ensure each node has metadata for document name and page number."""
        
# Create a LlamaParse instance (replace the API key below with your actual key)
parser = LlamaParse(
    api_key="llx-...",   # Replace with your LLAMACLOUD key
    parsing_instructions=parsing_instructions,
    result_type="markdown",
    premium_mode=True,
    max_timeout=600
)

# Directory to save combined Markdown files (one per PDF)
output_md_dir = os.path.join(pdf_directory, "markdown_docs")
os.makedirs(output_md_dir, exist_ok=True)

# List to hold all updated nodes for indexing
all_nodes = []

for doc_name, pdf_path in pdf_paths.items():
    try:
        print(f"Parsing PDF for {doc_name} from {pdf_path} ...")
        nodes = parser.load_data(pdf_path)  # Returns a list of nodes
        updated_nodes = []
        # Process each node: update metadata and inject citation header into the text.
        for i, node in enumerate(nodes, start=1):
            # Copy existing metadata (if any) and add our own keys.
            new_metadata = dict(node.metadata) if node.metadata else {}
            new_metadata["document_name"] = doc_name
            if "page_number" not in new_metadata:
                new_metadata["page_number"] = str(i)
            # Build the citation header.
            citation_header = f"[{new_metadata['document_name']}, page {new_metadata['page_number']}]\n\n"
            # Prepend the citation header to the node's text.
            updated_text = citation_header + node.text
            new_node = node.__class__(text=updated_text, metadata=new_metadata)
            updated_nodes.append(new_node)
        # Save a single combined Markdown file for the document using the updated node texts.
        combined_texts = [node.text for node in updated_nodes]
        combined_md = "\n\n---\n\n".join(combined_texts)
        md_filename = f"{doc_name}.md"
        md_filepath = os.path.join(output_md_dir, md_filename)
        with open(md_filepath, "w", encoding="utf-8") as f:
            f.write(combined_md)
        print(f"Saved combined markdown for {doc_name} to {md_filepath}")
        # Add the updated nodes to the global list for indexing.
        all_nodes.extend(updated_nodes)
        print(f"Parsed {len(updated_nodes)} nodes from {doc_name}.")
    except Exception as e:
        print(f"Error parsing {doc_name}: {e}")

##########################################
# Create Combined Index and Query Engine
##########################################

# Create a combined index from all nodes.
index = VectorStoreIndex.from_documents(documents=all_nodes)

# Define and create the PromptTemplate object
prompt_template = PromptTemplate("""\
You are an AI assistant with expertise in the subject matter.
Answer the question using ONLY the provided context.
Answer in well-formatted Markdown with bullets and sections wherever necessary.
If the provided context does not support an answer, respond with "I don't know."
Include citations of document name and page numbers in the format [<document_name>, page <page_number>].

Context:
{context_str}

Question:
{query_str}

Answer:
""")

# Create a query engine with the custom prompt.
query_engine = index.as_query_engine(
    similarity_top_k=3,
    llm=llm,
    text_qa_template=prompt_template  
)

print("Combined index and query engine created successfully!")


Processing LLMs from https://en.wikipedia.org/wiki/Large_language_model ...
Saved PDF for LLMs at C:/Users/h02317/Downloads/PDFs\LLMs.pdf
Parsing PDF for LLMs from C:/Users/h02317/Downloads/PDFs\LLMs.pdf ...
Started parsing the file under job_id 2b1ae611-0375-4998-a0f9-bab2e6f01d79
..Saved combined markdown for LLMs to C:/Users/h02317/Downloads/PDFs\markdown_docs\LLMs.md
Parsed 18 nodes from LLMs.
Combined index and query engine created successfully!


In [323]:
response = query_engine.query("When is mixture of experts approach used? ")
display_response(response)


    Response = The mixture of experts (MoE) approach is used when the largest large language models (LLMs) are too expensive to train and use directly. This approach allows for the training of models with a very large number of parameters, reaching up to 1 trillion, as pursued by Google researchers since 2017 [LLMs, page 4].
    
    Trustworthiness score =  0.98
    
    

In [337]:
response = query_engine.query("How do you compare Deepseek model with OpenAI's models? ")
display_response(response)


    Response = - **Performance and Cost-Effectiveness**:
  - The DeepSeek-R1 model, released by the Chinese company DeepSeek, achieved comparable performance to OpenAI's o1 model. This indicates that DeepSeek-R1 is competitive in terms of its capabilities in reasoning tasks.
  - DeepSeek-R1 is noted for being significantly more cost-effective to operate compared to OpenAI's models. This suggests that it may require less computational resources or be optimized for efficiency in some way [LLMs, page 7].

- **Openness and Accessibility**:
  - Unlike OpenAI's proprietary models, DeepSeek-R1 is an open-weight model. This means that its weights are accessible to researchers, allowing them to study and build upon the algorithm. This openness can foster more collaborative research and development in the AI community.
  - However, it is important to note that while the model weights are open, the training data for DeepSeek-R1 remains private, which may limit some aspects of transparency and reproducibility [LLMs, page 7].

- **Domain Capabilities**:
  - Both DeepSeek-R1 and OpenAI's reasoning models, such as o1, have shown superior capabilities in domains requiring structured logical thinking, such as mathematics, scientific research, and computer programming. This highlights their effectiveness in complex reasoning tasks [LLMs, page 7].

Overall, DeepSeek-R1 offers a competitive and cost-effective alternative to OpenAI's models, with the added benefit of open weights for research purposes, although it maintains some limitations in data transparency.
    
    Trustworthiness score =  0.96
    
    

In [235]:
""" Getting the trustworthiness score of an LLM response and its explanation"""

from cleanlab_studio import Studio
import streamlit as st
from langchain_groq.chat_models import ChatGroq
import os
GROQ_API_KEY = "..." #get GROQ API key from https://groq.com/

# Initialize the Groq Llama Instant model
groq_llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
# Ask a simple question
question = "Which one of 9.11 and 9.9 is bigger?"
# Construct a simple prompt
prompt = (
    f"You are a helpful assistant.\n"
    f"Answer the following question clearly and concisely:\n\n"
    f"Question: {question}\n"
    f"Answer:"
)
# Get the response from the model
response = groq_llm.invoke(prompt)
# Print the response
print(f"Answer = {response.content.strip()}")

studio = Studio("CLEANLAB_API_KEY")  # Get your free API key from: https://tlm.cleanlab.ai/
cleanlab_tlm = studio.TLM(options={"log": ["explanation"]})  # See Advanced Tutorial for optional TLM configurations to get better/faster results
prompt = "Which one of 9.11 and 9.9 is bigger?"
output = cleanlab_tlm.get_trustworthiness_score(prompt, response=response.content.strip())

print(f"Trustworthiness score = {output["trustworthiness_score"]}")
print(f'Explanation: {output["log"]["explanation"]}')


Answer = 9.11 is bigger than 9.9.
Trustworthiness score = 0.41811348835461787
Explanation: This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): 
9.9 is bigger.


In [218]:
"""Comparing the answer, trustworthiness scores, and explanations of two LLMs for the same prompt"""

from cleanlab_studio import Studio
import markdown
from IPython.core.display import display, Markdown

# Initialize the Cleanlab Studio with API key
studio = Studio("CLEANLAB_API_KEY")  # Replace with your actual API key

# List of models to evaluate
models = ["gpt-4o", "claude-3.5-sonnet-v2"]

# Define the prompt
prompt_text = "Which one of 9.11 and 9.9 is bigger?"

# Loop through each model and evaluate
for model in models:
    tlm = studio.TLM(options={"log": ["explanation"], "model": model})
    out = tlm.prompt(prompt_text)
    
    md_content = f"""
## Model: {model}

**Response:** {out['response']}

**Trustworthiness Score:** {out['trustworthiness_score']}

**Explanation:** {out['log']['explanation']}

---
"""
    display(Markdown(md_content))


  from IPython.core.display import display, Markdown



## Model: gpt-4o

**Response:** 9.11 is bigger than 9.9. When comparing decimal numbers, you start by comparing the digits from left to right. Both numbers have the same whole number part (9), but in the tenths place, 9.11 has a 1 while 9.9 has a 9. Since 9.9 can be thought of as 9.90, comparing the hundredths place shows that 1 is greater than 0, making 9.11 the larger number.

**Trustworthiness Score:** 0.5913138800517392

**Explanation:** incorrect. When comparing decimal numbers, you start by comparing the digits from left to right. Both numbers have the same whole number part (9). In the tenths place, 9.11 has a 1, and 9.9 has a 9. However, the comparison should be made by considering 9.9 as 9.90 to align the decimal places. In this case, 9.90 is greater than 9.11 because 9 in the tenths place is greater than 1 in the tenths place. Therefore, 9.9 (or 9.90) is actually larger than 9.11. 
This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): 
9.9 is bigger.

---



## Model: claude-3.5-sonnet-v2

**Response:** 9.11 is bigger than 9.9

To compare decimal numbers, we can look at each decimal place from left to right. In this case:
- Both numbers have 9 in the ones place
- In the tenths place (first decimal place), 9.11 has 1 and 9.9 has 9
- Since 1 is less than 9, we need to look at the hundredths place
- 9.11 has 1 in the hundredths place, while 9.9 is the same as 9.90
- Therefore, 9.11 > 9.90 (or 9.9)

So, 9.11 is the bigger number.

**Trustworthiness Score:** 0.5542884316430033

**Explanation:** Let me solve this step by step:

1) When comparing decimal numbers, we align the decimal points and compare digits from left to right.

2) Let's write both numbers:
   9.11
   9.90 (9.9 = 9.90)

3) Starting from the left:
   - Ones place: both have 9, so they're equal
   - Tenths place (first decimal): 
     * 9.11 has 1
     * 9.9 has 9
   - Since 9 is greater than 1 in the tenths place, we don't need to look further

4) The AI's answer claims 9.11 is bigger than 9.9, which is incorrect.
   9.9 = 9.90 > 9.11

5) The AI's explanation about needing to look at the hundredths place is wrong. Once we find a difference in a decimal place, we can stop comparing.

---


In [230]:
import streamlit as st
from langchain_groq.chat_models import ChatGroq
import os
GROQ_API_KEY = "..." #get GROQ API key from https://groq.com/

# Initialize the Groq Llama Instant model
groq_llm = ChatGroq(model="deepseek-r1-distill-llama-70b", temperature=0.5)
prompt = "Which one of 9.11 and 9.9 is bigger?"
# Get the response from the model
response = groq_llm.invoke(prompt)
#Initialize Cleanlab's studio
studio = Studio("CLEANLAB_API_KEY")  
cleanlab_tlm = studio.TLM(options={"log": ["explanation"]})  #for explanations
#Get the output containing trustworthiness score and explanation
output = cleanlab_tlm.get_trustworthiness_score(prompt, response=response.content.strip())

md_content = f"""
## Model: {model}

**Response:** {response.content.strip()}

**Trustworthiness Score:** {output['trustworthiness_score']}

**Explanation:** {output['log']['explanation']}

---
"""
display(Markdown(md_content))



## Model: claude-3.5-sonnet-v2

**Response:** <think>
First, I will compare the whole number parts of both numbers. Both 9.11 and 9.9 have the same whole number, which is 9.

Next, I will compare the decimal parts. For 9.11, the decimal part is 0.11, and for 9.9, it is 0.9.

Since 0.9 is greater than 0.11, the decimal part of 9.9 is larger.

Therefore, 9.9 is the bigger number.
</think>

To determine which number is larger between **9.11** and **9.9**, let's compare them step by step.

1. **Compare the Whole Number Parts:**
   - Both numbers have the same whole number part, which is **9**.

2. **Compare the Decimal Parts:**
   - For **9.11**, the decimal part is **0.11**.
   - For **9.9**, the decimal part is **0.9**.

3. **Determine Which Decimal is Larger:**
   - **0.9** is greater than **0.11** because **0.9** is equivalent to **0.90**, and **0.90 > 0.11**.

Since the decimal part of **9.9** is larger than that of **9.11**, **9.9** is the bigger number.

\[
\boxed{9.9}
\]

**Trustworthiness Score:** 0.9833656002275584

**Explanation:** Did not find a reason to doubt trustworthiness.

---
