In [40]:
#Installing Dependencies
!pip install PyPDF2 langchain faiss-cpu sentence-transformers requests ragas pinecone protobuf grpcio grpcio-tools protoc-gen-openapiv2 tavily-python

Collecting tavily-python
  Downloading tavily_python-0.5.0-py3-none-any.whl.metadata (11 kB)
Downloading tavily_python-0.5.0-py3-none-any.whl (14 kB)
Installing collected packages: tavily-python
Successfully installed tavily-python-0.5.0


In [51]:
from google.colab import drive
import PyPDF2
import json
import requests
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import pinecone
from ragas.metrics import Faithfulness, LLMContextPrecisionWithoutReference, LLMContextRecall
from ragas import SingleTurnSample

In [10]:
drive.mount('/content/drive')

# PDF to JSON Conversion
pdf_path = "/content/drive/MyDrive/NetsolStatement.pdf"
def process_and_convert_to_json(pdf_path):
    pdf_data = []
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page_num, page in enumerate(reader.pages):
            pdf_data.append({"page": page_num + 1, "content": page.extract_text()})
    json_file_path = "financial_report.json"
    with open(json_file_path, "w") as json_file:
        json.dump(pdf_data, json_file, indent=4)
    return json_file_path

json_file_path = process_and_convert_to_json(pdf_path)
with open("financial_report.json", "r") as json_file:
    financial_data = json.load(json_file)

total_content = "\n".join([page["content"] for page in financial_data])
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_text(total_content)

Mounted at /content/drive


In [None]:

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

pinecone_client = Pinecone(api_key="PINECONE_API")

index_name = "rag-chatbot"
index_host = "https://rag-chatbot-qdrd8pl.svc.aped-4627-b74a.pinecone.io"

index = pinecone_client.Index(host=index_host)

# Function to Generate and Upload Embeddings
def generate_and_upload_embeddings(chunks):
    batch_size = 32
    vectors = []
    for i, chunk in enumerate(chunks):
        embedding = embedding_model.embed_documents([chunk])[0]
        vectors.append({"id": str(i), "values": embedding})

        if len(vectors) == batch_size or i == len(chunks) - 1:
            index.upsert(vectors=vectors, namespace="example-namespace1")
            vectors = []

generate_and_upload_embeddings(chunks)
print("Embeddings uploaded successfully to Pinecone.")


Embeddings uploaded successfully to Pinecone.


In [30]:
# Query Classification
def classify_query(query):
    query = query.lower()
    if "netsol" in query:
        return "netsol"
    elif "event" in query or "live" in query or "news" in query:
        return "live_event"
    return "general"

In [31]:
# RAG Retrieval for NETSOL Queries
def retrieve_netsol_chunks(query):
    query_embedding = embedding_model.embed_query(query)

    response = index.query(
        vector=query_embedding,
        top_k=1,
        include_metadata=True
    )

    matches = response.get("matches", [])
    if matches:
        return "\n".join([match["metadata"]["chunk"] for match in matches])
    else:
        return "No relevant NETSOL information found."

In [None]:
# Tavily API for Live Events Queries
API_KEY = "TAVILY_API"
API_URL = "https://api.tavily.com/search"
def search_live_events(query):
    payload = {
        'query': query,
        'api_key': API_KEY,
        'max_results': 1,
        'search_depth': 'basic',
        'topic': 'general',
        'include_images': False,
        'include_answer': False,
        'include_raw_content': False
    }
    try:
        response = requests.post(API_URL, json=payload)
        response.raise_for_status()
        data = response.json()
        return "\n".join([
            f"Title: {result['title']}\nSnippet: {result.get('snippet', 'No snippet available')}\nURL: {result['url']}\n"
            for result in data.get('results', [])
        ]) if 'results' in data else "Error: No results key in response."
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

In [35]:
# Initialize GPT-2 Model and Tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

def generate_general_response(query):
    inputs = tokenizer(
        query,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    )

    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [37]:
# RAG Workflow
def rag_workflow(query):
    query_type = classify_query(query)
    print(f"Query Type: {query_type}")
    if query_type == "netsol":
        return retrieve_netsol_chunks(query)
    elif query_type == "live_event":
        return search_live_events(query)
    return generate_general_response(query)

In [58]:
# Evaluation with RAGAS
queries = [
    "What is the vision of NETSOL Technologies Limited?",
    "What are the main certifications achieved by NETSOL?",
    "What are the core products offered by NETSOL?",
    "What services does NETSOL provide?",
    "What is Project Optimus at NETSOL?",
    "Who is the Chairman of NETSOL Technologies?",
    "Which new product developed by NETSOL targets the digital auto retail market?",
    "What awards has NETSOL Technologies won?",
    "Which regions does NETSOL primarily focus its marketing efforts?",
    "What is the proposed cash dividend for the fiscal year 2023-2024?",
    "What is the total revenue from contracts for 2024?",
    "What is the main purpose of Appex Now?",
    "What is the employee-related expenditure for 2024?",
    "What is the location of NETSOL's registered office?",
    "What is the gross profit margin for 2024?",
    "What initiative has NETSOL taken for sustainability?",
    "Which event marked the success of NETSOL's Otoz platform?",
    "How many employees does NETSOL focus on retaining after restructuring?",
    "What are the main objectives outlined in NETSOL's mission statement?",
    "What is the importance of NETSOL's AI focus in 2024?",
    "who won first t20 match between Pakistan and Zimbabwe?",
    "Is Pat Gelsinger the CEO of Intel?"
]

ground_truth = [
    "To become the leading and world-class provider of IT solutions and services in each market of operations by leveraging global positioning, creating strong growth potential, increasing shareholder value, and providing a great environment for employees.",
    "NETSOL has been audited for ISO 27001, ISO 20000, and ISO 9001 certifications and achieved SOC 2 Type 2 compliance.",
    "The core products are Ascent (a platform for asset finance and leasing), NFS Digital (digital finance ecosystem), and Appex Now (API-first marketplace for finance and leasing).",
    "NETSOL provides professional services, cloud services, AI solutions, IT consulting, application development and maintenance, business process outsourcing, and more.",
    "Project Optimus is NETSOL's initiative to develop advanced AI solutions to enhance internal software development processes and drive innovation.",
    "Naeem Ullah Ghauri is the Chairman of NETSOL Technologies.",
    "Otoz, a fully digital white-label platform for digital auto retail and mobility orchestration.",
    "NETSOL has won the CSR Gold Award and Gender Diversity Merit Award by P@SHA and the Gold Award for IT Products and Solutions by PSEB.",
    "NETSOL focuses its marketing efforts in North America and European regions.",
    "The proposed cash dividend is 30%, i.e., Rs. 3 per share.",
    "The total revenue from contracts with customers for 2024 is Rs. 9,280,647,000.",
    "Appex Now serves as a marketplace for the global credit, finance, and leasing industry, offering cloud-based API-first products.",
    "The salaries and other employee benefits expenditure for 2024 is Rs. 4,743,031,000.",
    "The registered office is located at NETSOL IT Village (Software Technology Park), Lahore Ring Road, Ghazi Road Interchange, Lahore Cantt., Pakistan.",
    "The gross profit margin for 2024 is 44.87%.",
    "NETSOL has implemented energy-efficient data centers, waste reduction programs, and a tree plantation drive for sustainability.",
    "The success of the Otoz platform was marked by customer utilization in the United States.",
    "NETSOL retained its best-performing employees during the restructuring to ensure operational efficiency and maintain quality.",
    "The objectives are to invest in quality resources, leverage global solutions, enhance shareholder value, focus on market-specific growth, and support capacity building in technology.",
    "NETSOL's focus on AI aims to improve internal efficiencies, provide advanced analytics, and integrate transformative technologies to meet evolving client needs.",
    "Pakistan won by 57 runs",
    "No, he was recently forced to resign from his position."
]

responses = []
for query, true_answer in zip(queries, ground_truth):
    response = rag_workflow(query)
    print(f"Query: {query}")
    print(f"Response: {response}")
    print(f"Ground Truth: {true_answer}")
    print("-" * 80)
    responses.append(response)


Query Type: netsol
Query: What is the vision of NETSOL Technologies Limited?
Response: No relevant NETSOL information found.
Ground Truth: To become the leading and world-class provider of IT solutions and services in each market of operations by leveraging global positioning, creating strong growth potential, increasing shareholder value, and providing a great environment for employees.
--------------------------------------------------------------------------------
Query Type: netsol
Query: What are the main certifications achieved by NETSOL?
Response: No relevant NETSOL information found.
Ground Truth: NETSOL has been audited for ISO 27001, ISO 20000, and ISO 9001 certifications and achieved SOC 2 Type 2 compliance.
--------------------------------------------------------------------------------
Query Type: netsol
Query: What are the core products offered by NETSOL?
Response: No relevant NETSOL information found.
Ground Truth: The core products are Ascent (a platform for asset finan

In [64]:
class RAGWorkflowWrapper:
    def __init__(self, rag_workflow):
        self.rag_workflow = rag_workflow

    async def __call__(self, inputs, **kwargs):
        if isinstance(inputs, str):
            return [self.rag_workflow(inputs)]
        else:
            raise ValueError("Input should be a string.")

    async def generate(self, prompt, **kwargs):
        if isinstance(prompt, str):
            response = await self.rag_workflow(prompt)
            return [response]
        else:
            raise ValueError("Prompt should be a string.")

wrapped_rag_workflow = RAGWorkflowWrapper(rag_workflow)

# Initialize RAGAS metrics with the wrapped rag_workflow
faithfulness_scorer = Faithfulness(llm=wrapped_rag_workflow)
context_precision = LLMContextPrecisionWithoutReference(llm=wrapped_rag_workflow)
context_recall = LLMContextRecall(llm=wrapped_rag_workflow)

async def evaluate_ragas(queries, responses, ground_truth):
    faithfulness_scores, precision_scores, recall_scores = [], [], []
    for query, response, true_answer in zip(queries, responses, ground_truth):
        sample = SingleTurnSample(user_input=query, response=response, retrieved_contexts=[true_answer])
        faithfulness_scores.append(await faithfulness_scorer.single_turn_ascore(sample))
        precision_scores.append(await context_precision.single_turn_ascore(sample))
        recall_scores.append(await context_recall.single_turn_ascore(
            SingleTurnSample(user_input=query, response=response, reference=true_answer, retrieved_contexts=[true_answer])
        ))
    return {
        "faithfulness": sum(faithfulness_scores) / len(faithfulness_scores),
        "precision": sum(precision_scores) / len(precision_scores),
        "recall": sum(recall_scores) / len(recall_scores)
    }

evaluation_scores = await evaluate_ragas(queries, responses, ground_truth)
print("Evaluation Scores:", evaluation_scores)


ValueError: Prompt should be a string.