## Installing the dependencies

- **Install Weaviate Client**: `!pip install weaviate-client` installs the Weaviate client library for interacting with the Weaviate database.

- **Install LangChain**: `!pip install langchain` installs the core LangChain library, which provides tools for building applications with language models.

- **Install LangChain Community**: `!pip install langchain-community` installs additional third-party integrations for LangChain.

- **Install PyPDF**: `!pip install pypdf` installs the PyPDF library for reading and manipulating PDF files.

- **Install LangChain Groq**: `!pip install langchain-groq` installs the Groq integration for LangChain, enabling specific functionalities related to Groq.

- **Install YouTube Transcript API**: `!pip install youtube_transcript_api` installs a library to retrieve transcripts from YouTube videos.

- **Install PyPDF2**: `!pip install PYPDF2` installs another library for working with PDF files, allowing for reading and writing PDF documents.

- **Install Ollama**: `!pip install ollama` installs the Ollama library, which may be used for specific functionalities in your project.

In [None]:
# !pip install weaviate-client
# !pip install langchain
# !pip install langchain-community
# !pip install pypdf
# !pip install langchain-groq
# !pip install youtube_transcript_api
# !pip install PYPDF2
# !pip install ollama

## Importing the libraries

In [None]:
# import ollama

# # Specify the server URL and port
# server_url = "http://localhost:11434"  # Change to your desired port if needed

# # Generate a response using the llama3:latest model
# response = ollama.generate(model='llama3:latest', prompt='Explain quantum computing.', base_url=server_url)

# # Print the response
# print(response['response'])

In [None]:
import os
import re
import weaviate

# To access the API keys
from google.colab import userdata
from rich import print

# Large language model
from langchain_groq import ChatGroq
from langchain import HuggingFacePipeline

# For performing operaitons on the vector database
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever

# For reading the data from the source and creating chunks
from youtube_transcript_api import YouTubeTranscriptApi
from PyPDF2 import PdfReader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

print("Everyting Imported successfully")

## Loading the embedding model, Large language model

In [None]:
# Huggingface embedding model
embdding_model = "HuggingFaceH4/zephyr-7b-beta"

# LLM
model = ChatGroq(model="llama3-8b-8192",api_key = userdata.get('GROQ_API_KEY'))

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)

## Defining the youtube video transcript retriever

In [None]:
# Function to extract the video ID from the YouTube URL
def get_youtube_id(url):

    regex = (
        r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')
    match = re.match(regex, url)
    return match.group(6) if match else None

def get_transcript(youtube_url):

  # Calling function to get youtube id and then corresponding transcript
  result = YouTubeTranscriptApi.get_transcript(get_youtube_id(youtube_url))
  yt_captions = ""
  for item in iter(result):
    yt_captions = yt_captions + item['text'] + ""

  return yt_captions

In [None]:
print(get_transcript("https://www.youtube.com/watch?v=H_cqBjDVinw"))

In [None]:
# Path to the PDF file
path = "/content/Yuvraj_Singh.pdf"

def process_pdf(file_path,text_splitter,retriever):

  # Initialize the PDF loader
  loader = PyPDFLoader(file_path)

  # Load the PDF data and skip the first 7 pages and the last 6 pages
  all_pages = loader.load()

  # Extract the required pages, ignoring the first 7 and last 6
  filtered_pages = all_pages[7:-6]

  # Combine the filtered pages into a single string if needed
  content = " ".join([page.page_content for page in all_pages])

  # Creating the chunks and storing them in the vector database
  docs = text_splitter.create_documents(content.split("\n\n"))

  # Adding the docs to retriever
  retriever.add_documents(docs)


## Setting up the weviate vector database

In [None]:
# Define the Weaviate URL
WEAVIATE_URL = "https://mraljvvcqhqzgnfv0hmsa.c0.us-west3.gcp.weaviate.cloud"

# Initialize the Weaviate client using version 4
client = weaviate.WeaviateClient(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(userdata.get('WEAVIATE_API_KEY')),
    additional_headers={
        "X-HuggingFace-Api-Key": userdata.get('HF_TOKEN')
    },
)

def setup_db(client):
    # Deleting the existing schema if it exists
    try:
        client.schema.delete_class("RAG")
    except Exception as e:
        print(f"Error deleting class: {e}")

    # Defining the schema
    schema = {
        "classes": [
            {
                "class": "RAG",
                "description": "Documents for RAG",
                "vectorizer": "text2vec-huggingface",
                "moduleConfig": {
                    "text2vec-huggingface": {
                        "model": "sentence-transformers/all-MiniLM-L6-v2",
                        "type": "text"
                    }
                },
                "properties": [
                    {
                        "dataType": ["text"],
                        "description": "The content of the paragraph",
                        "moduleConfig": {
                            "text2vec-huggingface": {
                                "skip": False,
                                "vectorizePropertyName": False,
                            }
                        },
                        "name": "content",
                    },
                ],
            },
        ]
    }

    # Creating the schema
    try:
        client.schema.create(schema)
    except Exception as e:
        print(f"Error creating schema: {e}")

    # Defining the retriever
    retriever = WeaviateHybridSearchRetriever(
        alpha=0.5,  # defaults to 0.5, which is equal weighting between keyword and semantic search
        client=client,  # keyword arguments to pass to the Weaviate client
        index_name="RAG",  # The name of the index to use
        text_key="content",  # The name of the text key to use
        attributes=[],  # The attributes to return in the results
        create_schema_if_missing=True,
    )

    return retriever


%%time
retriever = setup_db(client)

TypeError: WeaviateClient.__init__() got an unexpected keyword argument 'url'

In [None]:
count = client.query.aggregate("RAG").with_meta_count().do()
print(f"Number of objects in the index: {count['data']['Aggregate']['RAG'][0]['meta']['count']}")

## Recursive text splitter for creating chunks

In [None]:
%%time
print("📌Here is the retrieved doc")
print("____________________________")
print(docs[0])
print("____________________________")

CPU times: user 8.68 ms, sys: 971 µs, total: 9.66 ms
Wall time: 10.4 ms


Document(page_content='Yuvraj Singh +91-6239305919\nRoll No:21BCS6343 ys2002github@gmail.com\nB.TECH(Hons) 21BCS6343@cuchd.in\nArtificial Intelligence and Machine learning Github/yuvraaj2002\nChandigarh University,Gharuan Linkedin/Yuvraj Singh\nEducation\nDegree/Certificate Institute/Board CGPA/Percentage Year\nB.Tech. (CSE) Chandigarh University, Gharuan 8.02 Jul 2021- Jul 2025\nSenior Secondary CBSE Board 91.2% Feb 2021\nExperience\n•Samsung research and development Jan 2024 - Present\nPrism Intern Remote\n–Innovated a personal document detection and masking algorithm, enhancing identification accuracy by 35%, en-\nsuring privacy regulation compliance and fortifying data security measures across diverse document types.\n•Wictronix Jun 2023 - Aug 2023\nAIML Developer Intern Remote\n–Implemented YOLO V8 for Indian road traffic management, achieving a 8% improvement in vehicle number plate\ndetection accuracy, thus increasing fine collections by 25%.\nProjects\n•InterviewX (AI Tutor)\nD

## RAG pipeline

In [23]:
def rag_pipeline(query_text,retriever,model):

  query_text = "What are the differences between the radius and ulna"

  # Retrieving the most accurate documents using the hybrid search
  vdb_context_text = retriever.invoke(query_text,score=True)[0].page_content

  # Getting the response from the LLM based on query text and retrieved context data
  response = model.invoke(f"""
  To provide an accurate and informative answer to your medical query, I will need to review the specific query text '{query_text}'
  and the context information '{vdb_context_text}' retrieved from the vector database.""")

  return response.content

query_text = input("Enter you query 🤔: ")
print(rag_pipeline(query_text,retriever,model))

Enter you query 🤔: WAL


In [None]:
print(response.content)