# Installation and Importing Dependencies

In [None]:
! pip install langchain transformers chromadb sentence-transformers tqdm
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
! pip install langchain_community
! pip install accelerate
! apt-get install poppler-utils
! apt install tesseract-ocr
! apt install ocrmypdf
! apt install poppler-utils

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/975.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/975.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/975.5 kB[0m [31m21.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m25.5 MB/s[0m eta [

In [None]:
import os, torch
from langchain_community.vectorstores import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
import subprocess
from sentence_transformers import CrossEncoder
import tempfile
from transformers import pipeline
from sentence_transformers import SentenceTransformer

In [None]:
# Print the current working directory; relative paths used later in the
# notebook (e.g. "./chroma_db") resolve against it.
! pwd

/content


In [None]:
# Sanity check that a CUDA device is visible to PyTorch; the LLM cell further
# down loads the model with device_map="cuda".
print(torch.cuda.is_available())

True


# Preprocessing

In [None]:
def process_file(file_path):
    """OCR a single PDF, extract its text and split it into chunks.

    The PDF at ``file_path`` is run through ``ocrmypdf --skip-text`` and then
    ``pdftotext``. Intermediate files live in a private temporary directory
    that is removed afterwards, so nothing is left behind in the directory
    being scanned.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        A list of text chunks produced by ``RecursiveCharacterTextSplitter``
        (chunk_size=2000, chunk_overlap=200). An empty list is returned when
        the file does not exist or when any processing step fails.
    """
    chunk_size = 2000
    chunk_overlap = 200

    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return []  # Return an empty list if the file is not found

    try:
        with tempfile.TemporaryDirectory() as work_dir:
            ocr_pdf_path = os.path.join(work_dir, "out.pdf")
            text_path = os.path.join(work_dir, "out.txt")

            # Argument lists (no shell) keep paths with spaces or quotes safe,
            # and check=True turns a failing tool into an exception instead of
            # silently continuing with missing output.
            subprocess.run(["ocrmypdf", "--skip-text", file_path, ocr_pdf_path], check=True)
            subprocess.run(["pdftotext", ocr_pdf_path, text_path], check=True)

            with open(text_path, "r", encoding="utf-8") as f:
                document = f.read()
    except FileNotFoundError:
        # Raised when an external tool or an expected output file is missing.
        print(f"File not found: {file_path}")
        return []
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return []  # Return an empty list if there's an error during processing

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(document)

def traverse_directory(directory_path):
    """Collect the text chunks of every PDF found under ``directory_path``.

    The tree is walked recursively with ``os.walk``. Each file whose name ends
    in ``.pdf`` (case-insensitive) is handed to ``process_file`` and the chunks
    it returns are appended, in walk order, to one flat list.

    Args:
        directory_path: Root directory to scan.

    Returns:
        A flat list with the chunks of all processed PDFs; empty when no PDF
        is found.
    """
    collected_chunks = []
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        pdf_names = (name for name in filenames if name.lower().endswith('.pdf'))
        for pdf_name in pdf_names:
            pdf_path = os.path.join(current_dir, pdf_name)
            collected_chunks.extend(process_file(pdf_path))
    return collected_chunks

directory_path = "/content"
splitsmaxpro = traverse_directory(directory_path)  # Use traverse_directory to process all PDFs in the directory

# The vector store is built exactly once, in the "Vectorstore" section below.
# Building it here as well would call Chroma.from_texts twice on the same
# persist_directory, inserting every chunk twice and filling the retriever's
# top-k results with duplicates.
if not splitsmaxpro:
    print("No text content found in the PDF files.")

Scanning contents: 100% 30/30 [00:00<00:00, 46.83page/s]
Start processing 2 pages concurrently
    1 skipping all processing on this page
    2 skipping all processing on this page
    3 skipping all processing on this page
    4 skipping all processing on this page
    6 skipping all processing on this page
    5 skipping all processing on this page
    7 skipping all processing on this page
    8 skipping all processing on this page
    9 skipping all processing on this page
   11 skipping all processing on this page
   10 skipping all processing on this page
   12 skipping all processing on this page
   13 skipping all processing on this page
   14 skipping all processing on this page
   15 skipping all processing on this page
   16 skipping all processing on this page
   17 skipping all processing on this page
   18 skipping all processing on this page
   20 skipping all processing on this page
   19 skipping all processing on this page
   22 skipping all processing on this page
  

# Vectorstore

In [None]:
# Sentence-embedding model used to index the chunks in Chroma; the stage-1
# re-ranker (re_rank_documents_stage_1) also reads this global to embed
# candidate documents.
embeddings= HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Embed every chunk in splitsmaxpro and store it in a Chroma collection kept
# on disk under ./chroma_db.
vectorstore = Chroma.from_texts(texts=splitsmaxpro, embedding=embeddings, persist_directory="./chroma_db")
# NOTE(review): recent Chroma releases are reported to persist automatically,
# which would make this explicit call redundant — confirm against the installed version.
vectorstore.persist()

# Similarity-search retriever that returns the 10 closest chunks for a query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

# Retriever

In [None]:
!pip install numpy
import numpy as np

!pip install scikit-learn
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
import numpy as np
from sklearn.metrics import jaccard_score

In [None]:
# Cross-encoder used by re_rank_documents_stage_2 to score (query, passage)
# pairs; loaded once at module level so the function can reuse it.
Cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def re_rank_documents_stage_1(query_embedding, documents, top_k=10):
    """Re-rank documents by cosine similarity to the query embedding.

    Args:
        query_embedding: Embedding vector of the query, produced by the same
            model as the global ``embeddings`` object.
        documents: Sequence of objects exposing a ``page_content`` string.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents, most similar first. An empty list is
        returned when ``documents`` is empty.
    """
    # cosine_similarity rejects an empty document matrix, so short-circuit.
    if len(documents) == 0:
        return []

    # Embed all candidates in one batched call instead of one call per document.
    doc_embeddings = np.array(
        embeddings.embed_documents([doc.page_content for doc in documents])
    )
    similarities = cosine_similarity([query_embedding], doc_embeddings).flatten()
    # argsort is ascending; reverse it so the highest similarity comes first.
    ranked_indices = np.argsort(similarities)[::-1]
    return [documents[idx] for idx in ranked_indices[:top_k]]

# Second stage re-ranking using CrossEncoder model
def re_rank_documents_stage_2(query, documents, top_k=5):
    """Re-rank documents with the module-level cross-encoder.

    Every document is paired with the query and scored by
    ``Cross_encoder_model.predict``; the documents with the highest scores are
    returned first.

    Args:
        query: Query text.
        documents: Sequence of objects exposing a ``page_content`` string.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents ordered by descending cross-encoder score.
    """
    pairs = []
    for candidate in documents:
        pairs.append((query, candidate.page_content))

    # Scores come from the globally defined Cross_encoder_model.
    relevance = Cross_encoder_model.predict(pairs)

    # Ascending argsort, reversed, then truncated to the requested size.
    best_first = np.argsort(relevance)[::-1][:top_k]
    return [documents[position] for position in best_first]

# End-to-end retrieval for one example query:
# vector search -> cosine re-rank (stage 1) -> cross-encoder re-rank (stage 2).
query = "What are course objectives of network?"
retrieved_docs = retriever.invoke(query)
# Embed the query with the same model that indexed the chunks.
query_embedding = embeddings.embed_query(query)
re_ranked_docs_stage_1 = re_rank_documents_stage_1(query_embedding, retrieved_docs, top_k=10)
# NOTE(review): bert_model is not referenced anywhere else in the visible part
# of the notebook — confirm whether a later cell needs it before removing.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
re_ranked_docs_stage_2 = re_rank_documents_stage_2(query, re_ranked_docs_stage_1, top_k=5)

# Show the final passages, separated by a dashed rule.
for doc in re_ranked_docs_stage_2:
    print(doc.page_content)
    print('-' * 36)



5
Course Objectives:
1. Understand computer network basic, different models used for study of computer networks, ability to
identify different designs, understanding of the issues surrounding wired and wireless Networks.
2. Design, calculate, and apply subnet masks to fulfill networking requirements and building the skills of
routing mechanisms.
3. Analyze the features and operations of various application layer protocols such as Http, DNS, SMTP and
FTP.
4. Analyze the requirements for a given organizational structure and select the most appropriate networking
architecture and technologies
5. Familiarity with the basic protocols of computer networks, and how they can be used to assist in network
design and implementation.
Course Learning Outcomes: After the course completion, the student will be able to
1. Characterize and appreciate computer networks from the view point of components and from the view point
of services
2. Display good understanding of the flow of a protocol in general

# LLM

In [None]:
# Read the Hugging Face access token from the environment instead of embedding
# it in the notebook (set HF_TOKEN before running this cell). When the variable
# is unset, None is passed and the library falls back to locally stored
# credentials, if any.
# SECURITY: the token that was previously hardcoded here has been exposed and
# should be revoked.
hf_token = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=hf_token)
# Half-precision weights placed on the GPU.
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="cuda", torch_dtype=torch.float16, token=hf_token)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Concatenate the re-ranked passages into a single context string.
docs = [doc.page_content for doc in re_ranked_docs_stage_2]
context = "\n".join(docs)

## Prompt template differs for different LLMs. This template is specific for gemma-it
prompt_template = "<bos><start_of_turn>user\nContext:\n{context}\nBased on the context, answer the Question: {question}\n<end_of_turn>\n<start_of_turn>model\n"
# Make sure to use 'query' instead of 'prompt' here
prompt = prompt_template.format(context=context, question=query)

# The template already begins with <bos>; add_special_tokens=False stops the
# tokenizer from prepending a second one. Tensors are moved to whichever device
# the model was loaded on.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
# The result is stored in `_` because the following cell decodes `_[0]`.
_ = model.generate(**inputs, max_new_tokens=256)

In [None]:
# Decode the first generated sequence held in `_` (assigned by the generate
# cell above). The printed text contains the prompt followed by the model's
# answer, with special tokens stripped.
print(tokenizer.decode(_[0], skip_special_tokens=True))

user
Context:
Course Objectives:
1. Understand computer network basic, different models used for study of computer networks, ability to
identify different designs, understanding of the issues surrounding wired and wireless Networks.
2. Design, calculate, and apply subnet masks to fulfill networking requirements and building the skills of
routing mechanisms.
3. Analyze the features and operations of various application layer protocols such as Http, DNS, SMTP and
FTP.
4. Analyze the requirements for a given organizational structure and select the most appropriate networking
architecture and technologies
5. Familiarity with the basic protocols of computer networks, and how they can be used to assist in network
design and implementation.
Course Learning Outcomes: After the course completion, the student will be able to
1. Characterize and appreciate computer networks from the view point of components and from the view point
of services
2. Display good understanding of the flow of a protoco