   # DEEPDIVE AI RAG

In [41]:
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer, pipeline
import os
import faiss
import streamlit as st
import numpy as np
import re

# 1.Data Handling

In [29]:
def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

def structure_text(text):
    sections = {
        "Abstract": "",
        "Introduction": "",
        "Methodology": "",
        "Results and Discussion": "",
        "Conclusion": ""
    }
    
    # Define keywords to identify sections
    keywords = {
        "Abstract": ["abstract"],
        "Introduction": ["introduction"],
        "Methodology": ["methodology", "methods"],
        "Results and Discussion": ["results", "discussion"],
        "Conclusion": ["conclusion"]
    }
    
    # Split text into lines
    lines = text.split('\n')
    current_section = None
    
    for line in lines:
        # Check if the line contains a section keyword
        for section, keys in keywords.items():
            if any(key.lower() in line.lower() for key in keys):
                current_section = section
                break
        
        if current_section:
            sections[current_section] += line + " "
    
    return sections

# PDF Links
pdf_files = [
    "C:/Users/anand/Desktop/DeepDive AI Project/2501.12948v1.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/2407.21783v3.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/2312.11805v4.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/2303.08774v6.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/1706.03762v7.pdf"
]

for pdf_file in pdf_files:
    pdf_text = extract_text_from_pdf(pdf_file)
    structured_text = structure_text(pdf_text)
    
    print(f"Text from {pdf_file[:30]}...")
    for section, content in structured_text.items():
        print(f"{section}:\n{content[:500]}...\n")  

Text from C:/Users/anand/Desktop/DeepDiv...
Abstract:
Abstract We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super- vised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and fu...

Introduction:
1 Introduction 3 1.1 Contributions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 4 21. Introduction In recent years, Large Language Models (LLMs) have been undergoing rapid iteration and evolution (Anthropic, 2024; Google, 2024; OpenAI, 2024a), progressively diminishing the gap towards Artificial General Intelligence (AGI). Recently, post-training has emerged as an important component of the 

# 2.Data Preprocessing

In [31]:
# Step-1 Clean and tokenize the extracted text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+\.\d+', '', text)
    text = re.sub(r'Page \d+', '', text)  # Remove page number metadata
    return text

def tokenize_text(text):
    sentences = sent_tokenize(text)
    return sentences


# Step-2 Chunk the content into manageable sections (200-500 words)
def chunk_text(text, chunk_size=300):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks


# Step-3 Add metadata such as section headings and page numbers
def preprocess_text(file_path):
    text = extract_text_from_pdf(file_path)
    structured_text = structure_text(text)
    
    for section, content in structured_text.items():
        if content:
            cleaned_text = clean_text(content)
            tokenized_text = tokenize_text(cleaned_text)
            chunks = chunk_text(cleaned_text)
            
            print(f"Section: {section}")
            for i, chunk in enumerate(chunks):
                print(f"Chunk {i+1}:\n{chunk}\n")

pdf_files = [
    "C:/Users/anand/Desktop/DeepDive AI Project/2501.12948v1.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/2407.21783v3.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/2312.11805v4.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/2303.08774v6.pdf",
    "C:/Users/anand/Desktop/DeepDive AI Project/1706.03762v7.pdf"
]

for pdf_file in pdf_files:
    preprocess_text(pdf_file)

Section: Abstract
Chunk 1:
Abstract We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super- vised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek- R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama. AIME 2024 (Pass@1)Codeforces (Percentile)GPQA Diamond (Pass@1)MATH-500 (Pas

# 3.Embedding Creation

In [33]:
# To create sentence vector representations of the chunks
# Load pre-trained model
model = SentenceTransformer('model Load')

def create_embeddings(chunks):
    embeddings = model.encode(chunks)
    return embeddings

In [34]:
# Store embeddings in a vector database for efficient retrieval
def store_embeddings(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index


def preprocess_text(file_path):
    text = extract_text_from_pdf(file_path)
    structured_text = structure_text(text)
    
    chunks = []
    metadata = []
    
    for section, content in structured_text.items():
        if content:
            cleaned_text = clean_text(content)
            chunked_text = chunk_text(cleaned_text)
            chunks.extend(chunked_text)
            metadata.extend([(section, i) for i in range(len(chunked_text))])
    
    embeddings = create_embeddings(chunks)
    index = store_embeddings(embeddings)
    
    return index, metadata

all_embeddings = []
all_metadata = []

for pdf_file in pdf_files:
    index, metadata = preprocess_text(pdf_file)
    all_embeddings.append(index)
    all_metadata.append(metadata)

In [35]:
# Print All Metadata
for i, metadata in enumerate(all_metadata):
    print(f"\nMetadata for PDF file {i+1}:\n")
    for section, chunk_id in metadata:
        print(f"Section: {section}, Chunk ID: {chunk_id}")


Metadata for PDF file 1:

Section: Abstract, Chunk ID: 0
Section: Introduction, Chunk ID: 0
Section: Methodology, Chunk ID: 0
Section: Methodology, Chunk ID: 1
Section: Methodology, Chunk ID: 2
Section: Methodology, Chunk ID: 3
Section: Results and Discussion, Chunk ID: 0
Section: Results and Discussion, Chunk ID: 1
Section: Results and Discussion, Chunk ID: 2
Section: Results and Discussion, Chunk ID: 3
Section: Results and Discussion, Chunk ID: 4
Section: Results and Discussion, Chunk ID: 5
Section: Results and Discussion, Chunk ID: 6
Section: Results and Discussion, Chunk ID: 7
Section: Results and Discussion, Chunk ID: 8
Section: Results and Discussion, Chunk ID: 9
Section: Results and Discussion, Chunk ID: 10
Section: Results and Discussion, Chunk ID: 11
Section: Results and Discussion, Chunk ID: 12
Section: Results and Discussion, Chunk ID: 13
Section: Results and Discussion, Chunk ID: 14
Section: Results and Discussion, Chunk ID: 15
Section: Results and Discussion, Chunk ID: 16

# 4. Query Processing

In [60]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Set your Hugging Face API token
hf_token = 'you api key'
os.environ[''] = '1'  # Suppress symlink warning

# Load Hugging Face models
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", token=hf_token)
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased", token=hf_token)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Load your pre-trained SentenceTransformer model for semantic search
semantic_model = SentenceTransformer('Load Model')

# More detailed chunks of text from research papers
research_chunks = [
    """Attention Is All You Need: This paper introduces the Transformer model which relies on self-attention mechanisms instead of recurrent neural networks. 
    The Transformer allows for much more parallelization, improving training speed. The key innovation is the self-attention mechanism, which allows the model to weigh the importance of different words in a sentence when generating a response.""",
    
    """DeepSeek-R1: This paper discusses the use of reinforcement learning to improve reasoning capabilities in large language models (LLMs). 
    Reinforcement learning allows the model to receive rewards for correct predictions, thereby improving its performance over time. The DeepSeek-R1 model is designed to handle complex reasoning tasks more effectively than previous models.""",
    
    """GPT-4 Technical Report: The technical report outlines the architecture and improvements in the GPT-4 model, including increased model size, better training data, and more sophisticated training techniques. 
    These improvements result in a model that is more accurate, robust, and capable of understanding and generating human-like text across a wide range of tasks.""",
    
    """The Llama 3 Herd of Models: This paper presents the Llama 3 series of models, known for their efficiency and robustness. 
    The Llama 3 models leverage a mixture of experts' approach, where different parts of the model are specialized for different tasks, leading to better overall performance and resource utilization.""",
    
    """Gemini: This research introduces a family of multimodal models capable of handling text, image, and audio inputs. 
    The Gemini models are designed to integrate information from multiple modalities, allowing them to perform complex tasks that require understanding and generating content across different types of data."""
]

# Function to perform semantic search
def retrieve_chunks_from_vector_db(query_embedding, database_embeddings, top_k=5):
    similarities = cosine_similarity(query_embedding, database_embeddings)
    top_k_indices = np.argsort(similarities[0])[::-1][:top_k]
    return [research_chunks[i] for i in top_k_indices]

# Function to process a single query
def process_single_query(query, database_chunks, database_embeddings):
    query_embedding = semantic_model.encode([query])
    relevant_chunks = retrieve_chunks_from_vector_db(query_embedding, database_embeddings)
    context = " ".join(relevant_chunks)
    
    # Pass the context and query to the Hugging Face model for generating a response
    response = qa_pipeline(question=query, context=context)
    return response['answer']

# Function to process multiple queries
def process_queries(queries, database_chunks, database_embeddings):
    responses = {}
    for query in queries:
        responses[query] = process_single_query(query, database_chunks, database_embeddings)
    return responses

# Encode the research chunks
database_embeddings = semantic_model.encode(research_chunks)

# Quries To Raise
queries = [
    "Explain the concept of transformers in AI",
    "What is the importance of self-attention mechanisms?",
    "How does reinforcement learning improve reasoning capabilities?",
    "What are the key improvements in the GPT-4 model?",
    "What makes the Llama 3 series of models efficient?",
    "How can multimodal models handle text, image, and audio inputs?",
    "What are the applications of Transformer models in NLP?",
    "How do attention mechanisms compare to traditional RNNs and LSTMs in performance?",
    "What challenges and solutions exist in training large-scale language models?",
    "How do recent advancements in AI models impact various industries, such as healthcare and finance?"
]
responses = process_queries(queries, research_chunks, database_embeddings)
for query, response in responses.items():
    print(f"Query: {query}\nResponse: {response}\n")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Query: Explain the concept of transformers in AI
Response: are designed to integrate information from multiple modalities,

Query: What is the importance of self-attention mechanisms?
Response: are designed to integrate information from multiple modalities,

Query: How does reinforcement learning improve reasoning capabilities?
Response: ,

Query: What are the key improvements in the GPT-4 model?
Response: are designed to integrate information from multiple modalities,

Query: What makes the Llama 3 series of models efficient?
Response: The Llama 3 Herd

Query: How can multimodal models handle text, image, and audio inputs?
Response: The Gemini models are designed to integrate information from multiple modalities,

Query: What are the applications of Transformer models in NLP?
Response: The Gemini models are designed to integrate information from multiple modalities,

Query: How do attention mechanisms compare to traditional RNNs and LSTMs in performance?
Response: is the self-attentio