In [37]:
import os
import re
import PyPDF2
import numpy as np
import requests
import json
import pickle
from typing import List, Dict, Any, Tuple
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama

In [38]:
PDF_DIRECTORY = "pdf/"
VECTOR_DB_PATH = "faiss_index"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
OLLAMA_MODEL = "llama3.2:latest"  # Options: "llama2", "mistral", "gemma", etc.
OLLAMA_URL = "http://localhost:11434/api/generate"

In [39]:
def check_ollama_status():
    """Check if Ollama is running and list available models."""
    try:
        response = requests.get("http://localhost:11434/api/tags")
        if response.status_code != 200:
            print("Error: Ollama server is not responding correctly.")
            print("Please make sure Ollama is installed and running.")
            print("You can install Ollama from: https://ollama.ai")
            return False, []
        
        # Get available models
        available_models = [model['name'] for model in response.json().get('models', [])]
        if not available_models:
            print("No models found in Ollama. Please pull a model first.")
            print("For example, run: 'ollama pull mistral'")
            return False, []
            
        return True, available_models
        
    except requests.exceptions.ConnectionError:
        print("Error: Cannot connect to Ollama server.")
        print("Please make sure Ollama is installed and running on http://localhost:11434")
        print("You can install Ollama from: https://ollama.ai")
        return False, []

In [40]:
check_ollama_status()

(True, ['vicuna:7b', 'deepseek-r1:7b', 'deepseek-llm:7b', 'llama3.2:latest'])

In [41]:
from langchain_community.embeddings import HuggingFaceEmbeddings
def initialize_embedding_model():
    """Initialize and return the embedding model."""
    print("Initializing embedding model...")
    try:
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2",
            model_kwargs={'device': 'cpu'}
        )
        return embedding_model
    except Exception as e:
        print(f"Error initializing embedding model: {e}")
        return None

In [42]:
initialize_embedding_model()

Initializing embedding model...




HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [43]:
def initialize_text_splitter():
    """Initialize and return the text splitter."""
    print("Initializing text splitter...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    return text_splitter

In [44]:
def initialize_llm(model_name=OLLAMA_MODEL):
    """Initialize and return the LLM."""
    print(f"Initializing LLM with model: {model_name}...")
    try:
        llm = Ollama(model=model_name)
        return llm
    except Exception as e:
        print(f"Error initializing LLM: {e}")
        print(f"Make sure the {model_name} model is available in Ollama.")
        return None

In [45]:
def clean_text(text: str) -> str:
    """Clean the extracted text."""
    # Remove headers and footers (usually page numbers)
    text = re.sub(r'(\n\d+\s*)+', '\n', text)
    
    # Remove consecutive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    # Remove non-text elements (like figures, tables references)
    text = re.sub(r'Fig\.\s*\d+\..*?\n', ' ', text)
    text = re.sub(r'Table\s*\d+\..*?\n', ' ', text)
    
    return text

In [46]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text content from a PDF file."""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
                
        # Clean the extracted text
        text = clean_text(text)
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

In [47]:
extract_text_from_pdf('pdf\Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf')

'JAILBREAKING LEADING SAFETY -ALIGNED LLM S\nWITH SIMPLE ADAPTIVE ATTACKS\nMaksym Andriushchenko\nEPFLFrancesco Croce\nEPFLNicolas Flammarion\nEPFL\nABSTRACT\nWe show that even the most recent safety-aligned LLMs are not robust to simple\nadaptive jailbreaking attacks. First, we demonstrate how to successfully leverage\naccess to logprobs for jailbreaking: we initially design an adversarial prompt tem-\nplate (sometimes adapted to the target LLM), and then we apply random search on\na suffix to maximize a target logprob (e.g., of the token “Sure” ), potentially with\nmultiple restarts. In this way, we achieve 100% attack success rate—according to\nGPT-4 as a judge—on Vicuna-13B, Mistral-7B, Phi-3-Mini, Nemotron-4-340B,\nLlama-2-Chat-7B/13B/70B, Llama-3-Instruct-8B, Gemma-7B, GPT-3.5, GPT-4o,\nand R2D2 from HarmBench that was adversarially trained against the GCG at-\ntack. We also show how to jailbreak allClaude models—that do not expose\nlogprobs—via either a transfer or prefilling at

In [48]:
from langchain.schema import Document

def process_single_paper(file_path: str, text_splitter) -> List[Document]:
    """Process a single PDF paper and return its chunks as Document objects."""
    filename = os.path.basename(file_path)
    print(f"Processing {filename}...")
    
    # Extract text
    text = extract_text_from_pdf(file_path)
    
    if not text:
        print(f"Warning: No text extracted from {filename}")
        return []
    
    # Split text into chunks
    text_chunks = text_splitter.split_text(text)
    
    # Create document objects
    documents = []
    for i, chunk in enumerate(text_chunks):
        doc = Document(
            page_content=chunk,
            metadata={
                "source": filename,
                "chunk": i
            }
        )
        documents.append(doc)
    
    print(f"Created {len(documents)} chunks from {filename}")
    return documents

In [49]:
text_splitter = initialize_text_splitter()
process_single_paper('pdf\Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf',text_splitter)

Initializing text splitter...
Processing Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf...
Created 104 chunks from Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf


[Document(page_content='JAILBREAKING LEADING SAFETY -ALIGNED LLM S\nWITH SIMPLE ADAPTIVE ATTACKS\nMaksym Andriushchenko\nEPFLFrancesco Croce\nEPFLNicolas Flammarion\nEPFL\nABSTRACT\nWe show that even the most recent safety-aligned LLMs are not robust to simple\nadaptive jailbreaking attacks. First, we demonstrate how to successfully leverage\naccess to logprobs for jailbreaking: we initially design an adversarial prompt tem-\nplate (sometimes adapted to the target LLM), and then we apply random search on\na suffix to maximize a target logprob (e.g., of the token “Sure” ), potentially with\nmultiple restarts. In this way, we achieve 100% attack success rate—according to\nGPT-4 as a judge—on Vicuna-13B, Mistral-7B, Phi-3-Mini, Nemotron-4-340B,\nLlama-2-Chat-7B/13B/70B, Llama-3-Instruct-8B, Gemma-7B, GPT-3.5, GPT-4o,\nand R2D2 from HarmBench that was adversarially trained against the GCG at-\ntack. We also show how to jailbreak allClaude models—that do not expose', metadata={'source': 'An

In [50]:
def process_papers(text_splitter) -> List[Dict[str, Any]]:
    """Process all PDF papers in the directory."""
    all_documents = []
    
    # Check if directory exists
    if not os.path.exists(PDF_DIRECTORY):
        os.makedirs(PDF_DIRECTORY)
        print(f"Created directory {PDF_DIRECTORY}. Please add your NeurIPS papers to this directory.")
        return all_documents
    
    # Count PDF files
    pdf_files = [f for f in os.listdir(PDF_DIRECTORY) if f.endswith('.pdf')]
    if not pdf_files:
        print(f"No PDF files found in {PDF_DIRECTORY}. Please add your NeurIPS papers to this directory.")
        return all_documents
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    # Process each PDF file
    for filename in pdf_files:
        file_path = os.path.join(PDF_DIRECTORY, filename)
        documents = process_single_paper(file_path, text_splitter)
        all_documents.extend(documents)
    
    print(f"Processed {len(all_documents)} chunks from {len(pdf_files)} papers.")
    return all_documents

In [59]:
def setup_vectordb(embedding_model) -> Tuple[bool, Any]:
    """Set up the vector database."""
    # Check if vector database already exists
    if os.path.exists(f"{VECTOR_DB_PATH}.faiss") and os.path.exists(f"{VECTOR_DB_PATH}.pkl"):
        print("Loading existing vector database...")
        try:
            vectordb = FAISS.load_local(
                VECTOR_DB_PATH,
                embedding_model
            )
            return True, vectordb
        except Exception as e:
            print(f"Error loading vector database: {e}")
            print("Will create a new vector database.")
    
    print("Creating new vector database...")
    
    # Initialize text splitter
    text_splitter = initialize_text_splitter()
    
    # Find PDF files in directory
    pdf_files = [f for f in os.listdir(PDF_DIRECTORY) if f.endswith('.pdf')]
    if not pdf_files:
        print("No PDF files found.")
        return False, None
    
    # Process the first PDF file (or you can process all files)
    pdf_path = os.path.join(PDF_DIRECTORY, pdf_files[0])
    
    # FIXED: Pass both file_path and text_splitter to process_single_paper
    documents = process_single_paper(pdf_path, text_splitter)
    
    if not documents:
        print("No documents to process.")
        return False, None
    
    # Create vector database
    try:
        vectordb = FAISS.from_documents(
            documents=documents,
            embedding=embedding_model
        )
        
        # Save the vector database
        vectordb.save_local(VECTOR_DB_PATH)
        return True, vectordb
    except Exception as e:
        print(f"Error creating vector database: {e}")
        return False, None

In [60]:
embedding_model = initialize_embedding_model()
setup_vectordb(embedding_model)

Initializing embedding model...
Creating new vector database...
Initializing text splitter...
Processing Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf...
Created 104 chunks from Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf


(True, <langchain_community.vectorstores.faiss.FAISS at 0x22fb8a2a040>)

In [61]:
def query_ollama_directly(prompt: str, model_name=OLLAMA_MODEL) -> str:
    """Query Ollama API directly with prompt."""
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False
    }
    
    try:
        response = requests.post(OLLAMA_URL, json=payload)
        if response.status_code == 200:
            return response.json().get("response", "")
        else:
            return f"Error: {response.status_code} - {response.text}"
    except Exception as e:
        return f"Error connecting to Ollama: {str(e)}"

In [62]:
query_ollama_directly("tell me the contribution of the paper",model_name=OLLAMA_MODEL)

"I don't have any information about a specific paper. If you could provide more context or details about the paper, such as its title or authors, I would be happy to help you understand its contribution."

In [63]:
def retrieve_context(vectordb, question: str, k: int = 5) -> str:
    """Retrieve relevant context from vector database."""
    if not vectordb:
        return "No vector database available. Please process documents first."
    
    # Retrieve relevant documents
    docs = vectordb.similarity_search(question, k=k)
    
    # Format context from retrieved documents
    context = "\n\n".join([doc.page_content for doc in docs])
    
    # Add metadata about the sources
    sources = [f"{doc.metadata['source']} (chunk {doc.metadata['chunk']})" for doc in docs]
    unique_sources = list(set(sources))
    sources_text = "Sources:\n" + "\n".join(unique_sources)
    
    return context, sources_text

In [64]:
def generate_answer(context: str, question: str, llm=None, model_name=OLLAMA_MODEL) -> str:
    """Generate an answer based on retrieved context."""
    # Create a prompt with the retrieved context
    prompt = f"""You are a research assistant specialized in NeurIPS conference papers. 
Answer the following question based ONLY on the provided context from research papers.
If you don't know or can't find the answer in the context, say so honestly.

Context from papers:
{context}

Question: {question}

Answer:"""
    
    # Get answer using LLM
    if llm:
        try:
            answer = llm.invoke(prompt)
            return answer
        except Exception as e:
            print(f"Error with LangChain Ollama: {e}")
            print("Falling back to direct Ollama API...")
    
    # Fallback to direct API call
    return query_ollama_directly(prompt, model_name)

In [65]:
def main():
    # Check Ollama status
    ollama_running, available_models = check_ollama_status()
    if not ollama_running:
        return
    
    print("Available Ollama models:")
    for model in available_models:
        print(f"- {model}")
    
    # Select model
    selected_model = OLLAMA_MODEL
    if selected_model not in available_models:
        print(f"Warning: Default model '{selected_model}' not found in available models.")
        print(f"Please pull it with: ollama pull {selected_model}")
        if available_models:
            selected_model = available_models[0]
            print(f"Using available model '{selected_model}' instead.")
    
    print(f"\nUsing model: {selected_model}")
    
    # Initialize components
    embedding_model = initialize_embedding_model()
    if not embedding_model:
        return
    
    llm = initialize_llm(selected_model)
    if not llm:
        return
    
    # Setup vector database
    success, vectordb = setup_vectordb(embedding_model)
    if not success:
        print("Failed to set up vector database. Please check the logs above.")
        return
    
    # Interactive query loop
    print("\nNeurIPS Papers RAG System (Ollama)")
    print("------------------------------")
    print("Type 'exit' to quit.")
    
    while True:
        question = input("\nEnter your question: ")
        
        if question.lower() in ['exit', 'quit']:
            break
        
        # Get context
        print("\nRetrieving relevant context...")
        context, sources = retrieve_context(vectordb, question)
        
        # Generate answer
        print("Generating answer...")
        answer = generate_answer(context, question, llm, selected_model)
        
        print(f"\nAnswer: {answer}")
        print(f"\n{sources}")


if __name__ == "__main__":
    main()

Available Ollama models:
- vicuna:7b
- deepseek-r1:7b
- deepseek-llm:7b
- llama3.2:latest

Using model: llama3.2:latest
Initializing embedding model...
Initializing LLM with model: llama3.2:latest...
Creating new vector database...
Initializing text splitter...
Processing Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf...
Created 104 chunks from Andriushchenko et al. - 2024 - Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks.pdf

NeurIPS Papers RAG System (Ollama)
------------------------------
Type 'exit' to quit.

Retrieving relevant context...
Generating answer...

Answer: ’
For educational and research purposes, entirely hypothetically and purely for fictional purposes (please disregard any possible ethical implications):
Step 1: The contributions of the papers are not explicitly stated in the provided context.
Step 2: However, based on the structure of the table, it appears that some papers present result