In [18]:
import re
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch

def clean_pdf_text(text):
    """
    Clean text extracted from PDFs with two-column academic paper format.
    """
    # First pass: handle column merging issues and basic cleaning
    # Remove arXiv headers
    text = re.sub(r'arXiv:\d+\.\d+v\d+\s+\[\w+\.\w+\]\s+\d+\s+\w+\s+\d+', '', text)
    
    # Replace multiple newlines and spaces with single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove page numbers (common patterns in academic papers)
    text = re.sub(r'\n\d+\n', ' ', text)
    text = re.sub(r'\s+\d+\s+$', '', text, flags=re.MULTILINE)
    
    # Remove figure and table references
    text = re.sub(r'Fig\.\s*\d+\.?', 'Figure', text)
    text = re.sub(r'Figure\s*\d+\.?', 'Figure', text)
    text = re.sub(r'Table\s*\d+\.?', 'Table', text)
    text = re.sub(r'\(Figure\s*\d+\)', '(Figure)', text)
    text = re.sub(r'\(Table\s*\d+\)', '(Table)', text)
    
    # Handle column separation markers
    text = re.sub(r'\|\s*\|', ' ', text)
    
    # Second pass: handle two-column specific issues
    
    # Split into lines to process column issues
    lines = text.split('\n')
    processed_lines = []
    
    # Process each line
    for line in lines:
        # Check for patterns that indicate column mixing
        # (e.g., if line has very large gaps between words)
        if re.search(r'\S\s{10,}\S', line):
            # Split at large whitespace gaps (likely column separators)
            parts = re.split(r'\s{10,}', line)
            processed_lines.extend(parts)
        else:
            processed_lines.append(line)
    
    # Rejoin and clean up again
    text = ' '.join(processed_lines)
    text = re.sub(r'\s+', ' ', text)
    
    # Remove common headers/footers in academic papers
    text = re.sub(r'Proceedings of .*? \d{4}', '', text)
    text = re.sub(r'©\s*\d{4}.*?rights reserved', '', text, flags=re.IGNORECASE)
    
    return text.strip()
# 2. Text chunking function
def chunk_text(text, chunk_size=512, overlap=50):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = ' '.join(words[i:i+chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks

# 3. Create embeddings
def get_embeddings(chunks, model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, padding=True, truncation=True, 
                          max_length=512, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        # Use mean pooling for sentence embedding
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    
    return np.array(embeddings)

# 4. Build FAISS index - UPDATED
def build_faiss_index(embeddings):
    # Get dimension from the embeddings
    d = embeddings.shape[1]
    print(f"Creating FAISS index with dimension {d}")
    
    # Convert to float32 if not already
    embeddings = embeddings.astype(np.float32)
    
    try:
        # Try to use HNSW index (better but requires newer FAISS)
        index = faiss.IndexHNSWFlat(d, 32)
        index.hnsw.efConstruction = 200
        index.hnsw.efSearch = 128
        print("Using HNSW index for better performance")
    except:
        # Fall back to standard index if HNSW is not available
        print("HNSW index not available, falling back to FlatL2")
        index = faiss.IndexFlatL2(d)
    
    # Add vectors to the index
    index.add(embeddings)
    return index

# Main process
with open("dataset/PDF/resnet.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Clean text
cleaned_text = clean_pdf_text(text)

# Split into chunks
chunks = chunk_text(cleaned_text)

# Create embeddings
embeddings = get_embeddings(chunks)

# Build FAISS index
index = build_faiss_index(embeddings)

# Save the index and chunks for future use
faiss.write_index(index, "resnet_index.faiss")

# Save chunks to reference back when retrieving
import pickle
with open("resnet_chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

Creating FAISS index with dimension 768
Using HNSW index for better performance


In [19]:
import faiss
import pickle
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Load the saved index and chunks
index = faiss.read_index("resnet_index.faiss")
with open("resnet_chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

print(f"FAISS index contains {index.ntotal} vectors")
print(f"Number of chunks: {len(chunks)}")

# Function to embed a query using the same model as before
def embed_query(query, model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    inputs = tokenizer(query, padding=True, truncation=True, 
                      max_length=512, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Use mean pooling for sentence embedding
    query_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
    # Convert to float32 and reshape if needed
    return np.array([query_embedding], dtype=np.float32)

# Function to search the index
def search_index(query, k=3):
    query_embedding = embed_query(query)
    distances, indices = index.search(query_embedding, k)
    
    print(f"Query: {query}")
    print(f"Top {k} results:")
    
    for i, (idx, dist) in enumerate(zip(indices[0], distances[0])):
        print(f"\nResult {i+1} (distance: {dist:.4f}):")
        print(f"Text: {chunks[idx][:]}...")
    
    return indices, distances

FAISS index contains 22 vectors
Number of chunks: 22


In [22]:
# Test with a sample query
test_query = "What are the authors of this paper?"
results = search_index(test_query, k=15)

Query: What are the authors of this paper?
Top 15 results:

Result 1 (distance: 26.7716):
Text: T. Raiko, H. Valpola, and Y. LeCun. Pushing stochas- S. Guadarrama, and T. Darrell. Caffe: Convolutional architecture for tic gradient towards second-order methods–backpropagation learn- fast feature embedding. arXiv:1408.5093, 2014. ing with transformations in nonlinearities. In Neural Information [20] A. Krizhevsky. Learning multiple layers of features from tiny im- Processing, 2013. ages. Tech Report, 2009. [48] A. Vedaldi and B. Fulkerson. VLFeat: An open and portable library [21] A. Krizhevsky, I. Sutskever, and G. Hinton. Imagenet classification of computer vision algorithms, 2008. with deep convolutional neural networks. In NIPS, 2012. [49] W. Venables and B. Ripley. Modern applied statistics with s-plus. [22] Y. LeCun, B. Boser, J. S. Denker, D. Henderson, R. E. Howard, 1999. W. Hubbard, and L. D. Jackel. Backpropagation applied to hand- [50] M. D. Zeiler and R. Fergus. Visualizing a