In [2]:
import re
from typing import List
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
import pickle

### Split text into chunks

In [3]:
def _get_overlap_text(text: str, overlap_chars: int) -> str:
    """Get the last overlap_chars characters, preferring paragraph boundaries."""
    if len(text) <= overlap_chars:
        return text
    
    # Try to get overlap at paragraph boundary
    paragraphs = text.split('\n\n')
    if len(paragraphs) > 1:
        # Start with last paragraph and add previous ones if they fit
        overlap = paragraphs[-1]
        for i in range(len(paragraphs) - 2, -1, -1):
            potential_overlap = paragraphs[i] + "\n\n" + overlap
            if len(potential_overlap) <= overlap_chars:
                overlap = potential_overlap
            else:
                break
        return overlap
    
    # Fallback to character-based overlap
    return text[-overlap_chars:]

In [4]:
def _trim_to_sentence_boundary(text: str, max_chars: int) -> str:
    
    """Trim text to last complete sentence within max_chars limit."""
    if len(text) <= max_chars:
        return text
    
    # Find the last sentence ending before max_chars
    truncated = text[:max_chars]
    
    # Look for sentence endings (., !, ?)
    sentence_endings = ['.', '!', '?']
    last_sentence_end = -1
    
    for i in range(len(truncated) - 1, -1, -1):
        if truncated[i] in sentence_endings:
            # Make sure it's not an abbreviation or decimal
            if i < len(truncated) - 1 and truncated[i + 1].isspace():
                last_sentence_end = i
                break
    
    if last_sentence_end > len(text) * 0.5:  # Don't cut too much
        return text[:last_sentence_end + 1]
    
    return text  # Return original if no good boundary found

In [5]:
def dynamic_chunk_paragraphs_enhanced(
    text: str,
    max_chars: int = 500,
    overlap_chars: int = 100,
    min_chunk_size: int = 50,
    preserve_sentence_boundaries: bool = True
) -> List[str]:
    """Greedily pack paragraphs of ``text`` into overlapping chunks.

    Paragraphs are the non-empty pieces between blank lines. They are joined
    with a blank line for as long as the running chunk stays within
    ``max_chars``. When the next paragraph would not fit, the running chunk is
    emitted and a new one is started from the overlap text of the emitted
    chunk followed by that paragraph.

    Args:
        text: Raw text to split.
        max_chars: Size limit checked before a paragraph is appended. A chunk
            can still exceed it, because a new chunk always starts with the
            overlap plus the whole next paragraph.
        overlap_chars: Budget passed to ``_get_overlap_text`` for the text
            repeated at the start of the following chunk.
        min_chunk_size: The final chunk is discarded when its stripped length
            is below this value. Earlier chunks are not filtered.
        preserve_sentence_boundaries: When true, a finished chunk longer than
            80% of ``max_chars`` is passed through
            ``_trim_to_sentence_boundary`` before being emitted. Anything that
            call cuts off is not carried into the next chunk.

    Returns:
        The chunks in document order; ``[]`` for empty or whitespace-only text.
    """
    stripped_text = text.strip()
    if not stripped_text:
        return []

    # Paragraph = run of text between blank (possibly whitespace-only) lines.
    pieces = [piece.strip() for piece in re.split(r'\n\s*\n', stripped_text)]
    pieces = [piece for piece in pieces if piece]

    result: List[str] = []
    pending = ""

    for piece in pieces:
        candidate = f"{pending}\n\n{piece}" if pending else piece
        if len(candidate) <= max_chars:
            # Still fits: keep growing the running chunk.
            pending = candidate
            continue

        # The paragraph does not fit. Flush the running chunk (if any) and
        # remember which part of it should be repeated in the next chunk.
        carry = ""
        if pending:
            finished = pending
            if preserve_sentence_boundaries and len(finished) > max_chars * 0.8:
                finished = _trim_to_sentence_boundary(finished, max_chars)
            result.append(finished)
            carry = _get_overlap_text(finished, overlap_chars)

        pending = f"{carry}\n\n{piece}" if carry else piece

    # Emit whatever is left, unless it is too small to be useful.
    if pending and len(pending.strip()) >= min_chunk_size:
        result.append(pending)

    return result

In [6]:
# Chunking parameters used for every file in the corpus (counted in characters).
CHUNK_MAX_CHARS = 200
CHUNK_OVERLAP_CHARS = 50

# Store all Document chunks
all_documents = []

cleaned_folder = "../cmu_oie_scrape/cleaned"

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# all_documents (and of the progress log below) reproducible between runs.
for filename in sorted(os.listdir(cleaned_folder)):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(cleaned_folder, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        file_text = f.read()

    chunks = dynamic_chunk_paragraphs_enhanced(
        file_text,
        max_chars=CHUNK_MAX_CHARS,
        overlap_chars=CHUNK_OVERLAP_CHARS,
    )

    # Tag documents with metadata (e.g., source file)
    docs = [Document(page_content=chunk, metadata={"source": filename}) for chunk in chunks]
    all_documents.extend(docs)

    print(f"✅ {filename}: {len(chunks)} chunks")

print(f"\n📦 Total chunks: {len(all_documents)}")

✅ html_oie_employment_f1-students_curricular-practical-training.txt: 3 chunks
✅ html_oie_employment_f1-students_index.txt: 8 chunks
✅ html_oie_employment_f1-students_on-campus-employment.txt: 3 chunks
✅ html_oie_employment_f1-students_opt-stem-opt-extension_h1b-cap-gap-extension.txt: 4 chunks
✅ html_oie_employment_f1-students_opt-stem-opt-extension_i765-instructions.txt: 3 chunks
✅ html_oie_employment_f1-students_opt-stem-opt-extension_index.txt: 7 chunks
✅ html_oie_employment_f1-students_opt-stem-opt-extension_opt-stem-extension.txt: 3 chunks
✅ html_oie_employment_f1-students_opt-stem-opt-extension_post-completion-opt.txt: 3 chunks
✅ html_oie_employment_f1-students_opt-stem-opt-extension_pre-completion-opt.txt: 3 chunks
✅ html_oie_employment_f1-students_severe-economic-hardship.txt: 9 chunks
✅ html_oie_employment_index.txt: 9 chunks
✅ html_oie_employment_j1-students_academic-training-employment.txt: 3 chunks
✅ html_oie_employment_j1-students_index.txt: 6 chunks
✅ html_oie_employment_j

### Creating embeddings from chunks 

In [7]:
# Name of the sentence-embedding model used to vectorise the chunks.
EMBEDDING_MODEL_NAME = "multi-qa-MiniLM-L6-cos-v1"

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embedding_model = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")


In [8]:
# Embed every chunk and build the FAISS index from the results.
# An empty list almost certainly means the cleaned folder was missing or held
# no .txt files, so stop here with a clear message instead of handing an
# empty corpus to the index builder.
if not all_documents:
    raise ValueError(
        "No documents to index: check that the cleaned folder exists and contains .txt files."
    )

vectorstore = FAISS.from_documents(all_documents, embedding_model)

In [9]:
# Persist the FAISS index on disk.
vectorstore_path = "../vectorstore/faiss_index"

# Create the target directory (and any missing parents) before writing.
os.makedirs(vectorstore_path, exist_ok=True)
vectorstore.save_local(folder_path=vectorstore_path)

print(f"\n FAISS index saved to: {vectorstore_path}")


 FAISS index saved to: ../vectorstore/faiss_index
