In [4]:
import os
import pickle
from typing import Dict, List
from sentence_transformers import SentenceTransformer
import torch

In [5]:



class VectorGenerator:
    """Embed text chunks with a SentenceTransformer model and persist the result.

    Attributes:
        model: The SentenceTransformer instance used to encode chunk text.
        vectors: Maps an integer vector index to the embedding returned by
            ``model.encode`` for that chunk.
        chunk_map: Maps the same integer index to a dict with the keys
            ``'document'``, ``'chunk_index'`` and ``'text'``.
        embedding_dir: Directory in which ``vectors.pkl`` is written and read.
    """

    def __init__(self, model_name: str = 'sentence-transformers/all-mpnet-base-v2', embedding_dir: str = '../embeddings'):
        """Load the embedding model and make sure the output directory exists.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            embedding_dir: Directory used by ``save_vectors``/``load_vectors``;
                created here if it does not exist yet.
        """
        self.model = SentenceTransformer(model_name)
        self.vectors = {}
        self.chunk_map = {}  # Maps vector index to original document and chunk
        self.embedding_dir = embedding_dir

        # Ensure the embeddings directory exists
        os.makedirs(self.embedding_dir, exist_ok=True)

    def read_chunks(self, parsed_content_dir: str) -> Dict[str, List[str]]:
        """Load chunks from the ``.txt`` files in the parsed content directory.

        Files are expected to be named ``<document>_chunk_<n>.txt``. Chunks are
        grouped by ``<document>`` and returned in ascending numeric ``<n>``
        order, so list positions are reproducible and follow the chunk numbers
        encoded in the filenames (``os.listdir`` itself returns entries in
        arbitrary order, and a plain string sort would put ``_chunk_10``
        before ``_chunk_2``).

        Args:
            parsed_content_dir: Directory containing the chunk text files.

        Returns:
            A dict mapping each document name to its ordered list of chunk
            texts. Files not ending in ``.txt`` are ignored.

        Raises:
            FileNotFoundError: If ``parsed_content_dir`` does not exist
                (raised by ``os.listdir``).
        """
        extension = '.txt'
        grouped = {}  # document name -> list of (sort_key, chunk_text)

        # Sorting the listing makes both document order and the relative order
        # of non-numbered files deterministic across runs and platforms.
        for file_name in sorted(os.listdir(parsed_content_dir)):
            if not file_name.endswith(extension):
                continue

            with open(os.path.join(parsed_content_dir, file_name), 'r', encoding='utf-8') as f:
                chunk_text = f.read()

            stem = file_name[:-len(extension)]

            # Group by base filename without the trailing "_chunk_<n>" suffix.
            # A name with fewer than two underscores would otherwise yield an
            # empty base name and merge unrelated files, so fall back to the stem.
            base_name = "_".join(file_name.split('_')[:-2]) or stem

            # Numbered chunks sort numerically; anything else goes after them
            # and keeps its (sorted) filename order thanks to the stable sort.
            suffix = stem.rsplit('_', 1)[-1]
            sort_key = (0, int(suffix)) if suffix.isdecimal() else (1, 0)

            grouped.setdefault(base_name, []).append((sort_key, chunk_text))

        documents = {}
        for base_name, entries in grouped.items():
            entries.sort(key=lambda entry: entry[0])
            documents[base_name] = [text for _, text in entries]
        return documents

    def generate_embeddings(self, documents: Dict[str, List[str]], batch_size: int = 32):
        """Generate embeddings for all document chunks.

        Fills ``self.vectors`` and ``self.chunk_map`` in place, keyed by a
        running integer index. The index restarts at 0 on every call, so
        calling this method again overwrites entries from the previous call
        that share an index.

        Args:
            documents: Mapping of document name to its list of chunk texts,
                as returned by ``read_chunks``.
            batch_size: Number of chunks passed to ``model.encode`` at a time.
        """
        current_index = 0

        for doc_name, chunks in documents.items():
            print(f"Generating embeddings for {doc_name}")

            # Generate embeddings in batches
            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i + batch_size]
                # Inference only: no gradients are needed.
                with torch.no_grad():
                    embeddings = self.model.encode(batch)

                # Store embeddings and mapping; i + j is the chunk's position
                # within its document's list.
                for j, embedding in enumerate(embeddings):
                    self.vectors[current_index] = embedding
                    self.chunk_map[current_index] = {
                        'document': doc_name,
                        'chunk_index': i + j,
                        'text': chunks[i + j]
                    }
                    current_index += 1

            print(f"Generated {len(chunks)} embeddings for {doc_name}")

    def save_vectors(self):
        """Pickle ``vectors`` and ``chunk_map`` together into ``<embedding_dir>/vectors.pkl``."""
        combined_data_path = os.path.join(self.embedding_dir, 'vectors.pkl')
        data = {
            'vectors': self.vectors,
            'chunk_map': self.chunk_map,
        }

        with open(combined_data_path, 'wb') as file:
            pickle.dump(data, file)

        print(f"Saved combined vectors and chunk map to {combined_data_path}")

    def load_vectors(self):
        """Replace ``vectors`` and ``chunk_map`` with the contents of ``<embedding_dir>/vectors.pkl``.

        Raises:
            FileNotFoundError: If ``vectors.pkl`` has not been written yet.
            KeyError: If the pickle lacks the ``'vectors'`` or ``'chunk_map'`` key.
        """
        combined_data_path = os.path.join(self.embedding_dir, 'vectors.pkl')

        # SECURITY: pickle.load can execute arbitrary code. Only load files
        # that this class wrote itself; never point embedding_dir at
        # untrusted data.
        with open(combined_data_path, 'rb') as file:
            data = pickle.load(file)

        self.vectors = data['vectors']
        self.chunk_map = data['chunk_map']

        print(f"Loaded vectors and chunk map from {combined_data_path}")




In [6]:

if __name__ == "__main__":
    # Driver: read chunk files, embed them, persist the result, print a summary.
    parsed_content_dir = '../parsed_content'
    embedding_dir = '../embeddings'

    # Initialize the vector generator
    vector_gen = VectorGenerator(embedding_dir=embedding_dir)

    # Load chunks
    print("Loading chunks from parsed content directory...")
    documents = vector_gen.read_chunks(parsed_content_dir)

    # Generate vectors
    print("Generating embeddings...")
    vector_gen.generate_embeddings(documents)

    # Save vectors
    print("Saving embeddings...")
    vector_gen.save_vectors()

    # Print some statistics
    print("\nVector Generation Summary:")
    print(f"Total vectors generated: {len(vector_gen.vectors)}")

    # next() without a default raises StopIteration when no vectors were
    # generated (e.g. the input directory held no chunk files); report that
    # case instead of crashing after the save.
    first_vector = next(iter(vector_gen.vectors.values()), None)
    if first_vector is None:
        print("Vector dimension: n/a (no vectors generated)")
    else:
        print(f"Vector dimension: {len(first_vector)}")

Loading chunks from parsed content directory...
Generating embeddings...
Generating embeddings for tsla-20231231-gen
Generated 434 embeddings for tsla-20231231-gen
Generating embeddings for uber-10-k-2023
Generated 683 embeddings for uber-10-k-2023
Generating embeddings for goog-10-k-2023 (1)
Generated 337 embeddings for goog-10-k-2023 (1)
Saving embeddings...
Saved combined vectors and chunk map to ../embeddings/vectors.pkl

Vector Generation Summary:
Total vectors generated: 1454
Vector dimension: 768
