<a href="https://colab.research.google.com/github/yasmin249/indexing1/blob/main/indexingMain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import Dict, List
import torch

def generate_and_save_embeddings(
    json_data: str,
    output_file: str = "document_embeddings.json",
    num_parts: int = 4,
):
    """
    Generate embeddings for summaries and save both summaries and embeddings to a JSON file.

    Every document must provide the keys ``part_1`` .. ``part_<num_parts>``.
    Each part is embedded on its own, and the parts joined with single spaces
    are embedded once more as the full-document embedding. Documents are
    numbered by their position in the input (``document_id`` starts at 0).

    Args:
        json_data: Input JSON string containing the summaries (a sequence of
            objects, one per document)
        output_file: Output file path for saving the results
        num_parts: Number of ``part_<n>`` entries read from each document

    Returns:
        The path the results were written to (``output_file``).

    Raises:
        KeyError: If a document lacks one of the expected ``part_<n>`` keys.
    """
    # Single source of truth for the model name: it is used to load the model
    # and is recorded in the output metadata.
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    model = SentenceTransformer(model_name)

    # Parse input data
    documents = json.loads(json_data)
    part_keys = [f"part_{i+1}" for i in range(num_parts)]

    # Process each document
    output_data = []
    for idx, doc in enumerate(documents):
        # Look every part up once; a missing key fails here, before any encoding
        part_texts = {key: doc[key] for key in part_keys}
        full_text = ' '.join(part_texts.values())

        output_data.append({
            "document_id": idx,
            "full_text": full_text,
            "full_embedding": model.encode(full_text).tolist(),
            "parts": {
                key: {
                    "text": text,
                    "embedding": model.encode(text).tolist()
                }
                for key, text in part_texts.items()
            }
        })

    # With no documents there is no embedding to measure, so ask the model
    # for its output size instead of indexing into an empty list.
    if output_data:
        embedding_dimension = len(output_data[0]["full_embedding"])
    else:
        embedding_dimension = model.get_sentence_embedding_dimension()

    # Save to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            "metadata": {
                "model": model_name,
                "embedding_dimension": embedding_dimension,
                "total_documents": len(output_data)
            },
            "documents": output_data
        }, f, ensure_ascii=False, indent=2)

    return output_file

# Example usage: read the split summaries from disk, embed them and report
# where the result was written.
if __name__ == "__main__":
    # Replace with your actual JSON data
    with open("split_summaries.json", 'r', encoding='utf-8') as f:
        json_data = f.read()

    # No output_file is passed, so the function's default path is used
    output_file = generate_and_save_embeddings(json_data)
    print(f"Embeddings saved to {output_file}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings saved to document_embeddings.json


**search engine**

In [None]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from collections import Counter
from typing import Dict, List, Tuple
import torch

class SimilaritySearchEngine:
    """Hit-count search over pre-computed per-part document embeddings."""

    def __init__(self, embeddings_file: str):
        """
        Load the query encoder and the stored document embeddings.

        Args:
            embeddings_file: Path to the document_embeddings.json file
        """
        # Queries are encoded with this model at search time
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        # Keep the raw JSON payload around as well as the indexed view below
        with open(embeddings_file, 'r', encoding='utf-8') as handle:
            self.data = json.load(handle)

        # Index documents by id, turning every stored part embedding into a tensor
        self.documents = {}
        for entry in self.data['documents']:
            entry_id = entry['document_id']
            entry_text = entry['full_text']
            indexed_parts = {}
            for name, payload in entry['parts'].items():
                indexed_parts[name] = {
                    'text': payload['text'],
                    'embedding': torch.tensor(payload['embedding']),
                }
            self.documents[entry_id] = {
                'full_text': entry_text,
                'parts': indexed_parts,
            }

    def search(self, query: str, similarity_threshold: float = 0.5, top_k: int = 1) -> List[Dict]:
        """
        Rank documents by how many of their parts match the query.

        A part is a "hit" when its cosine similarity to the query is at least
        ``similarity_threshold``. Documents without any hit are left out of the
        result. Ties on hit count are broken by the best part similarity.

        Args:
            query: Search query
            similarity_threshold: Minimum similarity score to count as a hit
            top_k: Number of top documents to return

        Returns:
            List of dictionaries with ``document_id``, ``hit_count``,
            ``full_text`` and ``part_details`` (parts sorted by descending
            similarity)
        """
        # Encode the query once and shape it as a single-row batch
        query_row = torch.tensor(self.model.encode(query)).unsqueeze(0)
        cosine = torch.nn.functional.cosine_similarity

        hits_by_doc = {}   # only documents with at least one hit, in document order
        scored_docs = {}   # per-part similarity details for every document

        for doc_id, doc in self.documents.items():
            scored_parts = [
                {
                    'part': name,
                    'similarity': cosine(query_row, part['embedding'].unsqueeze(0)).item(),
                    'text': part['text'],
                }
                for name, part in doc['parts'].items()
            ]

            hit_total = sum(
                1 for scored in scored_parts
                if scored['similarity'] >= similarity_threshold
            )
            if hit_total:
                hits_by_doc[doc_id] = hit_total

            scored_docs[doc_id] = {
                'parts': scored_parts,
                'full_text': doc['full_text'],
            }

        def ranking_key(item):
            # Primary: hit count. Secondary: best part similarity, floored at 0.
            doc_id, hit_total = item
            best = max(p['similarity'] for p in scored_docs[doc_id]['parts'])
            return (hit_total, max(best, 0))

        ranked = sorted(hits_by_doc.items(), key=ranking_key, reverse=True)[:top_k]

        # Assemble the detailed result for each selected document
        return [
            {
                'document_id': doc_id,
                'hit_count': hit_total,
                'full_text': scored_docs[doc_id]['full_text'],
                'part_details': sorted(
                    scored_docs[doc_id]['parts'],
                    key=lambda p: p['similarity'],
                    reverse=True,
                ),
            }
            for doc_id, hit_total in ranked
        ]

def search_documents(
    query: str,
    embeddings_file: str = "document_embeddings.json",
    similarity_threshold: float = 0.5,
    top_k: int = 1
) -> List[Dict]:
    """
    One-shot helper: build a search engine from a file and run a single query.

    A new engine is constructed on every call.

    Args:
        query: Search query
        embeddings_file: Path to embeddings JSON file
        similarity_threshold: Minimum similarity score to count as a hit
        top_k: Number of top documents to return

    Returns:
        List of matched documents with hit counts and similarity details
    """
    return SimilaritySearchEngine(embeddings_file).search(
        query,
        similarity_threshold=similarity_threshold,
        top_k=top_k,
    )

# Example usage: hit-count search with a fairly strict threshold
if __name__ == "__main__":
    # Example query
    query = "what are the monthly achievements?"

    # Search documents; a part counts as a hit when its similarity is >= 0.6
    results = search_documents(
        query=query,
        similarity_threshold=0.6,
        top_k=3  # Return top 3 documents
    )

    # Print results: one header per document, then its parts in the order
    # the search returned them
    for result in results:
        print(f"\nDocument ID: {result['document_id']}")
        print(f"Hit Count: {result['hit_count']}")
        print("\nPart Similarities:")
        for part in result['part_details']:
            print(f"{part['part']}: {part['similarity']:.3f}")


Document ID: 7
Hit Count: 3

Part Similarities:
part_3: 0.665
part_2: 0.646
part_4: 0.614
part_1: 0.544


In [None]:
# Example usage: same hit-count search, but with a very permissive threshold
if __name__ == "__main__":
    # Example query
    query = "what is citizen laws?"

    # Search documents; at 0.1 almost any part similarity counts as a hit
    results = search_documents(
        query=query,
        similarity_threshold=0.1,
        top_k=3  # Return top 3 documents
    )

    # Print results: one header per document, then its parts in the order
    # the search returned them
    for result in results:
        print(f"\nDocument ID: {result['document_id']}")
        print(f"Hit Count: {result['hit_count']}")
        print("\nPart Similarities:")
        for part in result['part_details']:
            print(f"{part['part']}: {part['similarity']:.3f}")


Document ID: 8
Hit Count: 4

Part Similarities:
part_1: 0.371
part_2: 0.188
part_3: 0.186
part_4: 0.163

Document ID: 9
Hit Count: 4

Part Similarities:
part_4: 0.340
part_1: 0.313
part_2: 0.304
part_3: 0.215

Document ID: 3
Hit Count: 4

Part Similarities:
part_4: 0.251
part_2: 0.186
part_3: 0.125
part_1: 0.124


Ranking results by the sum of the similarities of all parts

In [None]:
import json
import numpy as np
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
from typing import Dict, List, Tuple
import torch

class SimilaritySearchEngine:
    """Search engine that ranks documents by the summed cosine similarity of their parts."""

    def __init__(self, embeddings_file: str):
        """
        Initialize the search engine with pre-computed embeddings.

        Args:
            embeddings_file: Path to the document_embeddings.json file
        """
        # Load the model for encoding queries
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        # Load pre-computed embeddings
        with open(embeddings_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        # Index documents by id and convert each stored part embedding to a tensor
        self.documents = {}
        for doc in self.data['documents']:
            doc_id = doc['document_id']
            self.documents[doc_id] = {
                'full_text': doc['full_text'],
                'parts': {
                    part_name: {
                        'text': part_data['text'],
                        'embedding': torch.tensor(part_data['embedding'])
                    }
                    for part_name, part_data in doc['parts'].items()
                }
            }

    def search(self, query: str, top_k: int = 1) -> List[Dict]:
        """
        Search for documents based on sum of similarities across all parts.

        Args:
            query: Search query
            top_k: Number of top documents to return

        Returns:
            List of dictionaries, best match first, each holding
            ``document_id``, ``similarity_sum``, ``average_similarity``
            (the sum divided by the number of parts the document actually
            has, or 0.0 for a document without parts), ``full_text`` and
            ``part_details`` (parts sorted by descending similarity)
        """
        # Encode the query once; shaped as a single-row batch for cosine_similarity
        query_embedding = torch.tensor(self.model.encode(query)).unsqueeze(0)

        # Store similarity details for each document
        doc_scores = []

        # Calculate similarities for each document's parts
        for doc_id, doc_data in self.documents.items():
            part_similarities = []
            similarity_sum = 0.0

            # Calculate similarity for each part
            for part_name, part_data in doc_data['parts'].items():
                similarity = torch.nn.functional.cosine_similarity(
                    query_embedding,
                    part_data['embedding'].unsqueeze(0)
                ).item()

                similarity_sum += similarity

                part_similarities.append({
                    'part': part_name,
                    'similarity': similarity,
                    'text': part_data['text']
                })

            # Average over the parts this document really has rather than
            # assuming four, so documents with another part count are not skewed.
            part_count = len(part_similarities)
            average_similarity = similarity_sum / part_count if part_count else 0.0

            doc_scores.append({
                'document_id': doc_id,
                'similarity_sum': similarity_sum,
                'average_similarity': average_similarity,
                'full_text': doc_data['full_text'],
                'part_details': sorted(
                    part_similarities,
                    key=lambda x: x['similarity'],
                    reverse=True
                )
            })

        # Sort documents by total similarity score
        ranked_docs = sorted(
            doc_scores,
            key=lambda x: x['similarity_sum'],
            reverse=True
        )[:top_k]

        return ranked_docs

def search_documents(
    query: str,
    embeddings_file: str = "document_embeddings.json",
    top_k: int = 1
) -> List[Dict]:
    """
    One-shot helper: build a search engine from a file and run a single query.

    A new engine is constructed on every call.

    Args:
        query: Search query
        embeddings_file: Path to embeddings JSON file
        top_k: Number of top documents to return

    Returns:
        List of matched documents with similarity scores and details
    """
    return SimilaritySearchEngine(embeddings_file).search(query, top_k=top_k)

# Example usage and results formatting
def print_search_results(query: str, results: List[Dict]):
    """
    Write a ranked, human-readable report of search results to stdout.

    Args:
        query: The query text shown in the report header
        results: Result dictionaries providing ``document_id``,
            ``similarity_sum``, ``average_similarity``, ``part_details``
            and ``full_text``; they are printed in the given order,
            numbered from 1
    """
    divider = "-" * 80
    preview_limit = 200

    print(f"\nSearch Query: {query}")
    print(divider)

    for rank, entry in enumerate(results, start=1):
        print(f"\nRank {rank}:")
        print(f"Document ID: {entry['document_id']}")
        print(f"Total Similarity Score: {entry['similarity_sum']:.3f}")
        print(f"Average Similarity: {entry['average_similarity']:.3f}")

        print("\nPart-by-part similarities:")
        for detail in entry['part_details']:
            print(f"- {detail['part']}: {detail['similarity']:.3f}")

        # Long texts are cut to the first 200 characters and marked with an ellipsis
        print("\nFull Text Preview:")
        if len(entry['full_text']) > preview_limit:
            preview = entry['full_text'][:preview_limit] + "..."
        else:
            preview = entry['full_text']
        print(preview)
        print(divider)

# Example run of the sum-based search; `query` and `results` stay bound at
# notebook level after this cell runs.
if __name__ == "__main__":
    # Example query
    query = "What are the functions of the Department of Justice?"

    # Search documents using the default embeddings file
    results = search_documents(
        query=query,
        top_k=3  # Return top 3 documents
    )

    # Print formatted results
    print_search_results(query, results)

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Search Query: What are the functions of the Department of Justice?
--------------------------------------------------------------------------------

Rank 1:
Document ID: 1
Total Similarity Score: 2.600
Average Similarity: 0.650

Part-by-part similarities:
- part_3: 0.692
- part_4: 0.671
- part_1: 0.660
- part_2: 0.577

Full Text Preview:
About Department | Department of Justice | India | Last updated: 19-04-2024 As per the Allocation of Business (Rules), 1961, Department of Justice is a part of Ministry of Law & Justice, Government of...
--------------------------------------------------------------------------------

Rank 2:
Document ID: 3
Total Similarity Score: 2.440
Average Similarity: 0.610

Part-by-part similarities:
- part_1: 0.760
- part_2: 0.724
- part_4: 0.553
- part_3: 0.404

Full Text Preview:
Functions of Department | Department of Justice | India | Last Updated : 19-09-2022 Appointment, resignation and removal of the Chief Justice of India, Judges of the Supreme Court of

In [None]:
# Run the search for the query that is actually reported. Previously this cell
# only printed whatever `results` an earlier cell had left behind, under a
# different query label, so the header and the ranked documents could disagree.
#
# Calling search_documents(query) without top_k returns only the best match.
query = "What are the citizen laws?"

# Get multiple matches
results = search_documents(
    query=query,
    top_k=3  # Get top 3 matches
)

# Print detailed results
print_search_results(
    query=query,
    results=results
)


Search Query: What are the citizen laws?
--------------------------------------------------------------------------------

Rank 1:
Document ID: 2
Total Similarity Score: 1.085
Average Similarity: 0.271

Part-by-part similarities:
- part_1: 0.443
- part_4: 0.331
- part_3: 0.258
- part_2: 0.052

Full Text Preview:
Vision and Mission | Department of Justice | India | VISION: Facilitating administration of Justice that ensures easy access and timely delivery of Justice to all. MISSION: Ensuring adequacy of courts...
--------------------------------------------------------------------------------

Rank 2:
Document ID: 1
Total Similarity Score: 0.428
Average Similarity: 0.107

Part-by-part similarities:
- part_4: 0.129
- part_2: 0.121
- part_3: 0.098
- part_1: 0.080

Full Text Preview:
About Department | Department of Justice | India | Last updated: 19-04-2024 As per the Allocation of Business (Rules), 1961, Department of Justice is a part of Ministry of Law & Justice, Government of...
----

Ranking by the sum of similarities while also reporting hit counts

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from typing import Dict, List

class SimilaritySearchEngine:
    """Total-similarity search that also reports per-document hit counts."""

    def __init__(self, embeddings_file: str):
        """
        Load the query encoder and the stored document embeddings.

        Args:
            embeddings_file: Path to the document_embeddings.json file
        """
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        with open(embeddings_file, 'r', encoding='utf-8') as handle:
            self.data = json.load(handle)

        # Index documents by id; stored embeddings become tensors
        self.documents = {}
        for record in self.data['documents']:
            record_id = record['document_id']
            record_text = record['full_text']
            tensor_parts = {}
            for label, stored in record['parts'].items():
                tensor_parts[label] = {
                    'text': stored['text'],
                    'embedding': torch.tensor(stored['embedding']),
                }
            self.documents[record_id] = {
                'full_text': record_text,
                'parts': tensor_parts,
            }

    def search(self, query: str, similarity_threshold: float = 0.5, top_k: int = 3) -> List[Dict]:
        """
        Rank documents by the summed similarity of their parts to the query.

        The hit count (parts whose similarity is at least
        ``similarity_threshold``) is reported for each document but does not
        affect the ordering.

        Args:
            query: Search query
            similarity_threshold: Minimum similarity score to count as a hit
            top_k: Number of top documents to return

        Returns:
            List of dictionaries with ``document_id``, ``total_similarity``,
            ``hit_count``, ``full_text`` and ``part_details`` (parts sorted by
            descending similarity), best total first
        """
        # Encode the query once and shape it as a single-row batch
        query_row = torch.tensor(self.model.encode(query)).unsqueeze(0)

        collected = []
        for doc_id, doc in self.documents.items():
            running_total = 0
            hits = 0
            scored_parts = []

            for label, part in doc['parts'].items():
                score = torch.nn.functional.cosine_similarity(
                    query_row,
                    part['embedding'].unsqueeze(0),
                ).item()

                running_total += score
                if score >= similarity_threshold:
                    hits += 1

                scored_parts.append({
                    'part': label,
                    'similarity': score,
                    'text': part['text'],
                })

            # Best-matching parts first
            scored_parts.sort(key=lambda item: item['similarity'], reverse=True)

            collected.append({
                'document_id': doc_id,
                'total_similarity': running_total,
                'hit_count': hits,
                'full_text': doc['full_text'],
                'part_details': scored_parts,
            })

        # Highest total similarity first, then keep the requested number
        collected.sort(key=lambda item: item['total_similarity'], reverse=True)
        return collected[:top_k]

def search_documents(
    query: str,
    embeddings_file: str = "document_embeddings.json",
    similarity_threshold: float = 0.5,
    top_k: int = 3
) -> List[Dict]:
    """
    One-shot helper: build a search engine from a file and run a single query
    ranked by total similarity.

    A new engine is constructed on every call.

    Args:
        query: Search query
        embeddings_file: Path to embeddings JSON file
        similarity_threshold: Minimum similarity score to count as a hit
        top_k: Number of top documents to return

    Returns:
        List of matched documents with total similarity scores and details
    """
    return SimilaritySearchEngine(embeddings_file).search(
        query,
        similarity_threshold=similarity_threshold,
        top_k=top_k,
    )

# Example usage: total-similarity ranking with hit counts reported alongside
if __name__ == "__main__":
    query = "What are the functions of the Department of Justice?"

    # Ask for the three best documents; 0.5 is the per-part hit threshold
    results = search_documents(
        query=query,
        similarity_threshold=0.5,
        top_k=3
    )

    # Print results
    for result in results:
        print(f"\nDocument ID: {result['document_id']}")
        print(f"Total Similarity Score: {result['total_similarity']:.3f}")
        print(f"Hit Count: {result['hit_count']}")
        # The "..." suffix is appended unconditionally, even for texts under 200 chars
        print(f"Full Text: {result['full_text'][:200]}...")  # Show first 200 chars
        print("\nPart Similarities:")
        for part in result['part_details']:
            print(f"{part['part']}: {part['similarity']:.3f}")


Document ID: 1
Total Similarity Score: 2.600
Hit Count: 4
Full Text: About Department | Department of Justice | India | Last updated: 19-04-2024 As per the Allocation of Business (Rules), 1961, Department of Justice is a part of Ministry of Law & Justice, Government of...

Part Similarities:
part_3: 0.692
part_4: 0.671
part_1: 0.660
part_2: 0.577

Document ID: 3
Total Similarity Score: 2.440
Hit Count: 3
Full Text: Functions of Department | Department of Justice | India | Last Updated : 19-09-2022 Appointment, resignation and removal of the Chief Justice of India, Judges of the Supreme Court of India and High Co...

Part Similarities:
part_1: 0.760
part_2: 0.724
part_4: 0.553
part_3: 0.404

Document ID: 2
Total Similarity Score: 2.385
Hit Count: 3
Full Text: Vision and Mission | Department of Justice | India | VISION: Facilitating administration of Justice that ensures easy access and timely delivery of Justice to all. MISSION: Ensuring adequacy of courts...

Part Similarities:
part_4

In [None]:
# Example usage: the same total-similarity search with a different query
if __name__ == "__main__":
    query = "What is citizen law?"

    # Ask for the three best documents; 0.5 is the per-part hit threshold
    results = search_documents(
        query=query,
        similarity_threshold=0.5,
        top_k=3
    )

    # Print results
    for result in results:
        print(f"\nDocument ID: {result['document_id']}")
        print(f"Total Similarity Score: {result['total_similarity']:.3f}")
        print(f"Hit Count: {result['hit_count']}")
        # The "..." suffix is appended unconditionally, even for texts under 200 chars
        print(f"Full Text: {result['full_text'][:200]}...")  # Show first 200 chars
        print("\nPart Similarities:")
        for part in result['part_details']:
            print(f"{part['part']}: {part['similarity']:.3f}")


Document ID: 9
Total Similarity Score: 1.117
Hit Count: 0
Full Text: Acts and Rules | Department of Justice | India | Last Updated : 02-07-2024 | Acts, And, Rules, Acts and Rules # Acts and Rules Last Updated : 02-07-2024 | Sl No. | Title | Document | Division Dealing ...

Part Similarities:
part_1: 0.325
part_4: 0.296
part_2: 0.292
part_3: 0.205

Document ID: 8
Total Similarity Score: 1.017
Hit Count: 0
Full Text: Citizens’ Charter | Department of Justice | India | Last Updated : 01-04-2024 | Citizens’, Charter, Citizens’ Charter # Citizens’ Charter Last Updated : 01-04-2024 Here is a concise summary of the doc...

Part Similarities:
part_1: 0.418
part_3: 0.214
part_4: 0.195
part_2: 0.191

Document ID: 2
Total Similarity Score: 0.996
Hit Count: 0
Full Text: Vision and Mission | Department of Justice | India | VISION: Facilitating administration of Justice that ensures easy access and timely delivery of Justice to all. MISSION: Ensuring adequacy of courts...

Part Similarities:
part_4

SyntaxError: invalid syntax (<ipython-input-16-9355f788a3b4>, line 1)