In [27]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class InvertedIndexer:
    """TF-IDF document index with cosine-similarity search.

    ``vectorizer`` is the ``TfidfVectorizer`` used to turn text into vectors
    and ``inverted_index`` holds whatever ``vectorizer.fit_transform``
    returned for the indexed documents (``None`` until an index has been
    built or loaded).
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.inverted_index = None

    def fit_transform(self, documents):
        """
        Fit the vectorizer and transform documents into a matrix.

        The resulting matrix is stored on ``self.inverted_index``; nothing is
        returned.
        """
        self.inverted_index = self.vectorizer.fit_transform(documents)

    def save_index(self, file_path):
        """
        Save the index matrix and the fitted vectorizer to a pickle file.

        Two pickle records are written back to back: the matrix first, then
        the vectorizer. Readers that perform a single ``pickle.load`` still
        get just the matrix, so the file stays readable by code written
        against the older matrix-only format.
        """
        with open(file_path, 'wb') as f:
            pickle.dump(self.inverted_index, f)
            # Without the fitted vectorizer a freshly constructed indexer
            # could load the matrix but not transform queries against it.
            pickle.dump(self.vectorizer, f)

    def load_index(self, file_path):
        """
        Load the index (and, when present, the vectorizer) from a pickle file.

        Files containing only the matrix are still accepted; in that case the
        current ``self.vectorizer`` is left untouched.

        SECURITY: ``pickle.load`` can execute arbitrary code while
        deserializing. Only load index files from trusted sources.
        """
        with open(file_path, 'rb') as f:
            loaded_index = pickle.load(f)
            try:
                loaded_vectorizer = pickle.load(f)
            except EOFError:
                # Legacy file: no second record, so keep the vectorizer
                # this instance already has.
                loaded_vectorizer = self.vectorizer
        # Assign only after both reads succeeded so a corrupt file cannot
        # leave the instance half-updated.
        self.inverted_index = loaded_index
        self.vectorizer = loaded_vectorizer

    def search(self, query, top_n=5):
        """
        Search the index for the top N documents most similar to the query.

        Returns an array of document positions ordered from most to least
        similar, containing at most ``top_n`` entries (empty when ``top_n``
        is zero or negative). The relative order of documents with equal
        similarity follows ``argsort`` and should not be relied upon.

        Raises ``ValueError`` if no index has been built or loaded.
        """
        # Transform first so an unfitted vectorizer keeps raising its own
        # error exactly as it did before.
        query_vector = self.vectorizer.transform([query])
        if self.inverted_index is None:
            # cosine_similarity(X, None) would compare the query with
            # itself and silently report document 0.
            raise ValueError(
                "No index available: call fit_transform() or load_index() first."
            )
        similarities = cosine_similarity(query_vector, self.inverted_index)
        ranked = similarities.argsort()[0][::-1]
        # Slicing from the front of the descending ranking avoids the
        # `[-0:]` pitfall, which returns every document for top_n == 0.
        return ranked[:max(top_n, 0)]

# Demonstration: build an index, round-trip it through disk, run one query.
if __name__ == "__main__":
    # Tiny corpus of near-identical sentences (word order differs).
    documents = [
        "The quick brown fox jumps over the lazy dog",
        "The quick brown dog jumps over the lazy fox",
        "The lazy dog jumps over the quick brown fox",
        "The lazy fox jumps over the quick brown dog",
        "The quick brown fox jumps over the lazy dog",
    ]
    index_path = "inverted_index.pkl"

    # Build the TF-IDF index over the corpus.
    indexer = InvertedIndexer()
    indexer.fit_transform(documents)

    # Persist the index, then read it straight back; the reload is not
    # required for the search below and is shown only for illustration.
    indexer.save_index(index_path)
    indexer.load_index(index_path)

    # Ask for the two best matches and print their positions in `documents`.
    query = "The quick brown fox"
    top_indices = indexer.search(query, top_n=2)
    print(f"Top 2 documents for query '{query}': {top_indices}")


Top 2 documents for query 'The quick brown fox': [4 3]
