# Semantic Search on Policy Documents using Transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

# Name of the pretrained sentence-embedding checkpoint used throughout the notebook.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Instantiate the encoder once; later cells reuse `model` to embed text.
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Free-text question that the search cells below match against the corpus.
query = "tax reform during financial crisis"

# Toy corpus: one short policy-related statement per entry.
# The list order matters, because later cells report matches by list position.
documents = [
    "The government reduced tariffs on agricultural imports in 1998.",
    "A major reform in 2002 led to an increase in VAT rates.",
    "In 2010, a new fiscal policy focused on education spending was implemented.",
    "Tariffs on electronics were removed under the 2015 free trade agreement.",
    "Corporate taxes were adjusted in response to the financial crisis of 2008.",
    "Foreign investment incentives were introduced in the energy sector.",
    "A subsidy program was initiated to support local manufacturers.",
    "The budget deficit led to a restructuring of public debt obligations.",
    "Import duties on technology were lifted to promote innovation.",
    "Tax cuts were proposed to stimulate consumer spending.",
]


In [None]:
# Embed the corpus and the query as tensors so they can be compared directly.
doc_embeddings = model.encode(documents, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# Cosine similarity between the query and every document.
# The result is indexed below as cosine_scores[0][i], i.e. row 0 holds the
# scores for the single query.
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)

# Position of the highest-scoring document, as a plain Python int.
top_result = int(cosine_scores.argmax())

# Report the query, its best match, and that match's similarity.
best_document = documents[top_result]
best_score = cosine_scores[0][top_result]
print(f"Query: {query}")
print(f"Top Match: {best_document}")
print(f"Score: {best_score:.4f}")

In [None]:
import torch

# Rank the documents by similarity to the query and list the best few.
# Row 0 of `cosine_scores` holds the scores for the single query.
scores = cosine_scores[0]

# Show at most 3 results, but never request more than there are scores:
# torch.topk raises when k exceeds the size of the dimension, so a corpus
# with fewer than 3 documents would otherwise break this cell.
TOP_K = 3
top_results = torch.topk(scores, k=min(TOP_K, scores.shape[0]))

# Print each selected document with its similarity score, best first
# (torch.topk returns values sorted in descending order by default).
for score, idx in zip(top_results.values, top_results.indices):
    # `idx` is a 0-dim tensor; convert explicitly before indexing the list.
    print(f"Document: {documents[int(idx)]}")
    print(f"Similarity Score: {score:.4f}")
    print("---")

In [None]:
import pandas as pd

# Collect the top-ranked documents and their scores as plain Python values.
top_docs = [documents[position] for position in top_results.indices.tolist()]
top_scores = top_results.values.tolist()

# Assemble the results table, then order it from most to least similar.
results_df = pd.DataFrame(
    {'Document': top_docs, 'Similarity Score': top_scores}
)
results_df = results_df.sort_values(by='Similarity Score', ascending=False)

# Bare last expression so the notebook renders the table.
results_df