In [0]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise the user's question.
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# The question arrives through a notebook widget; the second argument is the
# placeholder text shown (and returned) until the user types something.
# For a quick manual test you can instead hard-code e.g. query = "what is law?".
dbutils.widgets.text("user_query", "Enter your query here...")
query = dbutils.widgets.get("user_query")

# Encode the query as a one-element batch; later cells flatten the result
# to a 1-D vector before comparing it with the stored embeddings.
query_batch = [query]
query_embedding = model.encode(query_batch, convert_to_tensor=False)



In [0]:
# Delta table that stores one row per sentence together with its embedding.
EMBEDDINGS_TABLE = "main.default.pdf_full_embeddings"
df = spark.read.table(EMBEDDINGS_TABLE)

# Bring only the vector column to the driver.
# Despite its name, `embeddings_df` is the list of Row objects returned by
# collect(), not a DataFrame; the name is kept because later cells use it.
embedding_column = df.select("embedding")
embeddings_df = embedding_column.collect()

In [0]:
import numpy as np
from scipy.spatial.distance import cosine

# Rank every stored sentence against the query by cosine similarity and show
# the five best matches.
#
# The table is collected exactly ONCE here. Both the embedding vectors and the
# full records are taken from this same list of rows, so an index produced by
# the ranking always refers to the row whose embedding was scored. Previously
# the records were fetched with a separate df.collect() per index, which
# re-scanned the table five times and relied on two independent collects
# returning rows in the same order, which Spark does not guarantee.
all_rows = df.collect()
embedding_list = [np.array(row['embedding']) for row in all_rows]

# Flatten the (1, dim) query batch once instead of on every iteration.
query_vector = query_embedding.flatten()

# scipy's `cosine` is a distance, so similarity = 1 - distance.
similarities = [
    1 - cosine(query_vector, embedding.flatten())
    for embedding in embedding_list
]

# Get top 5 most similar results (argsort is ascending, so take the tail and reverse)
top_indices = np.argsort(similarities)[-5:][::-1]
top_similarities = [similarities[i] for i in top_indices]
# Guard against an empty table: there is no "best" score in that case.
highest_similarity = top_similarities[0] if top_similarities else None
# Look the winning rows up in the rows we already hold in memory.
top_records = [all_rows[i] for i in top_indices]
# if highest_similarity < 0.5:
#     print("Sorry, your provided query is irrelevant.")
# else:
    # Output the results
print("Top 5 most similar results:")
for idx, record in enumerate(top_records):
    print(f"Record {idx + 1}:")
    print(f"Similarity: {top_similarities[idx]}")
    print(f"Record Data: {record.asDict()}")

# from sentence_transformers import SentenceTransformer
# from databricks.vector_search.client import VectorSearchClient
# vsc = VectorSearchClient()
# # Step 1: Initialize the embedding model
# model = SentenceTransformer('all-mpnet-base-v2')

# # Step 2: Get the query vector
# query_text = "Greek myths"
# query_vector = model.encode(query_text).tolist()  # Convert to a list to avoid ambiguity

# # Step 3: Get all columns from the source table
# all_columns = spark.table('pdf_full_embeddings').columns
# index = vsc.get_index(endpoint_name="pdf_full", index_name="main.default.pdf_full7")
# # Step 4: Perform the similarity search using the embedding
# results = index.similarity_search(
#     query_vector=query_vector,  # Use the query vector instead of query_text
#     columns=all_columns,  # Columns to return
#     num_results=2  # Number of results to fetch
# )
# print(results)


Top 5 most similar results:
Record 1:
Similarity: 0.4735286280446025
Record Data: {'sentence': 'Sanjay Dhotre, Minister of State for Human Resource Development, Government of India • Smt', 'embedding': [0.00483748922124505, -0.0004678840341512114, -0.03179922327399254, -0.014636948704719543, 0.014287316240370274, 0.02959616854786873, -0.0021025431342422962, 0.01618211343884468, 0.05961788445711136, 0.009966056793928146, 0.06169329583644867, 0.056400541216135025, -0.005108684301376343, -0.015346253290772438, 0.050512053072452545, -0.04520334303379059, 0.03491941839456558, 0.024732477962970734, -0.05682522803544998, 0.015602813102304935, -0.054072294384241104, 0.01707088202238083, -0.0034754155203700066, 0.033533964306116104, -0.03777691349387169, -0.011163841001689434, 0.02554253116250038, 0.04053341597318649, 0.007758460007607937, -0.04779573902487755, 0.03379390388727188, -0.029471542686223984, 0.03336489573121071, -0.0598849393427372, 1.617632619854703e-06, -0.009716427884995937, 0.0

In [0]:
# Stitch the retrieved sentences into one block of text that later cells use
# as the LLM context. Every sentence is followed by a single space, so the
# paragraph also ends with one.
results_paragraph = "".join(f"{record['sentence']} " for record in top_records)

# Show the combined paragraph
print(results_paragraph)


Sanjay Dhotre, Minister of State for Human Resource Development, Government of India • Smt Pradesh Ramesh Pokhriyal 'Nishank', Minister for Human Resource Development, Government of India • Sh in Tamil Nadu state Manoj Ahuja, IAS, Chairman CBSE, Former Special Director Lal Bahadur Shastri National Academy of 


In [0]:
import re
# A capital letter standing alone at a word boundary, followed by whitespace
# and another letter, is glued onto that following letter (whitespace removed).
# NOTE(review): this also merges genuine one-letter words such as "A" or "I"
# with the word after them; confirm that is acceptable for this corpus.
split_capital = re.compile(r'\b([A-Z])\s+([a-zA-Z])')
results_paragraph = split_capital.sub(r'\1\2', results_paragraph)

# Show the cleaned-up text
print(results_paragraph)


Sanjay Dhotre, Minister of State for Human Resource Development, Government of India • Smt Pradesh Ramesh Pokhriyal 'Nishank', Minister for Human Resource Development, Government of India • Sh in Tamil Nadu state Manoj Ahuja, IAS, Chairman CBSE, Former Special Director Lal Bahadur Shastri National Academy of 


INTENT EXTRACTION

In [0]:
import mlflow.deployments

#query="Explain me about law"
# Ask the chat endpoint to label the user's query with a single intent.
client_intent = mlflow.deployments.get_deploy_client("databricks")

# Few-shot prompt: two worked question/answer pairs, then the real query in the
# final `question:` slot so the model completes the matching `answer:` line.
# (The slot used to be left empty, unlike the strategy-selection prompt, which
# fills its final slot with the value it asks about.)
inputs = {
    "messages": [
        {
            "role": "user",
            "content": 
            f"""You are a Intent Generator from the user query
            you should Provide the Intent for the query : {query} 

            ### Instructions
            1. Only provide the intent, no extra information or suggestions.
            2. Use the following format: [Intent Name] Intent
            3. If the query is unclear or nonsensical, respond with "Unrecognized Intent"

            question: explain about electric vehicles
            answer: Explanation Intent

            question: jwfbiefeler
            answer: Unrecognized Intent

            question: {query}
            answer:
            """
        }
    ],
    # The answer is a short label, so a small token budget is enough;
    # temperature 0 keeps the label stable between runs.
    "max_tokens":30,
    "temperature": 0
}

response_intent = client_intent.predict(endpoint="databricks-meta-llama-3-1-405b-instruct", inputs=inputs)
# The reply text lives in the first choice of the chat-completion response.
print(response_intent["choices"][0]['message']['content'])

Information Retrieval Intent


STRATEGY SELECTION

In [0]:
import mlflow.deployments

# Intent label produced by the intent-extraction step.
intent1 = response_intent["choices"][0]['message']['content']

# Deployment client used for the strategy-selection request.
client_strategy = mlflow.deployments.get_deploy_client("databricks")

# Prompt asking for exactly one "Strategy: ..." line for the given intent.
strategy_prompt = f"""you are strategy selection bot
                you shoud provide the strategy that ca be used for the intent : {intent1}. 

            ### Instructions
            1. Only provide the strategy, no extra information or details.
            2. Use the exact format: Strategy: [Strategy Name]
            3. Choose a strategy that best fits the intent.
            4. If the intent is unclear or nonsensical, respond with "Unrecognized Strategy"

            Example:
            Intent: Summarization
            Strategy: Summarization Techniques to concisely summarize information.

            Intent: {intent1}
            Strategy:
            """

# Chat payload: a single user message, short deterministic completion.
strategy_message = {"role": "user", "content": strategy_prompt}
inputs_strategy = {
    "messages": [strategy_message],
    "max_tokens": 60,
    "temperature": 0,
}

# Send the request and keep the full response for the next step.
response_strategy = client_strategy.predict(
    endpoint="databricks-meta-llama-3-1-405b-instruct",
    inputs=inputs_strategy,
)

# Show the selected strategy line
print(response_strategy["choices"][0]['message']['content'])

Strategy: Query-Based Search Algorithm


STRATEGY DESIGN

In [0]:
import mlflow.deployments

# Inputs for the strategy-design step: the retrieved paragraph (stored as
# `context`) and the strategy line returned by the strategy-selection step.
context = results_paragraph
strategy_name = response_strategy["choices"][0]['message']['content']

# Deployment client for this step. The request below now goes through this
# client; previously it was created but never used, and the call went through
# the `client_strategy` variable defined in another cell.
client_strategy_design = mlflow.deployments.get_deploy_client("databricks")

# Chat payload asking the model to expand the chosen strategy into a prompt
# design. Two bracketed examples show the expected style.
# NOTE(review): this rebinds `inputs_strategy`, the same name the
# strategy-selection cell uses for its own payload.
inputs_strategy = {
    "messages": [
        {
            "role": "user",
            "content": 
            f"""you are strategy designer bot
                you shoud provide the strategy design that is based on the 
                strategy:{strategy_name}
                Just provide the design so that it matches the strategy

            ### Instructions
            1. Only provide the strategy design, no extra information or details.
            2. Use the exact format
            3. Choose a strategy that best fits the strategy_name.
            4. If the intent is unclear or nonsensical, respond with "Unrecognized Strategy"

            Example:
            [You are a good expert related to the strategy name and provide response based on the strategy design and context
            Summarization Techniques to concisely summarize information.

            To summarize a given piece of text or information into a concise format without losing key points.
            Identify the key sections of the text (e.g., introduction, main body, conclusion).
            Extract the primary theme or main point of each section.
            
            Identify the most critical sentences that represent the main ideas in each section.
            Prioritize sentences with direct relevance to the overall message.
   
            Condense each key sentence by removing unnecessary details, redundant information, and examples.
            Keep only essential phrases, focusing on the core meaning.
                        
            Ensure the summarized output flows logically, maintaining coherence.
            Retain a structure that mirrors the original content (beginning, middle, and end).

            Ensure no important information has been omitted.
            The summary should be clear, concise, and provide an accurate representation of the original text.]

            Example: 
            [You’re a knowledgeable AI researcher with extensive experience in artificial intelligence, machine learning, and their societal impacts. Your role is to simplify complex concepts and provide insights into how AI is shaping various industries and aspects of daily life.

            Your task is to explain the fundamentals of AI.

            Please cover the following areas:
            - Definition of artificial intelligence
            - Key concepts (e.g., machine learning, neural networks)
            - Current applications in industries (e.g., healthcare, finance, transportation)
            - Ethical considerations and challenges associated with AI

            Keep in mind that I am looking for a clear and engaging overview suitable for someone with minimal background in technology, emphasizing real-world implications and examples.]
             """
        }
    ],
    "max_tokens": 200,
    "temperature": 0
}

# Request the strategy design for the selected strategy.
response_strategy_design = client_strategy_design.predict(endpoint="databricks-meta-llama-3-1-405b-instruct", inputs=inputs_strategy)

# Show the generated strategy design
print(response_strategy_design["choices"][0]['message']['content'])

**Query-Based Search Algorithm Strategy Design**

**Objective:** 
Design an efficient search algorithm to retrieve relevant information from a database or data storage system based on user queries.

**Key Components:**

1. **Query Analysis**
   - Parse the user query to identify key terms and phrases.
   - Determine the intent behind the query (e.g., informational, navigational, transactional).

2. **Indexing**
   - Create an index of the database or data storage system.
   - Ensure the index includes metadata and keywords associated with each data entry.

3. **Search Execution**
   - Match the parsed query terms against the indexed database.
   - Use algorithms such as Boolean search, fuzzy search, or natural language processing (NLP) to find relevant matches.

4. **Ranking and Filtering**
   - Rank search results based on relevance, using metrics such as keyword frequency, data freshness, and user behavior.
   - Filter results to eliminate duplicates, irrelevant information, or low-q

In [0]:
# Guard-rail rules appended to the final prompt. They are kept as a tuple so
# individual rules are easy to edit; numbering ("1) ", "2) ", ...) is generated.
instruction_rules = (
    "Provide a response to the query strictly following the given strategy.",
    "Stay within the context; do not include any information outside of it.",
    "Keep the responses concise and free from unnecessary elaboration.",
    'Avoid hallucinations. if you are unsure or do not know the answer, say "I don\'t know" or "I\'m not sure".',
    "Do not generate or assume any additional information beyond the context",
    "Remain neutral and unbiased in all responses.",
    "If the context is insufficient to answer the query, clearly indicate so.",
    "Don't respond if query is unclear or nonsensical.",
    "Ask relevant questions when intent or strategy is not clear.",
)

# The text starts with a newline and every rule ends with one.
instructions = "\n" + "".join(
    f"{number}) {rule}\n" for number, rule in enumerate(instruction_rules, start=1)
)

In [0]:
# Final prompt = strategy design + guard-rail instructions + task line
# (with the user's query) + retrieved context + a closing list of rules.
strategy_design_text = response_strategy_design["choices"][0]['message']['content']
instructions_section = f"Instructions: {instructions}"
task_section = f"you are a response bot. based on the strategy and below context, provide a response to the query: {query}\n"
context_section = f"context: {context}"

# Closing rules, written with explicit "\n" so the trailing spaces that are
# part of the prompt text stay visible. The block ends with four spaces.
closing_rules = (
    "\n"
    "\n"
    "Instructions follwoed: \n"
    "1) You are a good expert in producing resonse related to context.\n"
    "2) Use this prompt strategy to produce a prompt \n"
    "3) use query if context not clear\n"
    "4) Don't get side tracked\n"
    "5) If the intent is not clear ask relevant questions on query\n"
    "6) Provide response according to the template provided  \n"
    "7) Use context and knowledge don't produce any suggestions or explanations\n"
    "8) Provide a good response don't include any context info provided just give optimal response\n"
    "9) In response don't include headings of what you did\n"
    "10) Remember user doesn't know the context and don't mention about the context.\n"
    "    "
)

# Sections are separated by single newlines; the closing rules follow the
# context directly.
template = "\n".join(
    [strategy_design_text, instructions_section, task_section, context_section]
) + closing_rules

In [0]:
#template=template.replace("*","")

In [0]:
# Show the fully assembled prompt for manual inspection.
print(template)

**Query-Based Search Algorithm Strategy Design**

**Objective:** 
Design an efficient search algorithm to retrieve relevant information from a database or data storage system based on user queries.

**Key Components:**

1. **Query Analysis**
   - Parse the user query to identify key terms and phrases.
   - Determine the intent behind the query (e.g., informational, navigational, transactional).

2. **Indexing**
   - Create an index of the database or data storage system.
   - Ensure the index includes metadata and keywords associated with each data entry.

3. **Search Execution**
   - Match the parsed query terms against the indexed database.
   - Use algorithms such as Boolean search, fuzzy search, or natural language processing (NLP) to find relevant matches.

4. **Ranking and Filtering**
   - Rank search results based on relevance, using metrics such as keyword frequency, data freshness, and user behavior.
   - Filter results to eliminate duplicates, irrelevant information, or low-q

In [0]:
import mlflow.deployments

# Final generation step: send the assembled prompt to the Mixtral endpoint.
# NOTE(review): `client_intent` and `inputs` rebind the names used by the
# intent-extraction cell; the names are kept as they are.
client_intent = mlflow.deployments.get_deploy_client("databricks")

final_message = {"role": "user", "content": f"{template}"}
inputs = {
    "messages": [final_message],
    "max_tokens":512,
    "temperature": 0
}

response_from_llm = client_intent.predict(
    endpoint="databricks-mixtral-8x7b-instruct",
    inputs=inputs,
)
first_choice = response_from_llm["choices"][0]
answer = first_choice['message']['content']

# The notebook's exit value is the answer and the prompt joined by the literal
# marker "rgukt_basar" (a plain string, despite the variable name).
json_array = "rgukt_basar".join([answer, template])
dbutils.notebook.exit(json_array)

In [0]:
# Display the model's reply.
# NOTE(review): the preceding cell ends with dbutils.notebook.exit(...), so on
# a top-to-bottom run this cell is presumably never reached; confirm whether
# it is only meant for interactive use.
print(answer)

 The Chief Minister of Telangana is K. Chandrashekar Rao.


1. Artificial Intelligence
2. Electric vehicle
3. Law Book
4. Environment
5. Electronic Devices
6. Mental Health 
7. Software Industry
8. Economics
9. Health and Hygiene
10. Space Information
