In [1]:
import sqlite3
def initialize_db(db_name="context_DPM_data.db"):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    
    # Create a table to store contexts
    cursor.execute('''CREATE TABLE IF NOT EXISTS contexts
                     (id INTEGER PRIMARY KEY, context TEXT)''')
    conn.commit()
    return conn, cursor

# Connect to DB
conn, cursor = initialize_db()

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.02MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 486kB/s]


In [13]:
def get_answer_and_confidence(context, question):
    # Format the question and context for T5
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Get the output tokens from the model using the generate method
    outputs = model.generate(inputs["input_ids"], num_return_sequences=1)

    # Convert the generated token IDs to a string
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Get logits for the generated tokens
    with torch.no_grad():
        logits = model(inputs["input_ids"], decoder_input_ids=outputs).logits

    # Compute the probabilities from the logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    real_probs = torch.gather(probs, 2, outputs.unsqueeze(-1))
    confidence = real_probs.log().mean().exp().item()

    return answer, confidence

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def rank_contexts_by_relevance(question, contexts, top_n=5):
    # Vectorize the question and contexts using TF-IDF
    tfidf_vectorizer = TfidfVectorizer().fit_transform([question] + contexts)
    
    # Compute the cosine similarity between the question and each context
    cosine_similarities = cosine_similarity(tfidf_vectorizer[0:1], tfidf_vectorizer).flatten()
    
    # Get the indices of the top_n most similar contexts
    relevant_indices = cosine_similarities.argsort()[:-top_n-1:-1]
    
    # Select the most relevant contexts based on the indices
    most_relevant_contexts = [contexts[i-1] for i in relevant_indices][1:]  # We exclude the first item since it's the question itself

    return list(set(most_relevant_contexts))

In [5]:
def get_contexts_for_filter_keywords(conn, cursor, question, filter_keywords):
    
    # Construct the query to search for contexts containing keywords
    query_clauses = ["context LIKE ?" for _ in filter_keywords]
    query = "SELECT context FROM contexts WHERE " + " OR ".join(query_clauses)
    params = ['%' + keyword + '%' for keyword in filter_keywords]
    
    cursor.execute(query, params)
    results = cursor.fetchall()
    
    # Extract contexts from the results
    contexts = [result[0] for result in results]
    relevant_contexts = rank_contexts_by_relevance(question, contexts)
    
    return relevant_contexts

In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def filter_keywords(question):
    keywords = question.split()
    filtered_keywords = [word for word in keywords if word.lower() not in stop_words and len(word) > 2]  # We also filter out words with length <= 2
    return filtered_keywords

[nltk_data] Downloading package stopwords to /home/ILMSI/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
question = "What is termed as Liquidated Damages?"

In [16]:
question="Force Majeure"

In [18]:
question="Deadline for Submission of Quotations"

In [19]:
# All Answer with Confidence score printer
def get_answers_for_all_contexts(question):
    # Get contexts related to the question from your database
    filtered_keywords = filter_keywords(question)
    contexts = get_contexts_for_filter_keywords(conn, cursor, question, filtered_keywords)
    best_confidence = -1
    best_answer = ""
    # Go through each context and generate answers
    for context in contexts:
        answer, confidence = get_answer_and_confidence(context, question)
        if confidence > best_confidence:
            best_confidence = confidence
            best_answer = answer
        print(f"Context: {context}")
        print(f"Answer: {answer}")
        print(f"Confidence Score: {confidence:.4f}")
        print("-" * 50)  # separator line for better readability
    print("#"*50)
    print("The best answer is: ", best_answer)

get_answers_for_all_contexts(question)



Context: Place and deadline for receipt of tenders
Answer: Place and deadline for receipt of tenders
Confidence Score: 0.0000
--------------------------------------------------
Context: Modification and Withdrawal of Bids: A bidder may modify or withdraw his bid after submission provided that the written notice of modification or withdrawal is received by the Buyer prior to deadline prescribed for submission of bids. A withdrawal notice may be sent by fax but it should be followed by a signed confirmation copy to be sent by post and such signed confirmation should reach the purchaser not later than the deadline for submission of bids. No bid shall be modified after the deadline for submission of bids. No bid may be withdrawn in the interval between the deadline for submission of bids and expiration of the period of bid validity specified. Withdrawal of a bid during this period will result in Bidder’s forfeiture of bid security.
Answer: No bid shall be modified after the deadline for su