In [None]:
import sqlite3
def initialize_db(db_name="context_DPM_data.db"):
    """Open the SQLite database and make sure the ``contexts`` table exists.

    Args:
        db_name: Path passed to ``sqlite3.connect``. Defaults to
            ``"context_DPM_data.db"``.

    Returns:
        A ``(connection, cursor)`` tuple. The caller owns both objects and is
        responsible for closing the connection.
    """
    # Schema: one free-text context per row, keyed by an integer id.
    create_contexts_table = '''CREATE TABLE IF NOT EXISTS contexts
                     (id INTEGER PRIMARY KEY, context TEXT)'''

    connection = sqlite3.connect(db_name)
    db_cursor = connection.cursor()
    db_cursor.execute(create_contexts_table)
    connection.commit()

    return connection, db_cursor

# Connect to DB: open the default database file and keep the module-level
# connection and cursor that get_answers_for_all_contexts passes to the
# context lookup.
conn, cursor = initialize_db()

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the fine-tuned BERT model and tokenizer for Question Answering.
# Both are loaded from the same checkpoint name and are kept as module-level
# globals; get_answer_and_confidence reads `model` and `tokenizer` directly.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
import torch
def get_answer_and_confidence(context, question):
    """Extract the most likely answer span for ``question`` from ``context``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        context: Text passage to search for the answer.
        question: Natural-language question.

    Returns:
        A ``(answer, confidence)`` tuple. ``answer`` is the decoded text of the
        tokens between the highest-scoring start position and the
        highest-scoring end position (inclusive); it is an empty string when
        the best end position precedes the best start position.
        ``confidence`` is the mean of the largest start probability and the
        largest end probability (softmax over the logits), as a float.
    """
    # Question and context are encoded as one pair, truncated to 512 tokens.
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: disable autograd so no graph is recorded and no
    # activations are retained for a backward pass that never happens.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Best start/end token positions as plain ints.
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1  # +1: slice end is exclusive, end token is inclusive

    # Decode the selected token ids back into text.
    answer_ids = inputs["input_ids"][0][answer_start:answer_end].tolist()
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))

    # Confidence: average of the peak start and peak end probabilities.
    start_probs = torch.nn.functional.softmax(answer_start_scores, dim=-1)
    end_probs = torch.nn.functional.softmax(answer_end_scores, dim=-1)
    confidence = (start_probs.max().item() + end_probs.max().item()) / 2

    return answer, confidence

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def rank_contexts_by_relevance(question, contexts, top_n=5):
    """Return the contexts most similar to ``question``, best match first.

    The question and all contexts are vectorized together with TF-IDF and the
    contexts are ranked by cosine similarity to the question.

    Args:
        question: The query text.
        contexts: Candidate context strings.
        top_n: Maximum number of distinct contexts to return.

    Returns:
        A list of at most ``top_n`` distinct context strings ordered from most
        to least similar. Empty when ``contexts`` is empty or ``top_n`` is not
        positive.
    """
    contexts = list(contexts)
    if not contexts or top_n <= 0:
        # Nothing to rank; avoids indexing into an empty list below.
        return []

    # Row 0 is the question, rows 1..len(contexts) are the contexts.
    tfidf_matrix = TfidfVectorizer().fit_transform([question] + contexts)

    # Compare the question only against the context rows, so the question
    # itself never takes part in the ranking and similarities[i] lines up
    # with contexts[i].
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Descending similarity; the stable sort keeps input order among ties.
    ranked_indices = (-similarities).argsort(kind="stable")

    # Walk the ranking, dropping duplicate texts while preserving rank order.
    most_relevant_contexts = []
    seen = set()
    for index in ranked_indices:
        candidate = contexts[index]
        if candidate in seen:
            continue
        seen.add(candidate)
        most_relevant_contexts.append(candidate)
        if len(most_relevant_contexts) == top_n:
            break

    return most_relevant_contexts

In [None]:
def get_contexts_for_filter_keywords(conn, cursor, question, filter_keywords):
    """Fetch stored contexts that mention any keyword and rank them for ``question``.

    Args:
        conn: Database connection. Not used by this function; kept so existing
            callers keep working.
        cursor: Cursor on a database that has a ``contexts`` table with a
            ``context`` column.
        question: The question used to rank the matching contexts.
        filter_keywords: Iterable of keyword strings; a context matches when
            it contains at least one of them (SQL ``LIKE '%keyword%'``).

    Returns:
        The matching contexts as returned by ``rank_contexts_by_relevance``,
        or an empty list when there are no keywords or no rows match.
    """
    keywords = list(filter_keywords)
    if not keywords:
        # Without clauses the statement would end in a dangling "WHERE",
        # which SQLite rejects as a syntax error.
        return []

    # One parameterized LIKE clause per keyword, OR-ed together.
    query_clauses = ["context LIKE ?" for _ in keywords]
    query = "SELECT context FROM contexts WHERE " + " OR ".join(query_clauses)
    params = ['%' + keyword + '%' for keyword in keywords]

    cursor.execute(query, params)
    results = cursor.fetchall()

    # Each row is a 1-tuple holding the context text.
    contexts = [result[0] for result in results]
    if not contexts:
        # Nothing matched, so there is nothing to rank.
        return []

    return rank_contexts_by_relevance(question, contexts)

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' resource, then build a set of the English
# stopwords; filter_keywords checks lower-cased words against this set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def filter_keywords(question):
    """Extract search keywords from ``question``.

    The question is split on whitespace, and leading/trailing punctuation is
    removed from each word so that, for example, ``"Damages?"`` yields the
    keyword ``"Damages"``. Words that are stopwords (compared in lower case
    against the module-level ``stop_words``) or that have two or fewer
    characters after stripping are dropped.

    Args:
        question: The question text.

    Returns:
        A list of keyword strings in their original order and casing.
    """
    import string  # local import keeps this cell self-contained

    filtered_keywords = []
    for raw_word in question.split():
        # Strip only surrounding punctuation; interior characters are kept.
        word = raw_word.strip(string.punctuation)
        if len(word) > 2 and word.lower() not in stop_words:
            filtered_keywords.append(word)
    return filtered_keywords

In [None]:
# Sample question. The later `question = ...` cells reassign this name, so the
# cell run most recently determines which question is answered.
question = "What is termed as Liquidated Damages?"

In [None]:
# Alternative sample question (reassigns the module-level `question`).
question="What are types of Liquidated Damages?"

In [None]:
# Alternative sample question (reassigns the module-level `question`).
question="What is Consequential Damages?"

In [None]:
# Alternative sample question (reassigns the module-level `question`).
question="What is Force Majeure?"

In [None]:
# Alternative sample question (reassigns the module-level `question`).
# Spelled "What are" so that no misspelled word ends up as a search keyword
# or in the text handed to the QA model.
question = "What are the Alternative remedies to Risk & Expense Purchase Clause?"

In [None]:
# Print the answer and confidence score for every candidate context, then the best answer
def get_answers_for_all_contexts(question):
    """Answer ``question`` against every retrieved context and print the results.

    Keywords are extracted from the question, matching contexts are looked up
    through the module-level ``conn`` and ``cursor``, and each context is run
    through the QA model. For every context the context text, its answer and
    its confidence score are printed, followed by the answer with the highest
    confidence.

    Args:
        question: The question to answer.

    Returns:
        None. All results are written to standard output.
    """
    keywords = filter_keywords(question)
    candidate_contexts = get_contexts_for_filter_keywords(conn, cursor, question, keywords)

    separator = "-" * 50  # separator line for better readability
    top_answer, top_confidence = "", -1

    for candidate in candidate_contexts:
        answer, confidence = get_answer_and_confidence(candidate, question)
        # Strict comparison: on equal confidence the earlier context wins.
        if confidence > top_confidence:
            top_answer, top_confidence = answer, confidence
        print(
            f"Context: {candidate}",
            f"Answer: {answer}",
            f"Confidence Score: {confidence:.4f}",
            separator,
            sep="\n",
        )

    print("#" * 50)
    print("The best answer is: ", top_answer)

# Run the pipeline for whichever `question` was assigned most recently.
get_answers_for_all_contexts(question)