# In [None]:
"""
A Generative AI system for document-based question answering for Boeing.

A use-case of AI for intelligent information retrieval and natural language processing, capable of fetching information from structured document data and generating human-like responses to queries based on the content of the Boeing documents.

Document Vector Retrieval (get_document_vector): This function retrieves the vector representation of a document from a database (like Pinecone) using the document's ID. The vector is a numerical representation that captures various aspects of the document (such as summary, keywords, title, and category).

Document Indexing: The code iterates through a list of documents, each represented by a dictionary with an 'id' and a 'vector'. These vectors are then indexed or updated in the database using the index.upsert method.

Relevance Scoring and Selection (select_most_relevant_document): This function calculates the relevance of each document to a given query. It computes cosine similarity scores between the query's vector and each document's vector components (summary, keywords, title, category), and combines these scores to determine the most relevant document.

Query Processing (process_query): This central function processes the user's query by:
Extracting important keywords and summarizing the query.

Vectorizing the summary and keywords using a model (e.g., an NLP model).

Querying the indexed documents to find the top relevant documents.

Applying additional logic (like considering document views) to select the most relevant document.

Response Generation: Once the most relevant document is selected, its content is retrieved, and a response is generated using OpenAI's GPT-3 model. The model is prompted with the content of the relevant document and the user's query to generate an answer.

Error Handling: The response generation is wrapped in a try-except block to handle any potential errors during the response generation process.

Main Function (main): This function is the entry point of the program. It sets up necessary data like views count for documents and the OpenAI API key, takes a sample question (e.g., about Boeing 787's engine maintenance safety protocols), and then processes this query to generate a response.

Execution Check: The if __name__ == "__main__": condition checks if the script is run as the main program and not imported as a module. If it's the main program, it executes the main() function.

"""


import os

import numpy as np
import openai
from sklearn.metrics.pairwise import cosine_similarity

def get_document_vector(doc_id):
    """
    Look up the stored vector for a single document.

    Issues `index.query(ids=[doc_id])` against the module-level `index`
    object (not defined in this function) and inspects the first entry of
    the response's 'results' list.

    Returns:
        The 'vector' field of the first match, or None when the 'matches'
        list of that first result is empty (or otherwise falsy).
    """
    first_result = index.query(ids=[doc_id])['results'][0]

    # Nothing stored under this ID: signal that to the caller with None.
    if not first_result['matches']:
        return None

    return first_result['matches'][0]['vector']


# Assuming `documents` is a list of dictionaries, where each dictionary
# represents a document with its 'id' and 'vector'.
# NOTE(review): this loop is module-level code, so it runs as soon as the
# file is executed or imported; `documents` and `index` are not defined in
# the visible part of this file and must already exist at that point.
for document in documents:
    # Here, 'vector' is the concatenated vector of summary, keywords, title, and category
    # One upsert call is issued per document, passing a single-element list
    # for both the IDs and the vectors.
    index.upsert(ids=[document['id']], vectors=[document['vector']])


def select_most_relevant_document(doc_ids, query_vector, weights):
    """
    Pick the document whose stored vectors best match the query.

    For every ID in `doc_ids` the document vector is fetched with
    `get_document_vector`, compared with `query_vector` component by
    component ('summary', 'keywords', 'title', 'category') through
    `compute_cosine_similarity`, and the four similarities are merged into
    one score by `combine_scores`.

    Args:
        doc_ids: Iterable of document IDs to rank.
        query_vector: Object indexable by the keys 'summary', 'keywords',
            'title' and 'category'.
        weights: Passed through unchanged as the last argument of
            `combine_scores`.

    Returns:
        The ID with the highest combined score (the first one wins on a
        tie), or None when `doc_ids` is empty, when no document vector
        could be fetched, or when no combined score is greater than 0.
    """
    components = ('summary', 'keywords', 'title', 'category')
    highest_score = 0
    most_relevant_doc = None

    for doc_id in doc_ids:
        doc_vector = get_document_vector(doc_id)
        if doc_vector is None:
            # get_document_vector returns None when the ID has no match;
            # subscripting None below would raise TypeError, so skip it.
            continue

        # One similarity per component, in the order listed above.
        similarities = [
            compute_cosine_similarity(query_vector[name], doc_vector[name])
            for name in components
        ]

        # NOTE(review): neither `combine_scores` definition in this file is
        # declared as (four scores, weights); confirm which implementation
        # this call is meant to reach.
        combined_score = combine_scores(*similarities, weights)

        # Strict comparison: only scores above the running best (initially
        # 0) replace the current choice.
        if combined_score > highest_score:
            highest_score = combined_score
            most_relevant_doc = doc_id

    return most_relevant_doc


def process_query(query, views_dict=None):
    """
    Answer `query` using the most relevant indexed document.

    The query is reduced to keywords and a summary, both are encoded with
    `model.encode`, the index is queried for the top 3 matches, one of them
    is chosen by `select_most_relevant_document`, and the final answer is
    produced by `prompt`.

    NOTE(review): `extract_important_keywords`, `summarize_text`, `model`,
    `index` and `prompt` are not defined in the visible part of this file.
    A later `process_query` definition in this file replaces this one.

    Args:
        query: The user's question as text.
        views_dict: Optional mapping of document ID to view count, forwarded
            to `select_most_relevant_document`. None is treated as an empty
            mapping.

    Returns:
        Whatever `prompt` returns for the generated instruction string.
    """
    # Extract important keywords and summarize the query
    query_keywords = extract_important_keywords(query)
    query_summary = summarize_text(query)

    # Vectorize summary and keywords (two input strings in one encode call)
    query_vector = model.encode([query_summary] + [' '.join(query_keywords)])

    # Fetch relevant document IDs from the index
    results = index.query(query_vector, top_k=3)
    relevant_documents = [result['id'] for result in results['matches']]

    # The `select_most_relevant_document` in effect at call time is the last
    # one defined in this file, and it requires a views mapping as its third
    # argument; omitting it raised TypeError.
    if views_dict is None:
        views_dict = {}
    relevant_document_id = select_most_relevant_document(
        relevant_documents, query_vector, views_dict
    )

    # Only the document ID (not its content) is interpolated into the prompt.
    response = prompt(f"Using the information from document {relevant_document_id}, answer: {query}")
    return response

def compute_cosine_similarity(vec1, vec2):
    """
    Return the cosine similarity between `vec1` and `vec2`.

    Each vector is wrapped in a one-element list before being handed to
    `cosine_similarity`, and the single entry at row 0, column 0 of the
    result is returned.
    """
    similarity_matrix = cosine_similarity([vec1], [vec2])
    first_row = similarity_matrix[0]
    return first_row[0]


def combine_scores(summary_score, keyword_score, summary_weight=0.5):
    """
    Blend the summary and keyword scores into a single value.

    `summary_weight` is applied to `summary_score`; the keyword score gets
    the remainder, `1 - summary_weight`.
    """
    keyword_weight = 1 - summary_weight
    weighted_summary = summary_weight * summary_score
    weighted_keywords = keyword_weight * keyword_score
    return weighted_summary + weighted_keywords


def select_most_relevant_document(doc_ids, query_vector, summary_weight=0.5):
    """
    Pick the document whose stored vector best matches the query.

    Both the query vector and each document vector are split at their
    midpoint (`len(...) // 2`): the first halves are compared as the
    "summary" part and the second halves as the "keywords" part, each with
    `compute_cosine_similarity`. The two similarities are merged by
    `combine_scores`.

    Args:
        doc_ids: Iterable of document IDs to rank.
        query_vector: Sliceable sequence holding the query representation.
        summary_weight: Forwarded as the third positional argument of
            `combine_scores`.

    Returns:
        The ID with the highest combined score (the first one wins on a
        tie), or None when `doc_ids` is empty, when no document vector
        could be fetched, or when no combined score is greater than 0.
    """
    highest_score = 0
    most_relevant_doc = None

    for doc_id in doc_ids:
        doc_vector = get_document_vector(doc_id)
        if doc_vector is None:
            # get_document_vector returns None when the ID has no match;
            # len(None) below would raise TypeError, so skip it.
            continue

        query_mid = len(query_vector) // 2
        doc_mid = len(doc_vector) // 2

        # First halves are treated as the summary part, second halves as
        # the keywords part.
        summary_similarity = compute_cosine_similarity(query_vector[:query_mid], doc_vector[:doc_mid])
        keyword_similarity = compute_cosine_similarity(query_vector[query_mid:], doc_vector[doc_mid:])

        # NOTE(review): the last `combine_scores` defined in this file takes
        # a views score as its third positional argument; confirm which
        # definition this call is meant to reach.
        combined_score = combine_scores(summary_similarity, keyword_similarity, summary_weight)

        # Strict comparison: only scores above the running best (initially
        # 0) replace the current choice.
        if combined_score > highest_score:
            highest_score = combined_score
            most_relevant_doc = doc_id

    return most_relevant_doc



def combine_scores(summary_score, keyword_score, views_score, summary_weight=0.3, views_weight=0.4):
    """
    Blend the summary, keyword and views scores into a single value.

    `summary_weight` and `views_weight` are applied to their respective
    scores; the keyword score receives whatever is left over, i.e.
    `1 - summary_weight - views_weight`.
    """
    keyword_weight = 1 - summary_weight - views_weight
    weighted_summary = summary_weight * summary_score
    weighted_keywords = keyword_weight * keyword_score
    weighted_views = views_weight * views_score
    return weighted_summary + weighted_keywords + weighted_views


def select_most_relevant_document(doc_ids, query_vector, views_dict, summary_weight=0.3):
    """
    Select the most relevant document based on cosine similarity and number of views.

    Both the query vector and each document vector are split at their
    midpoint (`len(...) // 2`): the first halves are compared as the
    "summary" part and the second halves as the "keywords" part, each with
    `compute_cosine_similarity`. The view count of the document is
    normalized with `normalize_views_score`, and the three values are merged
    by `combine_scores`.

    Args:
        doc_ids: Iterable of document IDs to rank.
        query_vector: Sliceable sequence holding the query representation.
        views_dict: Dictionary mapping document IDs to their view counts;
            IDs missing from it count as 0 views.
        summary_weight: Forwarded as the fourth positional argument of
            `combine_scores`.

    Returns:
        The ID with the highest combined score (the first one wins on a
        tie), or None when `doc_ids` is empty, when no document vector
        could be fetched, or when no combined score is greater than 0.
    """
    highest_score = 0
    most_relevant_doc = None

    for doc_id in doc_ids:
        doc_vector = get_document_vector(doc_id)
        if doc_vector is None:
            # get_document_vector returns None when the ID has no match;
            # len(None) below would raise TypeError, so skip it.
            continue

        query_mid = len(query_vector) // 2
        doc_mid = len(doc_vector) // 2

        # First halves are treated as the summary part, second halves as
        # the keywords part.
        summary_similarity = compute_cosine_similarity(query_vector[:query_mid], doc_vector[:doc_mid])
        keyword_similarity = compute_cosine_similarity(query_vector[query_mid:], doc_vector[doc_mid:])

        # Get the views score (normalized); unknown documents count as 0 views.
        views_score = normalize_views_score(views_dict.get(doc_id, 0))

        combined_score = combine_scores(summary_similarity, keyword_similarity, views_score, summary_weight)

        # Strict comparison: only scores above the running best (initially
        # 0) replace the current choice.
        if combined_score > highest_score:
            highest_score = combined_score
            most_relevant_doc = doc_id

    return most_relevant_doc

def normalize_views_score(views_count, max_views=1000):
    """
    Normalize the views count to a score between 0 and 1.

    The count is divided by `max_views` and the result is clamped to the
    closed interval [0, 1], so negative counts map to 0 and counts at or
    above `max_views` map to 1.

    Args:
        views_count: Number of views recorded for a document.
        max_views: View count at which the score saturates at 1. Adjust it
            to change how quickly popularity reaches the maximum score.

    Returns:
        A float in [0, 1].

    Raises:
        ValueError: If `max_views` is not positive.
    """
    if max_views <= 0:
        raise ValueError("max_views must be positive")

    ratio = views_count / max_views
    # Clamp on both sides so the documented [0, 1] range always holds.
    return max(0.0, min(1.0, ratio))





def process_query(query, openai_api_key, views_dict=None):
    """
    Answer `query` from the content of the most relevant indexed document.

    Steps:
      1. Extract keywords from the query and summarize it.
      2. Encode summary and keywords with `model.encode`.
      3. Query the index for the top 3 matches.
      4. Choose one of them with `select_most_relevant_document`.
      5. Fetch that document's content and ask the OpenAI completion
         endpoint to answer the query from it.

    NOTE(review): `extract_important_keywords`, `summarize_text`, `model`,
    `index` and `retrieve_document_content` are not defined in the visible
    part of this file.

    Args:
        query: The user's question as text.
        openai_api_key: Assigned to `openai.api_key` before the request.
        views_dict: Optional mapping of document ID to view count, forwarded
            to `select_most_relevant_document`. None is treated as an empty
            mapping.

    Returns:
        The stripped text of the first completion choice, or a string
        starting with "Error generating response:" when no document was
        selected or the completion request raised.
    """
    # Extract important keywords and summarize the query
    query_keywords = extract_important_keywords(query)
    query_summary = summarize_text(query)

    # Vectorize summary and keywords (two input strings in one encode call)
    query_vector = model.encode([query_summary] + [' '.join(query_keywords)])

    # Fetch relevant document IDs from the index
    results = index.query(query_vector, top_k=3)
    relevant_documents = [result['id'] for result in results['matches']]

    # select_most_relevant_document requires the views mapping as its third
    # argument; omitting it raised TypeError.
    if views_dict is None:
        views_dict = {}
    relevant_document_id = select_most_relevant_document(relevant_documents, query_vector, views_dict)

    # Without a selected document there is no content to build the prompt
    # from; report it in the same style as the completion error below.
    if relevant_document_id is None:
        return "Error generating response: no relevant document found"

    # Retrieve the content of the most relevant document
    document_content = retrieve_document_content(relevant_document_id)

    # Generate the response with the OpenAI completion endpoint.
    # NOTE(review): `openai.Completion` and "text-davinci-003" look like the
    # legacy (pre-1.0) client interface and a legacy model — confirm against
    # the installed openai package before relying on this call.
    openai.api_key = openai_api_key
    try:
        response = openai.Completion.create(
            engine="text-davinci-003",  # Or the latest available engine
            prompt=f"Using the information from the following Boeing engineering document: {document_content}\n\nAnswer the query: {query}",
            max_tokens=150
        )
        return response.choices[0].text.strip()
    except Exception as e:
        # Deliberately broad: any failure of the request is reported to the
        # caller as text instead of propagating.
        return f"Error generating response: {e}"


def main():
    """
    Run one sample Boeing question through `process_query` and print the answer.

    The OpenAI API key is read from the OPENAI_API_KEY environment variable;
    the original placeholder string is used when the variable is not set.
    """
    # Example Boeing doc views data (document ID -> view count)
    views_dict = {"doc1": 100, "doc2": 50}
    openai_api_key = os.environ.get("OPENAI_API_KEY", "your-api-key-here")

    # Sample Boeing question
    question = "What are the safety protocols for Boeing 787's engine maintenance?"

    # Process the query, letting view counts influence document selection
    response = process_query(question, openai_api_key, views_dict=views_dict)
    print(response)

# Run the sample query only when this file is executed as a script; importing
# it as a module does not call main().
if __name__ == "__main__":
    main()