In [None]:
import os
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer
from langchain.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

# Expanded Personal Information Document (for Task 1.1)
# Knowledge base for the chatbot: a one-element list whose single string holds
# the whole first-person profile. Further down, this list is embedded, added to
# the FAISS index, and wrapped in a Document for the docstore, so the profile
# is indexed and retrieved as one unit rather than as separate chunks.
documents = ["""
My name is Vuong Loc Truong, and I am a Vietnamese citizen. I was born on October 20, 2001, and I am 23 years old. I am currently pursuing a Master's degree in Data Science and AI, which is my highest level of education thus far. My academic journey has focused on the intricacies of data and artificial intelligence, a field I find both challenging and profoundly rewarding. While I currently have no formal work experience, my involvement in web development has provided me with practical insights into the technological landscape. Presently, I serve as a teaching assistant at Van Lang University, where I contribute to the educational development of students.
             
My core belief regarding the role of technology in shaping society is that it holds immense potential to enhance the quality of life. By providing solutions to complex problems in healthcare, education, and environmental sustainability, technology can be a powerful force for good. However, I also recognize the importance of ensuring equitable access and addressing ethical concerns to prevent the exacerbation of existing inequalities. Responsible innovation and thoughtful regulation are crucial for harnessing technology’s power for the greater good.

Furthermore, I believe that cultural values should significantly influence technological advancements. Technology should be developed and implemented in a manner that respects and preserves diverse cultural identities and traditions. Avoiding the imposition of a single cultural perspective is essential. Instead, prioritizing inclusivity and adaptability to different cultural contexts ensures that technology serves the needs of diverse communities and promotes cultural understanding.

As a Master's student, I find that the most challenging aspect of my studies thus far is English communication. Overcoming this obstacle is a priority for me. My primary academic goal is to graduate on time, ensuring that I can effectively apply my knowledge and contribute to the field of Data Science and AI. I am dedicated to my studies and eager to see how my research interests will evolve and contribute to the technological advancements of the future.

In addition to my academic pursuits, I have a strong interest in web development. This interest has led me to work on various projects, both personal and academic, that involve creating and maintaining websites. My experience in web development has provided me with a solid understanding of front-end and back-end technologies, as well as the importance of user experience and accessibility. I believe that web development is a crucial skill in today's digital age, and I am committed to continuing my growth in this area.

At Van Lang University, I have had the opportunity to work closely with students as a teaching assistant. This role has allowed me to develop my communication and mentoring skills, as well as gain a deeper understanding of the educational process. I take great pride in helping students achieve their academic goals and am always looking for ways to improve my teaching methods.

Looking ahead, I am excited about the potential for technology to drive positive change in society. I am particularly interested in exploring how data science and artificial intelligence can be used to address pressing global challenges, such as climate change, healthcare, and education. I am committed to using my skills and knowledge to contribute to these efforts and to make a meaningful impact on the world.
"""]

# Task 1.1 - Find all relevant sources related to yourself
# The personal information document (above) is the relevant source that describes the user's information.

# Alternative: Using SentenceTransformers to generate embeddings locally (no API limit)
def create_embeddings_with_sentence_transformer(documents, model_name='all-MiniLM-L6-v2'):
    """Embed *documents* locally with a pre-trained SentenceTransformer model.

    Args:
        documents: Texts to embed; handed unchanged to
            ``SentenceTransformer.encode``.
        model_name: Name of the pre-trained SentenceTransformer checkpoint to
            load. Defaults to ``'all-MiniLM-L6-v2'``, the checkpoint this
            function used before the name became a parameter.

    Returns:
        The value returned by ``SentenceTransformer.encode`` for *documents*.

    Note:
        The vector store built elsewhere in this file embeds queries with
        ``sentence-transformers/all-MiniLM-L6-v2``. If a different
        ``model_name`` is passed here, that query-side embedder must be
        switched to the same model so query and document vectors remain
        comparable.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(documents)
    # Debug aid: report the shape of the generated embeddings.
    print(f"Generated embeddings shape: {np.array(embeddings).shape}")
    return embeddings

# Embed the profile locally, then coerce the result to a float32 numpy array
# before handing it to FAISS.
embeddings = np.array(
    create_embeddings_with_sentence_transformer(documents)
).astype("float32")
print(f"Embedding array type: {type(embeddings)}")  # Debug: confirm the container type
print(f"Embedding shape: {embeddings.shape}")  # Debug: (number of documents, vector width)

# The index is created for one fixed vector width, taken from the second axis.
dimension = embeddings.shape[1]
print(f"Embedding dimensionality: {dimension}")  # Debug

# Flat index compared by L2 distance: create it, fill it, and persist it to
# "faiss_index.index" in the current working directory.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
faiss.write_index(index, "faiss_index.index")

# Row i of the index -> docstore id str(i) -> the i-th entry of `documents`.
index_to_docstore_id = {row: str(row) for row, _ in enumerate(documents)}
docstore = InMemoryDocstore(
    {
        doc_id: Document(page_content=text)
        for doc_id, text in zip(index_to_docstore_id.values(), documents)
    }
)

# Wrap the pre-built index in a LangChain FAISS vector store. The embedding
# function given here is the one the store uses for incoming queries; it names
# the same MiniLM checkpoint that produced the document vectors above.
db = FAISS(
    index=index,
    embedding_function=HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    ),
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

# Expose the vector store through the retriever interface used by the QA helper.
retriever = db.as_retriever()

# Extractive question answering: load the DistilBERT QA checkpoint and its
# tokenizer, then bundle both into a single question-answering pipeline.
qa_model = AutoModelForQuestionAnswering.from_pretrained(
    "distilbert-base-uncased-distilled-squad"
)
qa_tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-distilled-squad"
)
qa_pipeline = pipeline(
    task="question-answering",
    model=qa_model,
    tokenizer=qa_tokenizer,
)

# Task 1.2 - Prompt template for the chatbot. It takes the retrieved context
# and the user's question as its two input variables.
prompt_template = PromptTemplate(
    template="""You are an AI assistant providing precise and concise answers about Vuong Loc Truong.
    Context: {context}
    Question: {question}
    Answer:
    """,
    input_variables=["context", "question"],
)

# Task 1.3 - Explore other text-generation models or OpenAI models to enhance
# capabilities. GPT-2 is loaded here as an example of a local generator.
local_model = pipeline(task="text-generation", model="gpt2")

def analyze_model_output_v2(question):
    """Retrieve context for *question* and answer it with the QA pipeline.

    Uses the module-level ``retriever`` to fetch documents and the
    module-level ``qa_pipeline`` (DistilBERT question answering) to extract
    an answer from their combined text. The retrieved snippets and the answer
    are printed as a side effect.

    Args:
        question: The natural-language question to look up and answer.

    Returns:
        A ``(retrieved_docs, answer)`` tuple: the documents returned by the
        retriever, and the answer string produced by the QA pipeline. The
        answer is an empty string when no usable context was retrieved.
    """
    # `invoke` replaces `get_relevant_documents`, which is deprecated and
    # emits a deprecation warning on every call.
    retrieved_docs = retriever.invoke(question)

    # Debugging: show the start of each retrieved document.
    print("Retrieved Documents:")
    for doc in retrieved_docs:
        print(f"Document: {doc.page_content[:300]}...")  # First 300 characters for brevity

    # Merge all retrieved text into a single context for the QA model.
    context = " ".join(doc.page_content for doc in retrieved_docs)

    # With nothing retrieved there is no text to extract an answer from;
    # return an empty answer instead of passing an empty context on
    # (the QA pipeline is expected to reject an empty context).
    if not context.strip():
        print("\nNo context retrieved; skipping answer generation.")
        return retrieved_docs, ""

    # Ask the model to answer the question using the context.
    result = qa_pipeline(question=question, context=context)
    answer = result['answer']

    print("\nGenerated Answer:")
    print(answer)

    return retrieved_docs, answer

  from .autonotebook import tqdm as notebook_tqdm







Generated embeddings shape: (1, 384)
Embedding array type: <class 'numpy.ndarray'>
Embedding shape: (1, 384)
Embedding dimensionality: 384


  db = FAISS(index=index, embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"), docstore=docstore, index_to_docstore_id=index_to_docstore_id)
Device set to use cpu
Device set to use cpu


In [2]:
# Task 1.3 - Example Question
# Runs the retrieve-then-answer helper on one sample question. The helper
# prints the retrieved text and the answer itself; because this call is the
# last expression in the cell, the returned (documents, answer) tuple is
# echoed as the cell output as well.
analyze_model_output_v2("How old are you?")

  retrieved_docs = retriever.get_relevant_documents(question)


Retrieved Documents:
Document: 
My name is Vuong Loc Truong, and I am a Vietnamese citizen. Born on October 20, 2001, now I'm 23, I am currently pursuing a Master's degree in Data Science and AI, marking my highest level of education thus far. My academic journey has been focused on the intricacies of data and artificial intellig...

Generated Answer:
23


([Document(metadata={}, page_content="\nMy name is Vuong Loc Truong, and I am a Vietnamese citizen. Born on October 20, 2001, now I'm 23, I am currently pursuing a Master's degree in Data Science and AI, marking my highest level of education thus far. My academic journey has been focused on the intricacies of data and artificial intelligence, a field I find both challenging and profoundly rewarding. While I currently have no formal years of work experience, my involvement in web development has provided me with practical insights into the technological landscape. Presently, I serve as a teaching assistant at Van Lang University, where I contribute to the educational development of students.\n\nMy core belief regarding the role of technology in shaping society is that it holds immense potential to enhance the quality of life. By providing solutions to complex problems in healthcare, education, and environmental sustainability, technology can be a powerful force for good. However, I also

In [3]:

# Task 2.1 - Names of the retriever and generator models used in this notebook
retriever_models = [
    "FAISS with SentenceTransformerEmbeddings",
]
generator_models = [
    "DistilBERT QA model",
    "HuggingFace GPT-2",
]

# Report both lists, one labelled line each.
for heading, model_names in (
    ("Retriever Models Used:", retriever_models),
    ("Generator Models Used:", generator_models),
):
    print(heading, model_names)

# Task 2.2 - Analyze any issues related to the models providing unrelated information
def analyze_issues(retriever_models, generator_models):
    """Build caveat messages for the listed retriever and generator models.

    A retriever whose name contains "FAISS" gets a note about unrelated
    documents being retrieved; a generator whose name contains "GPT-2" gets a
    note about unrelated information being generated. Other names produce no
    message.

    Args:
        retriever_models: Iterable of retriever model names.
        generator_models: Iterable of generator model names.

    Returns:
        A list of message strings: retriever notes first, then generator
        notes, each group in input order.
    """
    retriever_notes = [
        f"Issue with {name}: Potential for retrieving unrelated documents due to embedding distance inaccuracies."
        for name in retriever_models
        if "FAISS" in name
    ]
    generator_notes = [
        f"Issue with {name}: Potential for generating unrelated information due to model's generalization capabilities."
        for name in generator_models
        if "GPT-2" in name
    ]
    return retriever_notes + generator_notes

# Task 2.2 - Example issue analysis: collect the caveats for the models listed
# above and print them under a heading, one message per line.
issues = analyze_issues(retriever_models, generator_models)
print("Issues Found:", *issues, sep="\n")

Retriever Models Used: ['FAISS with SentenceTransformerEmbeddings']
Generator Models Used: ['DistilBERT QA model', 'HuggingFace GPT-2']
Issues Found:
Issue with FAISS with SentenceTransformerEmbeddings: Potential for retrieving unrelated documents due to embedding distance inaccuracies.
Issue with HuggingFace GPT-2: Potential for generating unrelated information due to model's generalization capabilities.
