In [6]:
from webscrapper import WebScraper
import fitz
import re
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

In [None]:
# Get the path to the immediate parent folder of the current working directory
parent_folder_path = os.path.dirname(os.getcwd())

# Construct the path to the .env file in the parent folder
dotenv_path = os.path.join(parent_folder_path, ".env")

# Load the .env file
load_dotenv(dotenv_path)

In [7]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

#### Code to extract answers from pdf

In [8]:
def extract_qa_from_pdf(pdf_path):
    # Open the PDF file
    doc = fitz.open(pdf_path)
    
    # Extract text from all pages
    text = ""
    for page in doc:
        text += page.get_text()
    
    # Close the document
    doc.close()
    
    lines = text.split('\n')
    qa_dict = {}
    current_speaker = None
    current_message = []
    first_user_statement = True
    current_question = None

    for line in lines:
        if line.strip() == 'Assistant:':
            if current_speaker == 'User' and not first_user_statement:
                qa_dict[current_question] = ' '.join(current_message).strip()
            current_speaker = 'Assistant'
            current_message = []
        elif line.strip() == 'User:':
            if current_speaker == 'Assistant':
                current_question = ' '.join(current_message).strip()
            current_speaker = 'User'
            current_message = []
            if first_user_statement:
                first_user_statement = False
        elif line.strip():
            current_message.append(line.strip())

    # Handle the last message
    if current_speaker == 'User' and not first_user_statement and current_question:
        qa_dict[current_question] = ' '.join(current_message).strip()
    qa_dict.pop(None,None)
    return qa_dict

In [9]:
# Usage
pdf_path = 'conversation.pdf'
qa_dictionary = extract_qa_from_pdf(pdf_path)
qa_dictionary

{"Hello! Thank you for taking the time to speak with me today about the Entry-Level Machine Learning Engineer position at G-Research. To start, could you tell me about your experience with machine learning, particularly any projects or coursework you've completed in this area?": "I'm sure I have a total of six months of experience with machine learning after I entered the University of Kent. Right now I'm mainly working on Numpy and Pandas. Numpy is to analyze image information and Pandas to extract data from CSVs.",
 "Thank you for sharing that information. Could you elaborate on a specific machine learning project you've worked on, perhaps one that involved Numpy or Pandas? What was the goal of the project, and what challenges did you face?": "I'm sure so one of the projects that I worked on was classifying the minst data set of images. Yeah basically it was done. I extracted the training set from the minst data, used a CNN algorithm to find out features from the images and then use 

In [16]:
for question, answer in qa_dictionary.items():
    print(f"Q: {question}")
    print(f"A: {answer}")
    print()

Q: Hello! Thank you for taking the time to speak with me today about the Entry-Level Machine Learning Engineer position at G-Research. To start, could you tell me about your experience with machine learning, particularly any projects or coursework you've completed in this area?
A: I'm sure I have a total of six months of experience with machine learning after I entered the University of Kent. Right now I'm mainly working on Numpy and Pandas. Numpy is to analyze image information and Pandas to extract data from CSVs.

Q: Thank you for sharing that information. Could you elaborate on a specific machine learning project you've worked on, perhaps one that involved Numpy or Pandas? What was the goal of the project, and what challenges did you face?
A: I'm sure so one of the projects that I worked on was classifying the minst data set of images. Yeah basically it was done. I extracted the training set from the minst data, used a CNN algorithm to find out features from the images and then use

#### Code to Decompose asnwers to find relevant sites on the internet

In [10]:
from langchain.prompts import ChatPromptTemplate

# Decomposition
template = """You are a helpful assistant that generates multiple sub-queries related to an answer by an interview candidate. \n
You will be provided the original question for context. The goal is to find the accuracy of the answer. For this
you will break down the answer into a set of sub-problems / sub-queries that can be used as a search string in google to find the relevant information. \n
Here is the original question: {question}
Generate multiple search queries related to: {answer} \n
Output (2 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [12]:
# LLM
llm = ChatOpenAI(temperature=0)
documents = []
# Chain
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))
# Run
for question, answer in qa_dictionary.items():
    queries = generate_queries_decomposition.invoke({"question":question,"answer":answer})
    print(queries)
    for query in queries:
        scraper = WebScraper(answer,2)
        # Call the method to get the file paths of the scraped data
        documents.extend(scraper.get_scraped_data())
documents

['1. "University of Kent machine learning projects or coursework"', '2. "Numpy and Pandas usage in machine learning projects"']
['1. "Classifying MNIST dataset images using CNN algorithm project details"', '2. "Challenges faced in implementing CNN algorithm for image classification project"']
['1. "Fine-tuning LLM model for chatbot applications"', '2. "Applications of LLM models in chatbots"']


[Document(metadata={'source': 'https://stackoverflow.com/questions/46242284/pandas-and-python-image-to-numpy-array'}, page_content='\n\n\n\n\n\n \n\n\nSkip to main content\n\n\n\n\n\n\nStack Overflow\n\n\n\nAbout\n\n\n\n\t\t\t\t\t\tProducts\n\t\t\t\t\t\n\n\nOverflowAI\n\n\n\n\n\n\n\nStack Overflow for Teams\nWhere developers & technologists share private knowledge with coworkers\n\n\n\n\nAdvertising & Talent\nReach devs & technologists worldwide about your product, service or employer brand\n\n\n\n\nOverflowAI\nGenAI features for Teams\n\n\n\n\nOverflowAPI\nTrain & fine-tune LLMs\n\n\n\n\nLabs\nThe future of collective knowledge sharing\n\n\n\nAbout the company\nVisit the blog\n\n\n\n\n\n\n\n\n\n\n\n\nLoading…\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ncurrent community\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n            Stack Overflow\n        \n\n\n\nhelp\nchat\n\n\n\n\n\n\n\n\n\n            Meta Stack Overflow\n        \n\n\n\n\n\n\nyour communities            \n\n\n\nSign up or log in to customize yo

#### Storing the relevant documents in a vector store

In [13]:
# Split
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(documents)

# Index

vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

#### Checking accuracy with  background context.

In [14]:
# Prompt
template = """
Here is the question for the answer you need to check:

\n --- \n {question} \n --- \n

Here is the answer you need to check:

\n --- \n {answer} \n --- \n

Here is any available background question + answer + accuracy percentage + feedback:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context, your own knowledge and background question + answer + accuracy percentage + feedback
on the subject matter to get the accuracy of the answer. A percentage accuracy score and also note down the 
places that the answer was inaccurate and give feedback for those places
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [19]:

def format_qa_pair(question, answer, feedback):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\nFeedback: {feedback}\n\n"
    return formatted_string.strip()

# llm
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

q_a_pairs = ""
rag_chain = (
    {"context": itemgetter("answer") | retriever, 
     "answer": itemgetter("answer"),
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")} 
    | decomposition_prompt
    | llm
    | StrOutputParser())
for question, answer in qa_dictionary.items():
    feedback = rag_chain.invoke({"answer":answer,"question":question,"q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(question,answer,feedback)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair

In [20]:
q_a_pairs

"\n---\nQuestion: Hello! Thank you for taking the time to speak with me today about the Entry-Level Machine Learning Engineer position at G-Research. To start, could you tell me about your experience with machine learning, particularly any projects or coursework you've completed in this area?\nAnswer: I'm sure I have a total of six months of experience with machine learning after I entered the University of Kent. Right now I'm mainly working on Numpy and Pandas. Numpy is to analyze image information and Pandas to extract data from CSVs.\nFeedback: Accuracy: 70%\n\nThe answer correctly identifies the use of Numpy and Pandas for analyzing image information and extracting data from CSVs in the context of machine learning. However, the answer does not directly address the specific question about importing image data from a CSV file and saving it as a numpy array for processing. \n\nFeedback:\n- The answer could be more accurate by providing a step-by-step guide or code snippet on how to ac