In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import messages_from_dict, messages_to_dict
from langchain.memory import ConversationBufferMemory

from IPython.display import Markdown
from IPython.display import Latex
from tqdm import tqdm

import sys
import os
import re
import shutil

import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import numpy as np

from dotenv import load_dotenv

# Populate the process environment (python-dotenv) before any key is read.
load_dotenv()

# Brave Search API configuration
BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
BRAVE_SEARCH_API_KEY = os.getenv("BRAVE_SEARCH_API_KEY")

# OpenAI API configuration: one shared client used by the embedding and
# chat-completion helpers below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# 1. Web Search Integration using Brave Search
def brave_search(query, count=5, timeout=10):
    """Query the Brave Search web endpoint and return its result entries.

    Args:
        query: Search string sent as the ``q`` parameter.
        count: Number of results requested from the API.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list stored under ``web -> results`` in the JSON response, or an
        empty list when the response carries no web results.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {
        "Accept": "application/json",
        "X-Subscription-Token": BRAVE_SEARCH_API_KEY,
    }
    params = {"q": query, "count": count}
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(
        BRAVE_SEARCH_ENDPOINT, headers=headers, params=params, timeout=timeout
    )
    # Surface auth/rate-limit/server errors directly instead of failing later
    # with a KeyError on an error payload.
    response.raise_for_status()
    # A query with no web hits may omit the "web" section entirely.
    return response.json().get("web", {}).get("results", [])

# 2. Web Scraping and Content Extraction
def extract_content(url, timeout=10):
    """Download *url* and return the human-readable text of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The page text with tags removed, pieces joined by single spaces.

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    # Without a timeout a single slow site would stall the whole pipeline.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # get_text() also returns the bodies of <script>/<style> elements; drop
    # them so code and CSS do not crowd out real content once the text is
    # truncated downstream.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return soup.get_text(separator=' ', strip=True)

# 3. Text Processing (the ada embedding limit is ~8k *tokens*; truncating to 8192 *characters* stays safely below it)
def preprocess_text(text, max_length=8192):
    """Cut *text* down to its leading ``max_length`` characters.

    The limit is counted in characters (a plain string slice); text that is
    already short enough comes back unchanged.
    """
    truncated = text[0:max_length]
    return truncated

# 4. Embedding Generation using OpenAI
def generate_embedding(text):
    """Embed *text* with the ``text-embedding-ada-002`` model.

    Sends a single input through the module-level OpenAI ``client`` and
    returns the ``embedding`` of the first item in the response data.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

# 5. Retrieval
def retrieve_relevant_info(query_embedding, doc_embeddings, docs, top_k=5):
    """Return the *top_k* documents most similar to the query.

    Similarity is the dot product between each document embedding and the
    query embedding; results are ordered from most to least similar.

    Args:
        query_embedding: 1-D sequence of floats for the query.
        doc_embeddings: Sequence of embeddings, one per entry in *docs*.
        docs: Documents aligned index-for-index with *doc_embeddings*.
        top_k: Maximum number of documents to return.

    Returns:
        A list with up to *top_k* entries of *docs*; empty when there are no
        documents or *top_k* is not positive.
    """
    # Guard the edge cases explicitly: a ``[-0:]`` slice would select every
    # document, and np.dot raises on an empty embedding list.
    if top_k <= 0 or len(docs) == 0:
        return []

    similarities = np.dot(np.asarray(doc_embeddings), np.asarray(query_embedding))
    # Sort ascending, flip to descending, then keep the best top_k.
    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [docs[i] for i in top_indices]

# 6 & 7. Prompt Engineering and Language Model Integration
def generate_answer(history, query, relevant_info):
    """Ask the chat model to answer *query* from the retrieved material.

    The chat history, the query and the retrieved text are interpolated into
    one user prompt, which is sent with a fixed system message to
    ``gpt-4o-mini`` through the module-level ``client``. The first choice's
    message content is returned with surrounding whitespace stripped.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that provides accurate and concise information based on the given query, chat history, and relevant information retrieved.",
    }

    prompt = f"""Given the following information, please provide a concise and informative answer to the query based on the relevant information.

History: {history}

Query: {query}

Relevant Information:
{relevant_info}

Answer:"""
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
    )
    reply = completion.choices[0].message.content
    return reply.strip()

# Main RAG function
def rag_web_search(history, query):
    """Answer *query* from live web pages (search -> scrape -> embed -> LLM).

    Steps: run a Brave search for *query*, download and truncate each result
    page, embed the pages and the query, rank the pages by similarity and
    hand the ranked text to ``generate_answer`` together with *history*.

    Args:
        history: Prior conversation text forwarded to the answer prompt.
        query: The user's question; used both as the search string and in
            the answer prompt.

    Returns:
        The generated answer string.
    """
    # Perform web search
    search_results = brave_search(query)

    # Extract and process content
    docs = []
    for result in search_results:
        try:
            content = extract_content(result['url'])
        except requests.RequestException:
            # One unreachable or misbehaving site should not abort the whole
            # search; simply continue with the remaining results.
            continue
        processed_content = preprocess_text(content)
        # Blank pages carry no information and are not worth an embedding call.
        if processed_content.strip():
            docs.append(processed_content)

    relevant_info = []
    if docs:
        # Generate embeddings
        doc_embeddings = [generate_embedding(doc) for doc in docs]
        query_embedding = generate_embedding(query)

        # Retrieve relevant information
        relevant_info = retrieve_relevant_info(query_embedding, doc_embeddings, docs)

    # Generate final answer (with empty context if nothing could be scraped)
    return generate_answer(history, query, "\n".join(relevant_info))

def markdown_to_text(markdown_content):
    """Strip Markdown formatting and return the plain text.

    The Markdown is first rendered to HTML, then the HTML is parsed and only
    its text nodes are kept.

    Args:
        markdown_content: Markdown source string.

    Returns:
        The text content without markup.
    """
    # markdown2 is not imported at the top of this cell, so calling this
    # function raised NameError; import it where it is needed.
    import markdown2

    # Convert Markdown to HTML
    html_content = markdown2.markdown(markdown_content)

    # Parse HTML and extract plain text
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text()

#create a chromadb (vectorized database for RAG) from a given directory of pdf files
def create_db_from_pdf_directory(pdf_directory, db_directory):
    """Build a persisted Chroma vector store from every PDF in a directory.

    Each PDF is loaded with ``PyPDFLoader``, the pages are split into
    overlapping 1000-character chunks, and the chunks are embedded with
    ``OpenAIEmbeddings`` and stored under *db_directory*.

    Args:
        pdf_directory: Directory scanned (non-recursively) for PDF files.
        db_directory: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The newly created Chroma database.

    Raises:
        ValueError: If the directory yields no PDF content to index.
    """
    all_documents = []

    # sorted() makes ingestion order reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(pdf_directory)):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_directory, filename)
        all_documents.extend(PyPDFLoader(pdf_path).load())

    # The caller treats ValueError as "no content yet". Raise it explicitly,
    # before touching the vector store, rather than relying on whatever the
    # store does when handed an empty document list.
    if not all_documents:
        raise ValueError(f"No PDF content found in {pdf_directory!r}")

    # split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(all_documents)

    # vector embeddings instance
    embeddings = OpenAIEmbeddings()

    # create chromadb
    db = Chroma.from_documents(texts, embeddings, persist_directory=db_directory)

    print(f"Database created successfully at {db_directory}")
    return db


#specify source and database and create database with it
# Where fresh PDFs are dropped, and where the Chroma database is persisted.
pdf_directory = "new_pdfs"

db_directory = "chromadb"

if not os.path.exists(db_directory):
    # First run: build the database from whatever PDFs are available.
    # NOTE(review): those PDFs are still in new_pdfs when digest_new_pdfs()
    # runs later in this cell, so on a fresh run they look to be indexed
    # twice - confirm and de-duplicate if so.
    try:
        db = create_db_from_pdf_directory(pdf_directory, db_directory)
    except ValueError:
        print("Please add content before chatting. Put one or more pdfs in the new_pdfs dir and run this cell again.")
        sys.exit(1)
else:
    # Re-open the persisted store. persist_directory must be passed here,
    # otherwise Chroma starts a fresh, empty in-memory collection and the
    # previously indexed documents are never queried.
    embeddings = OpenAIEmbeddings()
    db = Chroma(persist_directory=db_directory, embedding_function=embeddings)


def query_db(db, query):
    """Answer a single question against the vector store.

    Builds a throw-away ``ConversationalRetrievalChain`` (retriever from
    *db*, ``gpt-4o-mini`` at temperature 0, fresh buffer memory) and runs
    *query* through it.

    Args:
        db: Vector store exposing ``as_retriever()``.
        query: The question to ask.

    Returns:
        The chain's ``answer`` string.
    """
    # Create a retriever from the database
    retriever = db.as_retriever()

    # Create a ChatOpenAI model
    chat = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    # The memory supplies the chain's "chat_history" input; it is new on
    # every call, so no history carries over between calls.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Create a Conversational Retrieval Chain with memory
    qa_chain = ConversationalRetrievalChain.from_llm(chat, retriever=retriever, memory=memory)

    # The memory is already attached to the chain. Only the question belongs
    # in the inputs: an extra "memory" key leaves the buffer memory with two
    # candidate input keys when it records the turn.
    result = qa_chain.invoke({"question": query})

    return result['answer']

def digest_new_pdfs():
    """Index any PDFs waiting in ``new_pdfs`` and archive them to ``used_pdfs``.

    Every PDF in ``new_pdfs`` is loaded, split into overlapping
    1000-character chunks and added to the module-level ``db``. Only after
    indexing has succeeded are the files moved to ``used_pdfs``, so a failed
    run leaves them in place to be retried.

    Does nothing when ``new_pdfs`` is missing or holds no PDF files.
    """
    new_pdf_dir = 'new_pdfs'
    used_pdf_dir = 'used_pdfs'

    if not os.path.isdir(new_pdf_dir):
        return

    # Case-insensitive extension match; sorted for a reproducible order.
    pdf_files = sorted(
        name for name in os.listdir(new_pdf_dir) if name.lower().endswith('.pdf')
    )
    if not pdf_files:
        return

    print('processing new content')

    added_documents = []
    for name in pdf_files:
        loader = PyPDFLoader(os.path.join(new_pdf_dir, name))
        added_documents.extend(loader.load())

    # split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(added_documents)

    # The store embeds with the function it was constructed with, so no
    # separate embeddings object is needed. Skip the call if the PDFs
    # produced no chunks at all.
    if texts:
        db.add_documents(documents=texts)

    # Archive only after indexing succeeded; create the target on first use.
    os.makedirs(used_pdf_dir, exist_ok=True)
    for name in pdf_files:
        shutil.move(os.path.join(new_pdf_dir, name), os.path.join(used_pdf_dir, name))

    print('processing complete')

def encode_latex(text):
    r"""Rewrite LaTeX math delimiters into the ``$`` forms Markdown renders.

    * ``\(...\)`` (inline math) becomes ``$...$``.
    * ``\[...\]`` (display math, possibly spanning lines) becomes ``$$...$$``.
    * ``\begin{equation}...\end{equation}`` and ``\begin{align}...\end{align}``
      become ``$$...$$``.

    Args:
        text: String that may contain LaTeX math.

    Returns:
        The string with the delimiters above replaced; other text untouched.
    """
    # Inline math: \( ... \) on a single line.
    inline_pattern = r'\\\((.+?)\\\)'
    # Display math: \[ ... \]; usually written across several lines, hence
    # DOTALL at the substitution below.
    bracket_pattern = r'\\\[(.+?)\\\]'
    # Display environments; \1 forces \end{} to match the \begin{} name.
    display_pattern = r'\\begin\{(equation|align)\}(.+?)\\end\{\1\}'

    text = re.sub(inline_pattern, r'$\1$', text)
    text = re.sub(bracket_pattern, r'$$\1$$', text, flags=re.DOTALL)
    text = re.sub(display_pattern, r'$$\2$$', text, flags=re.DOTALL)

    return text

# First digest new pdfs
digest_new_pdfs()

# Build the conversational chain once; its memory carries the chat history
# across questions for the lifetime of the loop below.
retriever = db.as_retriever()
chat = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
qa_chain = ConversationalRetrievalChain.from_llm(chat, retriever=retriever, memory=memory)

# Plain-text transcript handed to the web-search fallback, which has no
# access to the chain's memory.
history = ''

# Interactive loop
while True:
    query = input("\nAsk a question (or type 'exit' to exit): ").strip()
    if query.lower() == 'exit':
        break
    if not query:
        # Nothing typed: prompt again rather than sending an empty question.
        continue

    # Modify and run the query
    modified_query = query + " enclose any latex expressions, symbols, or equations in $ and use textual evidence where helpful"
    # .invoke is the supported entry point; calling the chain object
    # directly is deprecated in LangChain.
    result = qa_chain.invoke({"question": modified_query})
    answer = result['answer']

    # The chain signals "not in the PDF library" with one of these phrasings.
    if answer == "I don't know." or answer.startswith("The provided context does not contain"):
        print('No answers found in context library. Searching the web...')
        # NOTE(review): rag_web_search also uses this string as the search
        # query, so the formatting instructions are sent to the search engine.
        answer = rag_web_search(history, query + " enclose any latex expressions, symbols, or equations in $ and cite specific source links used ")

    history += f"query: {query}\n answer: {answer}\n"
    display(Markdown('\n**Response:** ' + encode_latex(answer) + '\n'))



Ask a question (or type 'exit' to exit):  What's the benefit of using a neural network over regression?



**Response:** The benefit of using a neural network over traditional regression methods lies in its ability to model complex, nonlinear relationships between inputs and outputs. Neural networks, particularly those with hidden layers, can capture intricate patterns in the data that linear regression may miss.

1. **Nonlinearity**: Neural networks can take nonlinear functions of linear combinations of the inputs, which allows them to model complex relationships. As stated in the text, "Both projection pursuit regression and neural networks take nonlinear functions of linear combinations ('derived features') of the inputs." This capability makes neural networks particularly effective in scenarios where the relationship between predictors and the response variable is not linear.

2. **Flexibility**: Neural networks can handle multiple quantitative responses and can be adapted for both regression and classification tasks. The text mentions that "these networks can handle multiple quantitative responses in a seamless fashion," indicating their versatility compared to traditional regression models.

3. **High Signal-to-Noise Ratio**: Neural networks are especially effective in problems with a high signal-to-noise ratio, where the goal is prediction without interpretation. The discussion notes that these tools "have been shown to compete well with the best learning methods on many problems," highlighting their performance advantage in certain contexts.

4. **Complex Interactions**: In neural networks, each input can enter the model in many places and in a nonlinear fashion, which allows for the modeling of complex interactions between variables. This is contrasted with traditional regression, which may struggle to capture such interactions without explicitly including interaction terms.

In summary, while traditional regression methods are useful for simpler, linear relationships and for interpreting the roles of individual inputs, neural networks provide a powerful alternative for modeling complex, nonlinear relationships and for tasks where prediction is prioritized over interpretability.



Ask a question (or type 'exit' to exit):  What programs in the real world use regressions over neural networks?


No answers found in context library. Searching the web...



**Response:** In the real world, there are several scenarios where traditional regression methods are preferred over neural networks. Here are some examples:

1. **Medical Research**: In many medical studies, linear regression is used to interpret the relationship between variables due to its simplicity and ease of interpretation. For instance, a study may examine the effect of dosage on patient recovery using linear regression models, which offer clear insights into the importance of the dosage amounts.

2. **Economics**: Economists often use regression analysis to model relationships between economic indicators. For example, multiple linear regression might be employed to assess the impact of education levels and household income on spending patterns. The coefficients provide direct interpretation of how much each predictor influences the outcome variable.

3. **Social Sciences**: In fields such as sociology or psychology, regression techniques are utilized to analyze survey data. Researchers might use logistic regression to understand how demographic factors affect voting behavior, as the results are easier to communicate to a broader audience.

4. **Finance**: Portfolio management often involves the use of regression techniques to determine the relationship between asset returns and market factors. For instance, the Capital Asset Pricing Model (CAPM) relies on linear regression to estimate the expected return of an asset based on its risk relative to the market.

These applications highlight how traditional regression methods still play a crucial role across various fields, largely due to their interpretability and the straightforward nature of the relationships they model.
