In [1]:
import os
import PyPDF2
import markdown2
import json
import pickle

from docx import Document as DocxDocument
from pptx import Presentation

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings.base import OpenAIEmbeddings

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Rinat's well-documented sample project
# https://github.com/trustbit/private-poc-fmw-content-generator/tree/main/backend-and-kb/src/fmw

In [3]:
# Root folder whose files get indexed. Alternative locations kept for reference:
# folder_path = 'C:/Users/Felix/My Drive/UniVie/1st Semester/IML'
# folder_path = r"C:\Users\felix.krause\code\trustbit\llm_experiments\data"
# folder_path = r"C:\Users\felix.krause\Zotero\storage"
folder_path = r"G:\My Drive\UniVie\1st Semester\IML"

# Disabled alternative: a Hugging Face embedding model
# embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Read the OpenAI API key from the local JSON secrets file
with open("secrets.json") as secrets_file:
    keys = json.load(secrets_file)

openai_key = keys["OPENAI_API_KEY"]

In [3]:
# Function to load text from different file types
def load_text_from_file(file_path):
    """Extract the text content of a single document file.

    Supported extensions (matched case-insensitively): ``.md``, ``.docx``,
    ``.pptx`` and ``.pdf``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. For ``.md`` files this is the result of
        ``markdown2.markdown`` applied to the file content; for the other
        formats it is the paragraph / shape / page text, each followed by a
        newline. An empty string is returned for unsupported extensions.
    """
    _, file_extension = os.path.splitext(file_path)
    # Normalise case so that e.g. "Slides.PDF" or "NOTES.MD" are not silently skipped.
    file_extension = file_extension.lower()

    if file_extension == ".md":
        # Read as UTF-8 explicitly: the platform default (e.g. cp1252 on Windows)
        # fails or garbles non-ASCII Markdown. Undecodable bytes are replaced
        # instead of aborting the whole load.
        with open(file_path, 'r', encoding="utf-8", errors="replace") as f:
            return markdown2.markdown(f.read())

    if file_extension == ".docx":
        doc = DocxDocument(file_path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    if file_extension == ".pptx":
        presentation = Presentation(file_path)
        # Only shapes exposing a `text` attribute contribute (others are skipped).
        return "".join(
            shape.text + "\n"
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    if file_extension == ".pdf":
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f, strict=False)
            # Guard against pages for which no text could be extracted (None),
            # which would otherwise raise a TypeError on concatenation.
            return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    return ""

# Function to load documents from a folder
def load_documents_from_folder(folder_path):
    """Recursively load every readable file below a folder as a ``Document``.

    Args:
        folder_path: Root directory to walk (sub-directories are included).

    Returns:
        A list of ``Document`` objects, one per file for which
        ``load_text_from_file`` returned non-blank text. Each document stores
        the file path under the ``"source"`` metadata key. Files are visited
        in sorted order so repeated runs yield the same document order.
    """
    documents = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            text = load_text_from_file(file_path)
            # Skip blank results too: a file without extractable text can
            # still yield only newlines, which would create an empty document.
            if text and text.strip():
                documents.append(Document(page_content=text, metadata={"source": file_path}))
    return documents

# Read every supported file below `folder_path`
documents = load_documents_from_folder(folder_path)

# Split the documents into overlapping chunks for the vector store
# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

# Fail early when there is nothing to index
if not texts:
    raise ValueError("No text found in the specified folder")

In [4]:
# Embed all chunks with OpenAI embeddings and index them in an in-memory FAISS store
embeddings = OpenAIEmbeddings(api_key=openai_key)  # deprecated?
db = FAISS.from_documents(documents=texts, embedding=embeddings)

# Retriever interface over the index, used by the chains below
retriever = db.as_retriever()

In [14]:
# Store vector database
# with open("data/vector_db_uni", "wb") as f:
#     pickle.dump(db.serialize_to_bytes(), f)

In [8]:
# Load vector database
# NOTE(security): both pickle.load and the FAISS deserialization
# (allow_dangerous_deserialization=True) can execute arbitrary code from the
# file, so only load a "data/vector_db_uni" that you created yourself.
with open("data/vector_db_uni", "rb") as db_file:
    db_bytes = pickle.load(db_file)

# The file is only needed for reading the bytes; rebuild the store afterwards.
db = FAISS.deserialize_from_bytes(
    db_bytes,
    OpenAIEmbeddings(api_key=openai_key),
    allow_dangerous_deserialization=True,
)
retriever = db.as_retriever()

## Define LLM

In [9]:
# Chat model with temperature=0, bound so that every call also requests
# log-probabilities (top 3 alternatives per token).
# could try local models as well
llm = ChatOpenAI(api_key=openai_key, temperature=0).bind(logprobs=True, top_logprobs=3)

## Inference with chain

In [10]:
# System message template; `{context}` is a template placeholder.
# Optional extra instructions that are currently disabled:
#   "If you don't know the answer, say you don't know. "
#   "Keep the answer concise. "
system_prompt = "Use the given context to answer the question. Context: {context}"

# Two-message chat prompt: the system instructions, then the user's `{input}`
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [11]:
# Chain that passes the documents together with the question to the LLM via `prompt`
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: retrieve documents for the input, then run the answer chain on them
chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

# Function to ask a question
def ask_question(query):
    """Run `query` through the retrieval chain and print question and answer.

    Args:
        query: The user question, passed to the chain under the "input" key.

    Returns:
        None. The result is written to stdout only.
    """
    answer = chain.invoke({"input": query})["answer"]
    print(f"Question: {query}\nAnswer: {answer}")

In [11]:
# Literature question; the literal is split only for readability and
# concatenates to the same single string.
ask_question(
    "I am doing research for my thesis about predicting protein-to-mRNA ratios with Deep learning. "
    "The Paper 'Quantification and discovery of sequence determinants' of Eraslan et al 2019 "
    "is most similar to what I wanna do. "
    "Which papers are referenced in this paper and in other papers frequently and are most relevant?"
)

Question: I am doing research for my thesis about predicting protein-to-mRNA ratios with Deep learning. The Paper 'Quantification and discovery of sequence determinants' of Eraslan et al 2019 is most similar to what I wanna do. Which papers are referenced in this paper and in other papers frequently and are most relevant?
Answer: The paper "Quantification and discovery of sequence determinants of protein-per-mRNA amount in 29 human tissues" by Eraslan et al. (2019) is a valuable resource for your research on predicting protein-to-mRNA ratios with deep learning. In this paper, several references are mentioned that are frequently cited and relevant to the topic. Some of the key references in this paper include:

1. Vogel et al. (2010) - This paper is referenced in the context of multivariate regression analysis to predict protein abundances from mRNA abundances and mRNA sequence features.

2. Wilson et al. (2016) - Mentioned in relation to how amino acid identity affects translation.

3.

In [13]:
ask_question("Check the references of the provided papers. Which papers are most frequently referenced?")

Question: Check the references of the provided papers. Which papers are most frequently referenced?
Answer: The provided context mentions references to two papers: Zolg et al, 2017 and Toprak et al, 2014. Among these two papers, the paper by Zolg et al, 2017 is referenced more frequently.


### Questions about Uni lecture files

In [12]:
ask_question("How much cash does this company have in 2019?")

Question: How much cash does this company have in 2019?
Answer: Based on the provided context, there is no specific information about the cash holdings of the company in 2019. The context mainly discusses datasets, model complexity, bias-variance tradeoff, and other technical aspects related to data analysis and machine learning. Therefore, it is not possible to determine the exact amount of cash the company had in 2019 from the given information.


In [13]:
ask_question("Which main topics did we cover in the IML course?")

Question: Which main topics did we cover in the IML course?
Answer: The main topics covered in the Introduction to Machine Learning (IML) course are:

1. Linear regression
2. Linear classification
3. Kernels and the kernel trick
4. Neural networks & Deep Learning
5. Unsupervised learning
6. The statistical perspective
7. Statistical decision theory
8. Discriminative vs. generative modeling
9. Bayes’ classifiers
10. Bayesian approaches to unsupervised learning

Additionally, there are optional topics mentioned in parentheses, which are:
- Generative modeling with neural networks
- Reinforcement learning


In [60]:
ask_question("Did we cover the Representer Theorem? In which pdf can I find it?")

Question: Did we cover the Representer Theorem? In which pdf can I find it?
Answer: Yes, we covered the Representer Theorem. You can find it in the PDF titled "05-Kernels" on page 19.


In [61]:
ask_question("Which topics were asked in the final exam 2022?")

Question: Which topics were asked in the final exam 2022?
Answer: The final exam in 2022 covered topics such as Basics, Kernels, Neural Networks, Unsupervised Learning (K-Means and PCA), and Decision Theory and Classifier Metrics.


In [62]:
ask_question("What information does the cheatsheet contain about the Representer Theorem?")

Question: What information does the cheatsheet contain about the Representer Theorem?
Answer: The cheatsheet contains information about the Representer Theorem, which states that for a given optimization problem involving a mapping from a data space to a Hilbert space, there exist coefficients such that the optimal solution can be represented as a linear combination of the mapped data points.


## Show uncertainty

In [32]:
from response_certainty import display_uncertainty

In [106]:
def ask_question_probs(query, skip_certain_thres=1):
    """Answer `query` from retrieved context and display token-level uncertainty.

    The retrieved documents are formatted as plain text (source path plus
    content) rather than the Python repr of the document list, so the model
    receives the actual text without `Document(...)` wrappers and escaped
    newlines.

    Args:
        query: The user question; also used as the retrieval query.
        skip_certain_thres: Forwarded unchanged to `display_uncertainty`.

    Returns:
        None. The result is rendered by `display_uncertainty`.

    Raises:
        KeyError: If the response metadata has no "logprobs" entry.
    """
    retrieved_docs = retriever.invoke(query)
    # One block per document, keeping the source so the model can still cite it.
    context = "\n\n".join(
        f"[Source: {(doc.metadata or {}).get('source', 'unknown')}]\n{doc.page_content}"
        for doc in retrieved_docs
    )

    instructions = "Use the given context to answer the question. Keep the answer concise. Context:"
    final_query = f"{instructions}\n{context}\n\nUser question: {query}"

    response = llm.invoke(final_query)

    display_uncertainty(response.response_metadata["logprobs"],
                        skip_certain_thres=skip_certain_thres, skip_symbols=True)

In [107]:
ask_question_probs("How much cash does this company have in 2019?", skip_certain_thres=1)

In [108]:
ask_question_probs("How much liquidity does this company have at hand?", skip_certain_thres=0.9)