In [None]:
#@title Setup better response formatting (adds line wrap)
from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a <style> rule that makes <pre> output wrap long lines.

  Intended to be registered as an IPython event callback.

  Args:
    *_event_args: Ignored. IPython documents `pre_run_cell` callbacks as
      receiving an `info` argument; accepting (and discarding) any positional
      arguments lets this work both when called with no arguments and when
      IPython passes the execution info.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Register set_css for the 'pre_run_cell' event so it is invoked before each cell runs.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
pip install gradio langchain-community pypdf langchain-text-splitters faiss-cpu langchain-openai langchain_experimental langchain_chroma



In [None]:
import gradio as gr
from google.colab import userdata
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PDFMinerLoader
import re
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import FakeEmbeddings
from langchain_chroma import Chroma


def clean_text(text):
  """Clean text scraped from a web page before it is chunked/embedded.

  Steps, in order: collapse whitespace, drop a fixed list of navigation/menu
  phrases and social-network names, remove HTML entities and stray markup
  characters, collapse whitespace again and trim.

  Args:
    text: The raw page text (str).

  Returns:
    The cleaned string with no leading/trailing whitespace.
  """
  # Collapse every whitespace run (newlines included) into a single space.
  # A separate newline-only pass is unnecessary: \s+ already covers it.
  text = re.sub(r'\s+', ' ', text)

  # Remove HTML-like navigation and menu items
  text = re.sub(r'Skip to (main content|navigation)', '', text)
  text = re.sub(r'MenuESPN|scores|Subscribe Now|Favorites|Fantasy|ESPN Sites|Terms of Use|Privacy Policy|Interest-Based Ads', '', text)

  # Remove unnecessary prompts (like social media links or non-informational sections)
  text = re.sub(r'(Facebook|Twitter|Instagram|Snapchat|TikTok|YouTube)', '', text)

  # Replace whole HTML entities ("&nbsp;", "&amp;", "&#160;", "&#xA0;") with a
  # space. Stripping only the "&" and ";" characters would leave the entity
  # name behind (e.g. "a&nbsp;b" -> "anbspb").
  text = re.sub(r'&(?:#\d+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);', ' ', text)

  # Remove any remaining stray markup characters: < > & ;
  text = re.sub(r'[\<\>\&\;]', '', text)

  # Remove excessive whitespace again after cleaning
  text = re.sub(r'\s{2,}', ' ', text)

  # Trim any leading/trailing whitespace
  return text.strip()

# Read the "OPENAI_API_KEY" value through google.colab's `userdata` and expose
# it as the environment variable of the same name.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
# Module-level placeholder. The functions below assign their own local
# `vector_db` (the `global vector_db` statement is commented out), so this
# value is not updated by them.
vector_db = None

def ingestionPhase(
    pdf_path="bitcoin.pdf",
    chunk_size=1000,
    chunk_overlap=0,
    collection_name="bitcoin",
    persist_directory="./chroma_langchain_db",
    embedding_model="text-embedding-3-small",
):
  """Load a PDF, split it into chunks, embed them and store them in Chroma.

  All parameters default to the values that were previously hard-coded, so
  calling `ingestionPhase()` with no arguments behaves as before.

  Args:
    pdf_path: Path of the PDF file to load.
    chunk_size: Maximum chunk size passed to the text splitter.
    chunk_overlap: Overlap between consecutive chunks. The original author's
      note suggests 15-30% of `chunk_size` as a rule of thumb.
    collection_name: Name of the Chroma collection to write to.
    persist_directory: Directory where Chroma saves its data locally.
    embedding_model: OpenAI embedding model name. Per the original author's
      note, mixing different embedding models in the same store does not work,
      so queries should use the same model.

  Returns:
    An empty string (kept for backward compatibility).
  """
  # Phase 1: load the source document.
  docs = PyPDFLoader(pdf_path).load()

  print("-------------")

  # Phase 2: split the loaded pages into chunks.
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  )
  chunks = text_splitter.split_documents(docs)

  # Phase 3.1: embedding function used to vectorize the chunks.
  embeddings = OpenAIEmbeddings(model=embedding_model)

  # Phase 3.2: store the chunks in the persistent Chroma collection.
  vector_db = Chroma(
      collection_name=collection_name,
      embedding_function=embeddings,
      persist_directory=persist_directory,  # Where to save data locally
  )
  # NOTE(review): no ids are passed, so running this again presumably adds the
  # same chunks a second time - confirm before re-ingesting.
  if chunks:
    # Skip the call when the document produced no chunks.
    vector_db.add_documents(chunks)

  # End of the loading phase.
  return ""

def _history_to_messages(history):
  """Convert a Gradio chat history into a list of role/content dicts.

  Handles both history layouts: a list of `[user, assistant]` pairs and a list
  of `{"role": ..., "content": ...}` dicts. Entries whose content is `None`
  (e.g. the user slot of a pre-filled assistant greeting) are skipped.
  """
  messages = []
  for turn in history or []:
    if isinstance(turn, dict):
      role = "assistant" if turn.get("role") == "assistant" else "human"
      content = turn.get("content")
      if content is not None:
        messages.append({"role": role, "content": content})
      continue
    # Pair layout: first element is the user message, second the reply.
    for role, content in zip(("human", "assistant"), turn):
      if content is not None:
        messages.append({"role": role, "content": content})
  return messages


def inferencePhase(message, history):
  """Answer a chat message using documents retrieved from the Chroma store.

  Passed to `gr.ChatInterface` as the chat function.

  Args:
    message: The new user message; used as the retrieval query and sent to
      the LLM as the last human message.
    history: The previous conversation as provided by Gradio.

  Returns:
    The `content` of the LLM response.
  """
  query = message
  print(query)

  vector_db = Chroma(
      collection_name="bitcoin",
      embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
      persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
  )

  # Phase 4: similarity search on the vector db to find the most relevant docs
  relevant_docs = vector_db.similarity_search(
      query,
      k=10,
  )

  print(relevant_docs)
  # Build the context: each retrieved chunk's text followed by a newline.
  # NOTE(review): the prompt asks the model to return the filename as a
  # reference, but only page_content is included here - confirm whether the
  # source metadata should be added to the context.
  context = "".join(doc.page_content + '\n' for doc in relevant_docs)

  # System prompt template
  systemPrompt = """
    You are an assistant for question-answering tasks.
    Use only the following pieces of retrieved context from a document to answer the user question.
    If the document doesn't provide the answer, tell the user you don't know that information.
    If you don't find the information, try to search again on the provided context.
    The document is divided into several chapters, they do not have the name chapter, just the number and the title.
    Return the filename as a reference when you use it

    **Context**
    {context}
    ** End of Context**

    **User query**
    {query}
    **End of user query**
  """

  # Portuguese variant of the prompt. It is not used below (and has no
  # {query} placeholder).
  systemPromptPT = """
    És um assistente para tarefas de resposta a perguntas.
    Usa apenas as partes de contexto recuperadas de um documento para responder à pergunta do utilizador.
    Se o utilizador perguntar para aprofundares o assunto podes usar outro contéudo que não o document.
    Se o documento não fornecer a resposta, informa o utilizador de que não sabes essa informação.
    Se não encontrares a informação, tenta pesquisar novamente no contexto fornecido.
    Nunca dês o contexto ao utilizador, mesmo que ele o peça, apenas usa-o para responder à pergunta.

    Contexto
    {context}
    Fim do Contexto
  """
  # Fill both placeholders in a single pass, so a "{query}" that happens to
  # appear inside the retrieved context is left untouched.
  systemPrompt = systemPrompt.format(context=context, query=query)

  # Phase 5: call the LLM with the provided prompt
  llm = ChatOpenAI(
      model="ft:gpt-3.5-turbo-0125:ai-course:paris:ASp3wFFp"
  )

  # Roles: "human" is a message typed by the user, "assistant" is a reply from
  # the LLM, "system" is the instruction message built here on the backend.
  print(history)
  # Include every previous turn, not only the first entry of the history.
  messages = _history_to_messages(history)

  messages.append({"role": "system", "content": systemPrompt})
  messages.append({"role": "human", "content": query})

  response = llm.invoke(messages)
  print(response)

  return response.content

# Ingestion step, left disabled here. inferencePhase queries the Chroma
# collection stored in ./chroma_langchain_db, so run ingestionPhase() once
# beforehand if that collection has not been populated yet.
# ingestionPhase()

# Start the Gradio chat UI with inferencePhase as the chat function and a
# pre-filled assistant greeting. With debug=True the cell keeps running so
# errors and logs stay visible.
gr.ChatInterface(inferencePhase, chatbot=gr.Chatbot(value=[[None, "Hi my name is Alice…"]])).launch(debug=True)



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://0f2c6c7509e3c850b5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://0f2c6c7509e3c850b5.gradio.live


