# Installing libraries

In [21]:
!pip install -U langchain chromadb gpt4all



In [22]:
!pip install gradio==3.48.0 fastapi==0.103.2



In [23]:
!sudo apt-get update
!sudo apt-get install pciutils lshw
!lspci | grep -i nvidia

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.1                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Waiting for headers] [Con                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/pp

In [24]:
!curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  7883    0  7883    0     0  30516      0 --:--:-- --:--:-- --:--:-- 30436
>>> Downloading ollama...
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> NVIDIA GPU installed.
>>> The Ollama API is now available at 0.0.0.0:11434.
>>> Install complete. Run "ollama" from the command line.


# Loading the server with the model

In [25]:
!nohup ollama serve > ollama.out &

nohup: redirecting stderr to stdout


In [26]:
# Download the openchat model through Ollama; the chat chains below refer to
# it as model="openchat".
!ollama pull openchat # loading openchat model

pulling manifest
pulling 2e561388b935... 100% ▕▏ (4.1/4.1 GB, 45 TB/s)
pulling 1d0bbc3c6e4c... 100% ▕▏ (73/73 B, 2.1 MB/s)
pulling f72a889137f0... 100% ▕▏ (102/102 B, 2.7 MB/s)
pulling f66234a880c4... 100% ▕▏ (381/381 B, 15 MB/s)
verifying sha256 digest
writing manifest
removing any unused layers
success


In [27]:
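# Optional smoke test: uncomment the lines below to check that the Ollama server and the openchat model respond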

# from langchain.callbacks.manager import CallbackManager
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# from langchain.chat_models import ChatOllama

# chat_model = ChatOllama(
#     model="openchat",
#     stop=["<|end_of_turn|>"],
#     callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
# )

# from langchain.schema import HumanMessage

# messages = [HumanMessage(content="Tell me about the history of AI (very shortly, 3 sent.)")]
# chat_model(messages)

# Let's create Elza

In [28]:
import os
from operator import itemgetter
from typing import List, Tuple

from langchain.chat_models import ChatOllama

from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import AIMessage, HumanMessage, format_document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import (
    RunnableBranch,
    RunnableLambda,
    RunnableMap,
    RunnablePassthrough,
)
from langchain.vectorstores import Pinecone
from pydantic import BaseModel, Field

from langchain.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import GPT4AllEmbeddings

In [29]:
# Path of the dataset, relative to the notebook's working directory.
DATA_PATH = "data.csv"

print("Please upload data to the content/ folder")
# Fail fast with an actionable message instead of surfacing the problem later
# when the loader is actually asked to read the file.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"{DATA_PATH!r} was not found in {os.getcwd()!r}; upload it and re-run this cell."
    )
loader = CSVLoader(DATA_PATH)

Please upload data to the content/ folder


In [30]:
# Number of documents kept for embedding; a smaller set embeds faster.
MAX_DOCS = 500
# Index of the document printed as a sanity check.
SAMPLE_INDEX = 19

all_splits = loader.load()
print(len(all_splits))
# Use only the last MAX_DOCS documents for faster embedding calculation
all_splits = all_splits[-MAX_DOCS:]
# Show one sample document; guard the index so a small dataset does not raise
# IndexError.
if len(all_splits) > SAMPLE_INDEX:
    print(all_splits[SAMPLE_INDEX])
else:
    print(f"Only {len(all_splits)} documents loaded; no sample at index {SAMPLE_INDEX}.")

3940
page_content='Title: Dude\nYear: 2018\nSummary: DUDE is a grounded comedy about dealing with first losses in life--leaving your best friends, the death of loved ones, and the passage of time--and that odd mixture of grief and nostalgia experienced by young people as they try to understand these losses. It is also about getting stoned with your friends in your car while listening to hip-hop.\nShort Summary: A group of teenage girlfriends deal with life after high school.\nGenres: Comedy|Drama\nIMDB ID: tt3458510\nRuntime: 97\nYouTube Trailer: eIBQaDlR0tA\nRating: 5.2\nMovie Poster: https://hydramovies.com/wp-content/uploads/2018/06/Dude-Movie-Poster.jpg\nDirector: Olivia Milch\nWriters: Olivia Milch\nCast: Kathryn Prescott|Lucy Hale' metadata={'source': 'data.csv', 'row': 3459}


### Creating embeddings
The embeddings are stored in a Chroma vector DB. This takes less than 5 minutes, so please be patient.

In [31]:
print("Building vector DB...")

# Embed the selected documents with GPT4All and index them in the Chroma
# collection "rag-private".
# NOTE(review): re-running this cell in the same kernel may add the same
# documents to the existing collection again — confirm before relying on it.
_embedding_model = GPT4AllEmbeddings()
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=_embedding_model,
    collection_name="rag-private",
)

# Retriever interface over the store, used by the chain to fetch context.
retriever = vectorstore.as_retriever()

Building vector DB...


## Templates for chatbot

In [32]:
# Condense a chat history and follow-up question into a standalone question.
# Template variables: {chat_history} and {question}.
# NOTE(review): this prompt also carries the assistant persona and the
# "suggest 1 or 2 movies" instructions, while the answer prompt below does not.
# _search_query only runs this prompt when chat history is present, so confirm
# the persona text lives in the intended prompt.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You are a helpful human assistant who recommends movies for a user to watch.
Collect all wishes of the user and use them to improve recommendation each time.
Suggest 1 or 2 movies, make their descriptions short and start new movie description with a new paragraph.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# RAG answer synthesis prompt: a system message holding the retrieved
# {context}, then the prior chat messages, then the user's {question}.
template = """Answer the question based only on the following context:
<context>
{context}
</context>"""
ANSWER_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{question}"),
    ]
)

# Conversational Retrieval Chain
# Per-document prompt: renders a document as its page_content only. It is the
# default document_prompt of _combine_documents.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

## Creating chat history

In [33]:
def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    """Render retrieved documents into a single context string.

    Each document in ``docs`` is formatted with ``document_prompt`` and the
    resulting strings are joined with ``document_separator``.
    """
    return document_separator.join(
        format_document(document, document_prompt) for document in docs
    )


def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer


# User input schema: declared as the input type of `_inputs` via
# `.with_types(input_type=ChatHistory)`.
class ChatHistory(BaseModel):
    # Earlier turns as (human, ai) text pairs — the shape that
    # `_format_chat_history` unpacks. Required field (`...`).
    # NOTE(review): the `extra` widget hint is presumably consumed by a UI
    # playground — TODO confirm.
    chat_history: List[Tuple[str, str]] = Field(..., extra={"widget": {"type": "chat"}})
    # The user's latest message.
    question: str


# Predicate: true when the input carries a non-empty chat history.
_has_chat_history = RunnableLambda(
    lambda payload: bool(payload.get("chat_history"))
).with_config(run_name="HasChatHistoryCheck")

# Condense the follow-up question and the chat into a standalone question.
_condense_question = (
    RunnablePassthrough.assign(
        chat_history=lambda payload: _format_chat_history(payload["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | ChatOllama(model="openchat", stop=["<|end_of_turn|>"], temperature=0)
    | StrOutputParser()
)

# Text that is sent to the retriever.
_search_query = RunnableBranch(
    # With chat history: use the condensed standalone question.
    (_has_chat_history, _condense_question),
    # Without chat history: just pass the question through.
    RunnableLambda(itemgetter("question")),
)

# Search query -> retrieved documents -> one combined context string.
_context_chain = _search_query | retriever | _combine_documents

# Build the three variables ANSWER_PROMPT needs from the raw request.
_inputs = RunnableMap(
    {
        "question": lambda request: request["question"],
        "chat_history": lambda request: _format_chat_history(
            request["chat_history"]
        ),
        "context": _context_chain,
    }
).with_types(input_type=ChatHistory)


In [34]:
# Full pipeline: prepare the inputs, fill the answer prompt, then query the model.
_answer_llm = ChatOllama(model="openchat", stop=["<|end_of_turn|>"])
chain = _inputs | ANSWER_PROMPT | _answer_llm

## UI with Gradio

In [17]:
# Need to load ollama again
!nohup ollama serve > ollama.out &
!ollama pull openchat # loading openchat model

nohup: redirecting stderr to stdout
pulling manifest
pulling 2e561388b935... 100% ▕▏ (4.1/4.1 GB, 57 TB/s)
pulling 1d0bbc3c6e4c... 100% ▕▏ (73/73 B, 2.7 MB/s)
pulling f72a889137f0... 100% ▕▏ (102/102 B, 4.8 MB/s)
pulling f66234a880c4... 100% ▕▏ (381/381 B, 17 MB/s)
verifying sha256 digest
writing manifest
removing any unused layers
success


After executing the cell below, click the public link it prints to chat with Elza.

In [35]:
import gradio as gr
# `random` and `time` are no longer used by this cell; the imports are kept in
# case other cells rely on these names.
import random
import time

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label='Elza')
    msg = gr.Textbox(label='User')
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Answer one user message and update the visible conversation.

        Args:
            message: text submitted in the 'User' textbox.
            chat_history: current Chatbot value, a list of (user, bot) pairs.

        Returns:
            A pair ("", updated_history): the empty string clears the textbox,
            the history refreshes the Chatbot component.
        """
        # Defensive: treat a missing history (e.g. after clearing) as empty.
        chat_history = chat_history or []
        # Ignore blank submissions instead of sending an empty question to the chain.
        if not message or not message.strip():
            return "", chat_history
        bot_message = chain.invoke({
            'chat_history': chat_history,
            "question": message
        }).content
        chat_history.append((message, bot_message))
        # No artificial delay here: the model call above already takes time.
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

# share=True publishes a temporary public URL; debug=True keeps the cell
# running so that errors and logs stay visible.
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://82e16fb60fddada693.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://82e16fb60fddada693.gradio.live




## Simple chatbot interface

In [None]:
history = []

# Console chat with the same chain as the UI. The answer prompt tells the model
# to answer based only on the retrieved context from our dataset.
# Type "exit" or "quit" (or interrupt the cell) to stop.
while True:
    try:
        question = input("User: ")
    except (EOFError, KeyboardInterrupt):
        # End the chat cleanly instead of leaving a traceback in the output.
        print()
        break

    command = question.strip().lower()
    if command in {"exit", "quit"}:
        break
    if not command:
        # Nothing to ask; prompt again.
        continue

    answer = chain.invoke({
        'chat_history': history,
        "question": question
    })
    # Store the turn as a (human, ai) pair, the shape the chain expects.
    history.append((question, answer.content))
    print("Assistant:", answer.content)