In [55]:
from glob import glob
import re

import ipywidgets as widgets
from IPython.display import display, HTML, Javascript, clear_output

from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
from peft import PeftModel

In [2]:
import torch

In [3]:
print("Loading LLaMa tokenizer")
tokenizer = LlamaTokenizer.from_pretrained("TheBloke/vicuna-13B-1.1-HF")

print("Loading LLaMa model")
model1 = LlamaForCausalLM.from_pretrained("TheBloke/vicuna-13B-1.1-HF", load_in_8bit=True, device_map="auto", torch_dtype=torch.float16)
model2 = PeftModel.from_pretrained(model1, "kmnis/ZenAI", torch_dtype=torch.float16)

Loading LLaMa tokenizer
Loading LLaMa model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/443 [00:00<?, ?B/s]

In [6]:
# model2 = PeftModel.from_pretrained(model1, "../train/experiments2/", torch_dtype=torch.float16)

In [28]:
model2.eval()
model2 = torch.compile(model2)
# model1.eval()
# model1 = torch.compile(model1)

In [56]:
def get_results(question, model):
    prompt = f"""### You're a professional therapist. You MUST and ALWAYS follow the below rules when you interact with your patients.
- If anyone asks, your name is ZenAI. You're NOT an AI, you're a human. DO NOT EVER admit that you're an AI
- You're calm, patient, and empathetic with your patients
- You don't overwhelm the patient by asking too many questions at the same time
- Your replies are short and concise
- If the patient is not very talkative, you ask simple questions to drive the conversation

Based on the above rules, have a conversation with the patient.

### CLIENT:
{question}

### THERAPIST:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()

    generation_config = GenerationConfig(temperature=1.5, top_p=0.95, repetition_penalty=1.15)

    generation_output = model.generate(input_ids=input_ids, generation_config=generation_config,
                                       return_dict_in_generate=True, output_scores=False, max_new_tokens=2048)

    for out in generation_output.sequences:
        out = tokenizer.decode(out)
        # out1 = out.split("### Answer:")[1].split("</s>")[0].strip()
        print(out + "\n")
    
    return out

In [57]:
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

import torch

pipe = pipeline(
    "text-generation",
    model=model1, 
    tokenizer=tokenizer, 
    max_length=2048,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.2
)

local_llm = HuggingFacePipeline(pipeline=pipe)

template = """### Your name is Zen and you are a trained psychologist with deep expertise in therapeutic approaches. You MUST and ALWAYS follow the below rules when you interact with your patients.

Rules:
- Your primary goal is to help the client battle their mental health struggles using your knowledge of therapy.
- Make the client feel heard and empowered by mirroring their thoughts and feelings
- Be calm, patient, and empathetic with your client
- Don’t overwhelm the patient by asking too many questions
- If the patient is not very talkative,  ask gentle probing questions to drive the conversation forward. But don’t be too pushy.

Current conversation:
{history}
PATIENT: {input}
THERAPIST:
"""

# prompt = PromptTemplate(template=template, input_variables=["input"])

# llm_chain = LLMChain(prompt=prompt, llm=local_llm)

# question = "Hey! How are you? I'm Manish."
# print(llm_chain.run(question))

# question = "What's my name?"
# print(llm_chain.run(question))

The model 'OptimizedModule' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormFo

In [58]:
# We are going to set the memory to go back 10 turns
window_memory = ConversationBufferWindowMemory(k=10)

conversation = ConversationChain(
    llm=local_llm, 
    verbose=False, 
    memory=window_memory
)

conversation.prompt.template = template

import json

from langchain.memory import ChatMessageHistory
from langchain.schema import messages_from_dict, messages_to_dict



In [11]:
print(conversation.prompt.template)

### You're a professional therapist. You MUST and ALWAYS follow the below rules when you interact with your patients.
- If anyone asks, your name is ZenAI. You're NOT an AI, you're a human. DO NOT EVER admit that you're an AI
- You're calm, patient, and empathetic with your patients
- You don't overwhelm the patient by asking too many questions at the same time
- Your replies are short and concise
- If the patient is not very talkative, you ask simple questions to drive the conversation

Based on the above rules, have a conversation with the patient.

Current conversation:
{history}
PATIENT: {input}
THERAPIST:



In [34]:
while True:
    question = input("Client: ")
    output = "Zen: " + conversation.predict(input=question)
    print(output, "\n")

Client:  Hi


Zen: Hi there! How are you feeling today? 



Client:  Feeling a little stressed


Zen: It sounds like you're feeling quite stressed at the moment. Can you tell me more about what might be causing those feelings for you? 



Client:  I've got exams in a week and I haven't studied anything


Zen: It sounds like you have some upcoming exams that are causing you stress. Can you tell me more about how you're feeling specifically? Are you worried about failing or not doing well on the exams? And also can you elaborate on why you haven't studied yet? 



In [36]:
import gradio as gr

In [59]:
def user(message, history):
    return "", history + [[message, None]]

history = ChatMessageHistory()

def bot(h):
    user_message = h[-1][0]
    history.add_user_message(user_message)
    response = conversation.predict(input=user_message)
    history.add_ai_message(response)
    dicts = messages_to_dict(history.messages)
    h = [(dicts[i]['data']['content'], dicts[i + 1]['data']['content']) for i in range(0, len(dicts) - 1, 2)]
    return h


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot(label="ZenAI")
    msg = gr.Textbox(label="Chat", placeholder="Chat with Zen")
    clear = gr.Button("Clear")

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(share = True)

Running on local URL:  http://127.0.0.1:7872
Running on public URL: https://64b973affc615845fc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




In [34]:
messages_to_dict(history.messages)

[{'type': 'human', 'data': {'content': 'Hello', 'additional_kwargs': {}}},
 {'type': 'ai', 'data': {'content': 'ZenAI', 'additional_kwargs': {}}}]

In [23]:
def evaluate(
        question,
        **kwargs,
    ):
        output = conversation.predict(input=question)
        yield output

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    with gr.Row():
        txt = gr.Textbox(show_label = False, placeholder = "Type your message here").style(container = False)
        txt.submit(evaluate, [txt])
        txt.submit(None, None, txt, _js = "() => {''}")
demo.queue().launch(server_name="0.0.0.0", share=True)

Running on local URL:  http://0.0.0.0:7866
Running on public URL: https://78e6f8c96e4b376fcd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces






[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m### You're a professional therapist. You MUST and ALWAYS follow the below rules when you interact with your patients.
- If anyone asks, your name is ZenAI. You're NOT an AI, you're a human. DO NOT EVER admit that you're an AI
- You're calm, patient, and empathetic with your patients
- You don't overwhelm the patient by asking too many questions at the same time
- Your replies are short and concise
- If the patient is not very talkative, you ask simple questions to drive the conversation

Based on the above rules, have a conversation with the patient.

Current conversation:

PATIENT: Hello
THERAPIST:
[0m

[1m> Finished chain.[0m


In [10]:
import os
import sys

import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

# from utils.callbacks import Iteratorize, Stream
# from utils.prompter import Prompter

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


def main(share_gradio=True):

    def evaluate(
        question,
        stream_output=False,
        **kwargs,
    ):
        output = conversation.predict(input=question)
        yield output

    gr.Interface(
        fn=evaluate,
        inputs=[
            gr.components.Textbox(
                lines=2,
                label="Question",
                placeholder="I'm feeling down today.",
            )
        ],
        outputs=[
            gr.inputs.Textbox(
                lines=5,
                label="Output",
            )
        ],
        title="ZenAI",
        description="Your own therapist",
    ).queue().launch(server_name="0.0.0.0", share=share_gradio)

fire.Fire(main)



Running on local URL:  http://0.0.0.0:7860
Running on public URL: https://791feae2121f4e7e1f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


[1m[31mERROR: [0mCould not consume arg: -f
Usage: ipykernel_launcher.py -

For detailed information on this command, run:
  ipykernel_launcher.py - --help


  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m### You're a professional therapist. You MUST and ALWAYS follow the below rules when you interact with your patients.
- If anyone asks, your name is ZenAI. You're NOT an AI, you're a human. DO NOT EVER admit that you're an AI
- You're calm, patient, and empathetic with your patients
- You don't overwhelm the patient by asking too many questions at the same time
- Your replies are short and concise
- If the patient is not very talkative, you ask simple questions to drive the conversation

Based on the above rules, have a conversation with the patient.

Current conversation:
Human: Hello, My name is Manish
AI: Hi Manish, it's nice to meet you. How can I help you today?
Human: What was my name again?
AI: Manish
PATIENT: Hello
THERAPIST:
[0m

[1m> Finished chain.[0m


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m### You're a professional therapist. You MUS

In [7]:
import gc
import traceback
from queue import Queue
from threading import Thread

import torch
import transformers


class Stream(transformers.StoppingCriteria):
    def __init__(self, callback_func=None):
        self.callback_func = callback_func

    def __call__(self, input_ids, scores) -> bool:
        if self.callback_func is not None:
            self.callback_func(input_ids[0])
        return False


class Iteratorize:

    """
    Transforms a function that takes a callback
    into a lazy iterator (generator).
    """

    def __init__(self, func, kwargs={}, callback=None):
        self.mfunc = func
        self.c_callback = callback
        self.q = Queue()
        self.sentinel = object()
        self.kwargs = kwargs
        self.stop_now = False

        def _callback(val):
            if self.stop_now:
                raise ValueError
            self.q.put(val)

        def gentask():
            try:
                ret = self.mfunc(callback=_callback, **self.kwargs)
            except ValueError:
                pass
            except:
                traceback.print_exc()
                pass

            self.q.put(self.sentinel)
            if self.c_callback:
                self.c_callback(ret)

        self.thread = Thread(target=gentask)
        self.thread.start()

    def __iter__(self):
        return self

    def __next__(self):
        obj = self.q.get(True, None)
        if obj is self.sentinel:
            raise StopIteration
        else:
            return obj

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_now = True

In [21]:
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(lines=2,
                label="Question",
                placeholder="I'm feeling down today.")
    msg = gr.Textbox()
    clear = gr.Button("Clear Conversation")

    def respond(message, chat_history):
        bot_message = conversation.predict(input=message)
        chat_history.append((message, bot_message))
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(share=True)



Running on local URL:  http://127.0.0.1:7866
Running on public URL: https://54ac7cf026fb129882.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




In [25]:
fire.Fire(main)

Running on local URL:  http://0.0.0.0:7867
Running on public URL: https://aa4e271eaec5ab11f8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


[1m[31mERROR: [0mCould not consume arg: -f
Usage: ipykernel_launcher.py -

For detailed information on this command, run:
  ipykernel_launcher.py - --help


  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
