In [None]:
!pip install transformers==4.34.0
!pip install sentencepiece==0.1.99
!pip install gradio

In [None]:
import gradio as gr
import torch
import gc

# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Run on the first CUDA device when torch can see one, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Tokenizer and seq2seq weights are both fetched from the same checkpoint id.
CHECKPOINT = "df-h/viachat-t5-large-v2.0"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT).to(device)

# Smoke test: push one translation prompt through generate() and print every
# intermediate value (prompt, prompt ids, generated ids, decoded text).
input_text = 'Translate from english to german: How old are you'
print('input_text', input_text)

encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded.input_ids.to(device)
print('input_ids', input_ids)

outputs = model.generate(input_ids, max_length=500)
print('outputs', outputs)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("outputs text:", text)

def greet0(human_inputs):
    """Answer one user message with a single blocking ``model.generate`` call.

    The message is wrapped in the chat template (system preamble followed by
    the ``### Human:`` / ``### Assistant:`` markers), tokenized, passed to
    ``model.generate`` with ``max_length=500`` and decoded with
    ``skip_special_tokens=True``. The prompt, its token ids, the generated ids
    and the decoded text are printed along the way.

    Args:
        human_inputs: Text of the user's turn.

    Returns:
        The decoded text of the first generated sequence.
    """
    preamble = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

    # Three newline-separated parts; the trailing "### Assistant: " marker is
    # where the model is expected to continue.
    prompt = "\n".join([preamble, f"### Human: {human_inputs}", "### Assistant: "])
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    print('input_text', prompt)
    print('input_ids', prompt_ids)

    generated = model.generate(prompt_ids, max_length=500)
    print('outputs', generated)

    reply = tokenizer.decode(generated[0], skip_special_tokens=True)
    print("outputs text:", reply)
    return reply


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.57k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

input_text Translate from english to german: How old are you
input_ids tensor([[30355,    15, 32103,    45, 32103, 22269, 32103,    12, 32103, 13692,
            10, 32103,   571, 32103,   625, 32103,    33, 32103,    25,     1]])
outputs tensor([[    0,  2739, 32106,  4445, 32106,   436, 32106,   292,    58, 32103,
             1]])
outputs text: <pad> Wie  alt  sind  Sie?



In [None]:
# Module-level list kept so existing references to the name keep working;
# greet_stream() below accumulates into its own local buffer, not this one.
yield_tokens = []


def greet_stream(human_inputs):
    """Stream the assistant's reply to one user message, token by token.

    The message is wrapped in the chat template (system preamble followed by
    the ``### Human:`` / ``### Assistant:`` markers) and encoded once. The
    decoder is then stepped manually: each step feeds only the previously
    chosen token plus the cached ``past_key_values``, picks the highest-scoring
    next token (greedy), decodes that single token and yields the text
    accumulated so far. Generation stops at the end-of-sequence id or after
    200 steps.

    Args:
        human_inputs: Text of the user's turn.

    Yields:
        The reply generated so far, one string per generated token. If the
        very first generated token is end-of-sequence, a single empty string
        is yielded so that the consumer still receives an output.

    Returns:
        The complete reply. As this is a generator, the value is only visible
        through ``StopIteration.value`` / ``yield from``; plain iteration sees
        the yielded strings only.
    """
    system_message = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
    input_text = f"{system_message}\n### Human: {human_inputs}\n### Assistant: "
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    max_length = 200
    decoder_start_id = 0  # id fed to the decoder as its first input
    eos_id = 1  # id that terminates generation

    # Inference only: without no_grad every step would record an autograd
    # graph that the cached key/values keep alive. The context is entered per
    # step (not around the whole loop) so that grad mode is restored before
    # each `yield` hands control back to the caller.
    with torch.no_grad():
        encoder_output = model.encoder(
            input_ids=input_ids
        )['last_hidden_state']  # (1, len, d_model)

    gen_ids = torch.tensor([[decoder_start_id]]).to(device)
    past_key_values = None
    pieces = []
    for _ in range(max_length):
        with torch.no_grad():
            out = model.decoder(
                input_ids=gen_ids,
                encoder_hidden_states=encoder_output,
                use_cache=True,
                past_key_values=past_key_values,
            )
            past_key_values = out.past_key_values
            # last_hidden_state is (1, 1, d_model); logits are (1, 1, len_dict)
            lm_logits = model.lm_head(out.last_hidden_state)

        # Greedy choice; keepdim keeps the (1, 1) shape the decoder expects as
        # its next input_ids.
        gen_ids = lm_logits[0].argmax(dim=-1, keepdim=True)

        # Compare as a Python int so the check does not depend on which device
        # gen_ids lives on.
        if int(gen_ids[0, 0]) == eos_id:
            break

        pieces.append(tokenizer.decode(
            gen_ids[0].cpu(), skip_special_tokens=True))
        # No per-token gc.collect() / torch.cuda.empty_cache() here: with
        # autograd disabled there is no graph to release between steps.
        yield "".join(pieces)

    final_text = "".join(pieces)
    if not pieces:
        # Nothing was streamed (end-of-sequence came first): still emit one
        # value so the consumer is not left without any output.
        yield final_text
    return final_text

# Single three-line prompt box, pre-filled with a sample question.
prompt_box = gr.Textbox(
    label="Text 1",
    info="Initial text",
    lines=3,
    value="Who are you?",
)

# greet_stream is a generator; the plain-text output shows each string it yields.
iface = gr.Interface(fn=greet_stream, inputs=[prompt_box], outputs="text")

# Queueing is switched on before launching. Per the messages printed below, in
# Colab this enables sharing, and debug=True keeps the cell running so errors
# and logs stay visible (set debug=False to turn that off).
iface.queue()
iface.launch(debug=True)


# for value in greet_stream('who are you'):
#   print(value)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://442ce60696c84f64c2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
