In [None]:
#@markdown # Dependencies

from IPython.utils import capture
import time
import os

print('Installing dependencies...')
with capture.capture_output() as cap:
    %cd /content/
    !git clone https://github.com/xaiguy/chippy
    %cd chippy
    !pip install -r requirements.txt
    !pip install accelerate
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

print('Done, proceed')

In [None]:
from typing import List
from pydantic import BaseModel

from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Pydantic model describing a request payload with one required string field.
# It is not referenced by any other code visible in this notebook cell.
class Data(BaseModel):
    # Raw prompt text supplied by the user
    input_prompt: str

# Single source of truth for the checkpoint so that the model and the
# tokenizer are always loaded from the same repository.
MODEL_NAME = "Rallio67/chip_1.4B_instruct_alpha"

# Load the causal LM; device_map="auto" leaves device placement to the library.
model = GPTNeoXForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    #load_in_8bit=True
)

# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def model_predict(prompt):
    """Generate a reply to ``prompt`` with the globally loaded model.

    The prompt is prefixed with "User: " before tokenization. The whole
    decoded sequence is returned with every "<|endoftext|>" marker removed.

    Args:
        prompt: The user's message as a plain string.

    Returns:
        The decoded output sequence as a string.
    """
    # Put the inputs on the model's own device instead of assuming CUDA: the
    # model is loaded with device_map="auto", so it may not live on "cuda".
    inputs = tokenizer("User: " + prompt, return_tensors="pt").to(model.device)

    tokens = model.generate(
        **inputs,
        # Sampling must be enabled explicitly; otherwise decoding is greedy and
        # top_p / temperature / top_k below are ignored.
        do_sample=True,
        top_p=0.95,
        temperature=0.5,
        top_k=4,
        repetition_penalty=1.03,
        # NOTE: max_length counts the prompt tokens as well as the new ones.
        max_length=100,
    )

    # Only the first (and only) sequence of the batch is decoded.
    output = tokenizer.decode(tokens[0])
    return output.replace("<|endoftext|>", "")

In [None]:
# Let's test it: send one sample prompt through model_predict and print the reply
prompt="Hello, what are you?"
reply = model_predict(prompt)
print(reply)

In [None]:
#@markdown # Writing Streamlit app file

%%writefile app.py
# Imports
import streamlit as st
from streamlit_chat import message
import requests
import regex as re

from typing import List
from pydantic import BaseModel

from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Simple pydantic data structure with one required string field.
# It is not referenced by the rest of the app code visible here.
class Data(BaseModel):
    # Raw prompt text supplied by the user
    input_prompt: str

# Checkpoint shared by the model and the tokenizer
MODEL_NAME = "Rallio67/chip_1.4B_instruct_alpha"

# Streamlit re-executes this script from top to bottom on every interaction,
# so an uncached from_pretrained() call would reload the weights for every
# prompt. Use whichever resource cache this Streamlit version provides and
# fall back to a no-op decorator (the original, uncached behaviour) otherwise.
_cache_resource = (
    getattr(st, "cache_resource", None)
    or getattr(st, "experimental_singleton", None)
    or (lambda func: func)
)


@_cache_resource
def load_model_and_tokenizer():
    """Load the model and its tokenizer once and reuse them across reruns.

    Returns:
        A ``(model, tokenizer)`` tuple loaded from ``MODEL_NAME``.
    """
    loaded_model = GPTNeoXForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        #load_in_8bit=True
    )
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return loaded_model, loaded_tokenizer


# Load the model and the tokenizer (kept as module-level names for the app)
model, tokenizer = load_model_and_tokenizer()

# Define prediction function
def model_predict(prompt):
    """Generate a reply to ``prompt`` with the globally loaded model.

    The prompt is prefixed with "User: " before tokenization. The whole
    decoded sequence is returned with every "<|endoftext|>" marker removed.

    Args:
        prompt: The user's message as a plain string.

    Returns:
        The decoded output sequence as a string.
    """
    # Put the inputs on the model's own device instead of assuming CUDA: the
    # model is loaded with device_map="auto", so it may not live on "cuda".
    inputs = tokenizer("User: " + prompt, return_tensors="pt").to(model.device)

    tokens = model.generate(
        **inputs,
        # Sampling must be enabled explicitly; otherwise decoding is greedy and
        # top_p / temperature / top_k below are ignored.
        do_sample=True,
        top_p=0.95,
        temperature=0.5,
        top_k=4,
        repetition_penalty=1.03,
        # NOTE: max_length counts the prompt tokens as well as the new ones.
        max_length=100,
    )

    # Only the first (and only) sequence of the batch is decoded.
    output = tokenizer.decode(tokens[0])
    return output.replace("<|endoftext|>", "")

# Page heading
st.title("Chippy Google Colab")

# Prompt box, pre-filled with a sample question
input_prompt = st.text_input("Enter a prompt", "What is a Large Language Model?")

# Empty slot reserved for the latest message; this run's history starts
# with the current prompt as its only entry
placeholder = st.empty()
message_history = [input_prompt]

# Entries at even positions are rendered as user chat bubbles
for user_turn in message_history[::2]:
    message(user_turn, is_user=True)

# Ask the model, strip the leading "User: ... Chip: " part, and show the answer
res = model_predict(prompt=input_prompt)
cleaned_answer = re.sub("User:.+\n+Chip: ", "", res)
message(cleaned_answer)

In [None]:
# Might take some time since the model is loaded INSIDE the app; normally it would sit behind an API instead.
# Streamlit is started in the background (`&`) with its stdout discarded, then localtunnel
# exposes port 8501 (presumably the port Streamlit is serving on — confirm).
!streamlit run app.py > /dev/null & npx localtunnel --port 8501