<a href="https://www.kaggle.com/code/antonchernov/llm-openai-api?scriptVersionId=196992858" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Installs

In [None]:
%%capture
# Notebook setup: install the Flask-side dependencies (cell output is hidden by %%capture).

# NOTE(review): presumably the package that provides the `flask_lt` module imported later — confirm.
!pip install flask-localtunnel
!pip install flask
!pip install flask-pydantic

# !pip install transformers
# Server used by the `uvicorn main:app` launch cells below.
!pip install uvicorn

In [1]:
%%capture
# !pip install streamlit

!pip install uvicorn
!npm install -g localtunnel

!pip install fastapi
!pip install fastapi[standard]
!pip install sse-starlette

# Streamlit basic demo

In [None]:
%%writefile app.py
import streamlit as st
from transformers import pipeline
import time

# Build the GPT-2 pipeline; st.cache_resource caches the returned object
@st.cache_resource()
def load_model():
    """Return a Hugging Face text-generation pipeline for the 'gpt2' model."""
    return pipeline('text-generation', model='gpt2')

# Obtain the generation pipeline
model = load_model()

# Page header
st.title('GPT-2 Story Completer')
st.header('Enter text to complete:')

# Input controls: prompt text, length limit and number of alternatives
user_input = st.text_area('Write something to activate the AI:', height=200)
max_length = st.slider("Select max story length:", min_value=50, max_value=200, value=100, step=10)
num_sequences = st.selectbox("Select number of stories to generate:", options=[1, 2, 3], index=0)

if st.button('Generate Story'):
    with st.spinner('Generating Story...'):
        stories = model(user_input, max_length=max_length, num_return_sequences=num_sequences)
        # Render each generated sequence as its own numbered section
        for number, story in enumerate(stories, start=1):
            st.write(f'**Story {number}:**')
            st.write(story['generated_text'])
            st.markdown("---")

# Sidebar: short usage guide
sidebar = st.sidebar
sidebar.markdown("## Guide")
sidebar.info("This tool uses GPT-2 to generate a story of your provided text. Adjust the sliders to change the story length and number of stories generated. The model is optimized for short to medium length paragraphs.")
sidebar.markdown("### Examples")
sidebar.write("1. Paste a beginning to see how the AI completes it.")
sidebar.write("2. Try different settings to see how the story changes.")
# Note: You may need to add HF_TOKEN in Colab Secrets depending on the LLM. You can access your tokens from https://huggingface.co/settings/tokens

In [None]:
!curl ipv4.icanhazip.com
!streamlit run app.py &>./logs.txt & npx localtunnel --port 8501

# FastAPI basic demo

In [19]:
%%writefile main.py
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def read_root():
    # Root endpoint: always answers with the same greeting payload
    greeting = {"message": "Hello, World!"}
    return greeting


Overwriting main.py


In [None]:
# Port shared by the uvicorn server and the localtunnel client below
PORT = 8081

# Print this machine's public IPv4 address
!curl ipv4.icanhazip.com
# Serve main.py's `app` in the background and tunnel the same port with `lt`
!uvicorn main:app --host 0.0.0.0 --port {PORT} & lt --port {PORT}

# FastAPI GPT demo

In [None]:
#@title Simple app

%%writefile main.py
from fastapi import FastAPI
from pydantic import BaseModel

# Initialize FastAPI app
app = FastAPI()

# Define the request model: body accepted by POST /v1/completions
class CompletionRequest(BaseModel):
    prompt: str  # text supplied by the client
    max_tokens: int  # required in this variant (no default)

# POST endpoint that echoes the request back instead of running a model
@app.post("/v1/completions")
async def generate_completion(request: CompletionRequest):
    # No text is generated here: the reply only reports what was received
    echoed = f"Received prompt: {request.prompt}, max_tokens: {request.max_tokens}"
    return {"completion": echoed}


In [None]:
#@title Completion GPT-2 API

%%writefile main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline

# Initialize FastAPI app
app = FastAPI()

# Load the Hugging Face GPT-2 model
model = pipeline("text-generation", model="gpt2")

# Define the request model: body accepted by POST /v1/completions
class CompletionRequest(BaseModel):
    prompt: str  # text to complete
    max_tokens: int = 50  # optional; defaults to 50

# Define the response model: the endpoint returns one generated string
class CompletionResponse(BaseModel):
    completion: str

@app.post("/v1/completions", response_model=CompletionResponse)
async def generate_completion(request: CompletionRequest):
    # Complete `request.prompt` with the GPT-2 pipeline and return the first
    # (only) sequence. Any failure is reported to the client as HTTP 500.
    try:
        # `max_tokens` limits the number of *generated* tokens, so it is passed as
        # max_new_tokens. The previous `max_length` also counted the prompt
        # tokens, which left no room for generation on longer prompts.
        completions = model(request.prompt, max_new_tokens=request.max_tokens, num_return_sequences=1)
        completion_text = completions[0]["generated_text"]
        return CompletionResponse(completion=completion_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

In [None]:
#@title Embeddings API

%%writefile main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import BertTokenizer, BertModel
import torch

# Initialize FastAPI app
app = FastAPI()

# Load BERT emb_tokenizer and emb_model (a small BERT emb_model)
emb_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
emb_model = BertModel.from_pretrained("bert-base-uncased")

# Define the request model: body accepted by POST /v1/embeddings
class EmbeddingRequest(BaseModel):
    text: str  # text to embed

# Define the response models
# One embedding entry of the response list
class EmbeddingData(BaseModel):
    object: str  # the endpoint sets "embedding"
    embedding: list  # embedding values as a plain list
    index: int  # the endpoint always sets 0 (single input)

# Top-level response envelope
class EmbeddingResponse(BaseModel):
    object: str  # the endpoint sets "list"
    data: list[EmbeddingData]
    # Name of the embedding model used.
    # NOTE(review): OpenAI-style responses call this field `model`; the
    # `emb_model` name may be a leftover of a bulk rename — confirm before relying on it.
    emb_model: str

@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def generate_embeddings(request: EmbeddingRequest):
    # Embed `request.text` with BERT; any failure is reported as HTTP 500
    try:
        # Tokenize into PyTorch tensors
        encoded = emb_tokenizer(request.text, return_tensors="pt", truncation=True, padding=True)

        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            hidden = emb_model(**encoded).last_hidden_state

        # Average the token embeddings (dim 1) and flatten to a plain list
        embeddings = hidden.mean(dim=1).squeeze().tolist()

        item = EmbeddingData(object="embedding", embedding=embeddings, index=0)
        return EmbeddingResponse(object="list", data=[item], emb_model="bert-base-uncased")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

In [None]:
# Port shared by the uvicorn server and the localtunnel client below
PORT = 8089

# Print this machine's public IPv4 address
!curl ipv4.icanhazip.com
# Serve main.py's `app` in the background and tunnel the same port, requesting a fixed subdomain
!uvicorn main:app --host 0.0.0.0 --port {PORT} & lt --port {PORT} --subdomain 'myuniquesubdomain'

# Flask demo

In [None]:
#@title Simple example flask

%%writefile main.py

from flask import Flask

app = Flask(__name__)

@app.route("/", methods=['GET'])
def hello():
    """Answer GET / with a fixed plain-text greeting."""
    greeting = "Hello World!"
    return greeting

In [None]:
PORT = 8081

!curl ipv4.icanhazip.com
!uvicorn main:app --host 0.0.0.0 --port {PORT} & lt port {PORT}

In [None]:
#@title Completions API Flask

from flask import Flask, request, jsonify
from pydantic import BaseModel, ValidationError
from transformers import pipeline
from flask_pydantic import validate

# Initialize Flask app (the route below is registered on this instance)
app = Flask(__name__)

# Load the Hugging Face GPT-2 text-generation pipeline into the notebook-level name `model`
model = pipeline("text-generation", model="gpt2")

# Define the request model using Pydantic: body accepted by POST /v1/completions
class CompletionRequest(BaseModel):
    prompt: str  # text to complete
    max_tokens: int = 50  # optional; defaults to 50

# Define the response model using Pydantic: one generated string
class CompletionResponse(BaseModel):
    completion: str

@app.route("/v1/completions", methods=["POST"])
@validate(body=CompletionRequest)
def generate_completion():
    """Complete the posted prompt with the GPT-2 pipeline.

    Returns ``{"completion": ...}`` as JSON on success, ``{"error": ...}``
    with status 400 when the body fails Pydantic validation, and
    ``{"error": ...}`` with status 500 for any other failure.
    """
    try:
        # Parse request data
        body = request.get_json()
        completion_request = CompletionRequest(**body)

        # `max_tokens` limits the number of *generated* tokens, so it is passed
        # as max_new_tokens. The previous `max_length` also counted the prompt
        # tokens, which left no room for generation on longer prompts.
        completions = model(completion_request.prompt, max_new_tokens=completion_request.max_tokens, num_return_sequences=1)
        completion_text = completions[0]["generated_text"]

        # Return response in JSON format
        completion_response = CompletionResponse(completion=completion_text)
        return jsonify(completion_response.dict())
    except ValidationError as ve:
        return jsonify({"error": str(ve)}), 400
    except Exception as e:
        return jsonify({"error": str(e)}), 500

In [None]:
# flask_ngrok_example.py

if __name__ == '__main__':
    run_with_lt(app)
    !curl ipv4.icanhazip.com
    app.run()

# LLM demo

In [2]:
%%capture
# Environment setup for the LLM demo (cell output is hidden by %%capture).
!pip install pip3-autoremove

# Remove the preinstalled torch stack, then install it again together with xformers and triton
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers triton

# NOTE(review): this second remove/install pass repeats the step above for torch only — confirm it is still needed
!pip-autoremove torch -y
!pip install torch xformers triton

!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

!pip install sentence-transformers
!pip install einops

import os
# Disable Weights & Biases via its environment switch
os.environ["WANDB_DISABLED"] = "true"

# import os
# os._exit(00) # Restart the kernel

In [None]:
#@title Load model
from unsloth import FastLanguageModel
import torch

# Loader settings; all three are passed to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# (Reference list only: nothing in the visible cells reads `fourbit_models`.)
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the selected checkpoint into the notebook-level names `model` and `tokenizer`;
# the commented-out model_name lines are alternative checkpoints.
model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "unsloth/Phi-3.5-mini-instruct",
#     model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
#     model_name="unsloth/gemma-2-9b-bnb-4bit",
    model_name="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
#@title Show current memory stats
# Report the name of GPU 0, its total memory and the peak memory reserved so far,
# both converted from bytes to GB and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# alpaca_prompt = Copied from above

# Alpaca-style template with three positional slots: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Fill the template; the response slot is left empty so the model generates it.
prompt = alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )

# NOTE: this plain-text prompt replaces the templated one built just above.
prompt = "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8"
# prompt = "Why is sky blue?"
# prompt = "Why is sky blue?"


In [None]:
# Entity/relationship extraction prompt: task description, three worked examples,
# then the real input text. The mis-decoded UTF-8 byte-order mark that preceded
# "The Project Gutenberg eBook" in the real-data text has been removed so the
# model is not fed stray characters.
prompt = """
-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: One of the following types: [organization,person,geo,event]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"<|><entity_name><|><entity_type><|><entity_description>)
 
2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
 Format each relationship as ("relationship"<|><source_entity><|><target_entity><|><relationship_description><|><relationship_strength>)
 
3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **##** as the list delimiter.
 
4. When finished, output <|COMPLETE|>.
 
######################
-Examples-
######################
Example 1:
Entity_types: ORGANIZATION,PERSON
Text:
The Verdantis's Central Institution is scheduled to meet on Monday and Thursday, with the institution planning to release its latest policy decision on Thursday at 1:30 p.m. PDT, followed by a press conference where Central Institution Chair Martin Smith will take questions. Investors expect the Market Strategy Committee to hold its benchmark interest rate steady in a range of 3.5%-3.75%.
######################
Output:
("entity"<|>CENTRAL INSTITUTION<|>ORGANIZATION<|>The Central Institution is the Federal Reserve of Verdantis, which is setting interest rates on Monday and Thursday)
##
("entity"<|>MARTIN SMITH<|>PERSON<|>Martin Smith is the chair of the Central Institution)
##
("entity"<|>MARKET STRATEGY COMMITTEE<|>ORGANIZATION<|>The Central Institution committee makes key decisions about interest rates and the growth of Verdantis's money supply)
##
("relationship"<|>MARTIN SMITH<|>CENTRAL INSTITUTION<|>Martin Smith is the Chair of the Central Institution and will answer questions at a press conference<|>9)
<|COMPLETE|>

######################
Example 2:
Entity_types: ORGANIZATION
Text:
TechGlobal's (TG) stock skyrocketed in its opening day on the Global Exchange Thursday. But IPO experts warn that the semiconductor corporation's debut on the public markets isn't indicative of how other newly listed companies may perform.

TechGlobal, a formerly public company, was taken private by Vision Holdings in 2014. The well-established chip designer says it powers 85% of premium smartphones.
######################
Output:
("entity"<|>TECHGLOBAL<|>ORGANIZATION<|>TechGlobal is a stock now listed on the Global Exchange which powers 85% of premium smartphones)
##
("entity"<|>VISION HOLDINGS<|>ORGANIZATION<|>Vision Holdings is a firm that previously owned TechGlobal)
##
("relationship"<|>TECHGLOBAL<|>VISION HOLDINGS<|>Vision Holdings formerly owned TechGlobal from 2014 until present<|>5)
<|COMPLETE|>

######################
Example 3:
Entity_types: ORGANIZATION,GEO,PERSON
Text:
Five Aurelians jailed for 8 years in Firuzabad and widely regarded as hostages are on their way home to Aurelia.

The swap orchestrated by Quintara was finalized when $8bn of Firuzi funds were transferred to financial institutions in Krohaara, the capital of Quintara.

The exchange initiated in Firuzabad's capital, Tiruzia, led to the four men and one woman, who are also Firuzi nationals, boarding a chartered flight to Krohaara.

They were welcomed by senior Aurelian officials and are now on their way to Aurelia's capital, Cashion.

The Aurelians include 39-year-old businessman Samuel Namara, who has been held in Tiruzia's Alhamia Prison, as well as journalist Durke Bataglani, 59, and environmentalist Meggie Tazbah, 53, who also holds Bratinas nationality.
######################
Output:
("entity"<|>FIRUZABAD<|>GEO<|>Firuzabad held Aurelians as hostages)
##
("entity"<|>AURELIA<|>GEO<|>Country seeking to release hostages)
##
("entity"<|>QUINTARA<|>GEO<|>Country that negotiated a swap of money in exchange for hostages)
##
##
("entity"<|>TIRUZIA<|>GEO<|>Capital of Firuzabad where the Aurelians were being held)
##
("entity"<|>KROHAARA<|>GEO<|>Capital city in Quintara)
##
("entity"<|>CASHION<|>GEO<|>Capital city in Aurelia)
##
("entity"<|>SAMUEL NAMARA<|>PERSON<|>Aurelian who spent time in Tiruzia's Alhamia Prison)
##
("entity"<|>ALHAMIA PRISON<|>GEO<|>Prison in Tiruzia)
##
("entity"<|>DURKE BATAGLANI<|>PERSON<|>Aurelian journalist who was held hostage)
##
("entity"<|>MEGGIE TAZBAH<|>PERSON<|>Bratinas national and environmentalist who was held hostage)
##
("relationship"<|>FIRUZABAD<|>AURELIA<|>Firuzabad negotiated a hostage exchange with Aurelia<|>2)
##
("relationship"<|>QUINTARA<|>AURELIA<|>Quintara brokered the hostage exchange between Firuzabad and Aurelia<|>2)
##
("relationship"<|>QUINTARA<|>FIRUZABAD<|>Quintara brokered the hostage exchange between Firuzabad and Aurelia<|>2)
##
("relationship"<|>SAMUEL NAMARA<|>ALHAMIA PRISON<|>Samuel Namara was a prisoner at Alhamia prison<|>8)
##
("relationship"<|>SAMUEL NAMARA<|>MEGGIE TAZBAH<|>Samuel Namara and Meggie Tazbah were exchanged in the same hostage release<|>2)
##
("relationship"<|>SAMUEL NAMARA<|>DURKE BATAGLANI<|>Samuel Namara and Durke Bataglani were exchanged in the same hostage release<|>2)
##
("relationship"<|>MEGGIE TAZBAH<|>DURKE BATAGLANI<|>Meggie Tazbah and Durke Bataglani were exchanged in the same hostage release<|>2)
##
("relationship"<|>SAMUEL NAMARA<|>FIRUZABAD<|>Samuel Namara was a hostage in Firuzabad<|>2)
##
("relationship"<|>MEGGIE TAZBAH<|>FIRUZABAD<|>Meggie Tazbah was a hostage in Firuzabad<|>2)
##
("relationship"<|>DURKE BATAGLANI<|>FIRUZABAD<|>Durke Bataglani was a hostage in Firuzabad<|>2)
<|COMPLETE|>

######################
-Real Data-
######################
Entity_types: organization,person,geo,event
Text: The Project Gutenberg eBook of A Christmas Carol
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.
######################
"""

In [None]:
# Short test prompt: asks for a fixed number of terms and the <|COMPLETE|> end marker.
prompt = """
Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8...
Stop when you generate exactly 10 terms of a sequence.
When finished, output <|COMPLETE|>. 
Fibonnaci sequence:
"""

In [None]:
#If you have included (source_entity, target_entity), then do not include (target_entity, source_entity).

# Extra instruction lines appended to the prompt. Only the header line is active;
# the optional instructions are listed below, disabled:
#     'Be brief.',
#     'Do NOT generate any code.',
#     'Do not generate any comments.',
#     'Do not generate any notes.',
#     'Do not generate time.',
prompt_tuned_list = ['-General instructions-']

prompt_tuned = '\n'.join(prompt_tuned_list)

# Alternative placement (prepend instead of append):
# prompt = prompt_tuned + '\n' + prompt
# prompt[:len(prompt_tuned)+30]
prompt += prompt_tuned
# Cell output: the tail of the prompt, to check what was appended
prompt[-(len(prompt_tuned) + 30):]

In [None]:
%%time

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=400,
    temperature=1,
#     do_sample=True, 
#     top_p=1.0,
    repetition_penalty=1,
)

completions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
completion_text = completions[0][len(prompt):]
print(completion_text)

## LLM Flask API

In [None]:
#@title Completions API Flask

from flask import Flask, request, jsonify
from pydantic import BaseModel, ValidationError
from transformers import pipeline
from flask_pydantic import validate
from flask_lt import run_with_lt

# Initialize Flask app (the route below is registered on this instance)
app = Flask(__name__)

# Define the request model using Pydantic: body accepted by POST /v1/completions
class CompletionRequest(BaseModel):
    prompt: str  # text to complete
    max_tokens: int = 50  # optional; defaults to 50

# Define the response model using Pydantic: one generated string
class CompletionResponse(BaseModel):
    completion: str

@app.route("/v1/completions", methods=["POST"])
@validate(body=CompletionRequest)
def generate_completion():
    """Complete the posted prompt with the notebook-level LLM.

    Uses the ``model`` and ``tokenizer`` objects loaded earlier in the
    notebook. Returns ``{"completion": ...}`` as JSON on success,
    ``{"error": ...}`` with status 400 when the body fails Pydantic
    validation, and ``{"error": ...}`` with status 500 for any other failure.
    """
    try:
        # Parse request data
        body = request.get_json()
        completion_request = CompletionRequest(**body)

        inputs = tokenizer(completion_request.prompt, return_tensors = "pt").to("cuda")
        # Honour the caller's max_tokens; the limit used to be hard-coded to 64,
        # which silently ignored the request field.
        outputs = model.generate(**inputs, max_new_tokens = completion_request.max_tokens, use_cache = True)
        # NOTE(review): decoding the full output without skip_special_tokens
        # presumably returns the prompt and special tokens too — confirm intended.
        completions = tokenizer.batch_decode(outputs)
        completion_text = completions[0]

        # Return response in JSON format
        completion_response = CompletionResponse(completion=completion_text)
        return jsonify(completion_response.dict())
    except ValidationError as ve:
        return jsonify({"error": str(ve)}), 400
    except Exception as e:
        return jsonify({"error": str(e)}), 500

In [None]:
#@title Embeddings API Flask

%%writefile main.py

from flask import Flask, request, jsonify
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize Flask app
# app = Flask(__name__)

# Load BERT emb_tokenizer and emb_model (a small BERT emb_model)
emb_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
emb_model = BertModel.from_pretrained("bert-base-uncased")

# emb_tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
# emb_model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct')

@app.route("/v1/embeddings", methods=["POST"])
def generate_embeddings():
    """Embed the ``text`` field of the posted JSON with BERT.

    Returns a JSON object with ``object``, ``data`` and ``emb_model`` keys, or
    ``{"error": ...}`` with status 500 if anything raises.
    """
    try:
        # Read the JSON body; a missing "text" key falls back to an empty string
        payload = request.json
        text = payload.get("text", "")

        # Tokenize into PyTorch tensors
        encoded = emb_tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            hidden = emb_model(**encoded).last_hidden_state

        # Average the token embeddings (dim 1) and flatten to a plain list
        embeddings = hidden.mean(dim=1).squeeze().tolist()

        item = {
            "object": "embedding",
            "embedding": embeddings,
            "index": 0
        }
        return jsonify({
            "object": "list",
            "data": [item],
            "emb_model": "bert-base-uncased"
        })
    except Exception as e:
        # Report any failure as a 500 carrying the exception message
        return jsonify({"error": str(e)}), 500

In [None]:
#@title Completions + Embeddings API Flask

%%writefile main.py

from flask import Flask, request, jsonify
from pydantic import BaseModel, ValidationError
from transformers import pipeline
from flask_pydantic import validate
from flask_lt import run_with_lt
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize Flask app
app = Flask(__name__)

# Define the request model using Pydantic: body accepted by POST /v1/completions
class CompletionRequest(BaseModel):
    prompt: str  # text to complete
    max_tokens: int = 50  # optional; defaults to 50

# Define the response model using Pydantic: one generated string
class CompletionResponse(BaseModel):
    completion: str

@app.route("/v1/completions", methods=["POST"])
@validate(body=CompletionRequest)
def generate_completion():
    """Complete the posted prompt with the LLM.

    Returns ``{"completion": ...}`` as JSON on success, ``{"error": ...}``
    with status 400 when the body fails Pydantic validation, and
    ``{"error": ...}`` with status 500 for any other failure.
    """
    try:
        # Parse request data
        body = request.get_json()
        completion_request = CompletionRequest(**body)

        # NOTE(review): `model` and `tokenizer` are not defined in this cell; they
        # must exist in the running module before this endpoint is called — confirm
        # how main.py is expected to obtain them.
        inputs = tokenizer(completion_request.prompt, return_tensors = "pt").to("cuda")
        # Honour the caller's max_tokens; the limit used to be hard-coded to 64,
        # which silently ignored the request field.
        outputs = model.generate(**inputs, max_new_tokens = completion_request.max_tokens, use_cache = True)
        completions = tokenizer.batch_decode(outputs)
        completion_text = completions[0]

        # Return response in JSON format
        completion_response = CompletionResponse(completion=completion_text)
        return jsonify(completion_response.dict())
    except ValidationError as ve:
        return jsonify({"error": str(ve)}), 400
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Load the BERT tokenizer and model (bert-base-uncased) used by the embeddings endpoint below
emb_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
emb_model = BertModel.from_pretrained("bert-base-uncased")

# Alternative embedding checkpoint (disabled):
# emb_tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
# emb_model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct')

@app.route("/v1/embeddings", methods=["POST"])
def generate_embeddings():
    """Embed the ``text`` field of the posted JSON with BERT.

    Returns a JSON object with ``object``, ``data`` and ``emb_model`` keys, or
    ``{"error": ...}`` with status 500 if anything raises.
    """
    try:
        # Read the JSON body; a missing "text" key falls back to an empty string
        payload = request.json
        text = payload.get("text", "")

        # Tokenize into PyTorch tensors
        encoded = emb_tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            hidden = emb_model(**encoded).last_hidden_state

        # Average the token embeddings (dim 1) and flatten to a plain list
        embeddings = hidden.mean(dim=1).squeeze().tolist()

        item = {
            "object": "embedding",
            "embedding": embeddings,
            "index": 0
        }
        return jsonify({
            "object": "list",
            "data": [item],
            "emb_model": "bert-base-uncased"
        })
    except Exception as e:
        # Report any failure as a 500 carrying the exception message
        return jsonify({"error": str(e)}), 500

In [None]:
#@title Run app
from flask import Flask
from flask_lt import run_with_lt

# Only when this cell runs as the main module: print the public IPv4 address,
# hand the app to run_with_lt, then start Flask.
if __name__ == '__main__':
    !curl ipv4.icanhazip.com

    # NOTE(review): `subdomain` presumably requests a fixed tunnel name — confirm against flask_lt
    run_with_lt(app, subdomain='myuniquesubdomain')
    app.run()

## LLM FastAPI API

In [None]:
%%writefile main.py
#@title Completion LLM FastAPI

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline

from unsloth import FastLanguageModel
import torch

# Initialize FastAPI app
app = FastAPI()

# Loader settings; all three are passed to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# (Reference list only: nothing in this cell reads `fourbit_models`.)
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the selected checkpoint at import time; the commented-out model_name is an alternative.
model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "unsloth/Phi-3.5-mini-instruct",
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define the request model: body accepted by POST /v1/completions
class CompletionRequest(BaseModel):
    prompt: str  # text to complete
    max_tokens: int = 50  # optional; defaults to 50

# Define the response model: one generated string
class CompletionResponse(BaseModel):
    completion: str

@app.post("/v1/completions", response_model=CompletionResponse)
async def generate_completion(completion_request: CompletionRequest):
    # Complete the posted prompt with the module-level `model`/`tokenizer`.
    # Any failure is reported to the client as HTTP 500.
    try:
        # Generate the text completion
        inputs = tokenizer(completion_request.prompt, return_tensors = "pt").to("cuda")
        # Honour the caller's max_tokens; the limit used to be hard-coded to 64,
        # which silently ignored the request field.
        outputs = model.generate(**inputs, max_new_tokens = completion_request.max_tokens, use_cache = True)
        completions = tokenizer.batch_decode(outputs)
        completion_text = completions[0]

        return CompletionResponse(completion=completion_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

In [None]:
#@title Chat completion FastAPI

%%writefile main.py

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline

from unsloth import FastLanguageModel
import torch

# Initialize FastAPI app
app = FastAPI()

# Loader settings; all three are passed to FastLanguageModel.from_pretrained below.
max_seq_length = 2048
dtype = torch.float16
load_in_4bit = True

# Reference list only: nothing in this cell reads `fourbit_models`.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    # ... other models ...
]

# Load the checkpoint at import time into the module-level `model` and `tokenizer`.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3.5-mini-instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to unsloth's inference mode
FastLanguageModel.for_inference(model)

# Define the message structure for the chat-based interaction
class ChatMessage(BaseModel):
    role: str  # speaker label; the endpoint prefixes each message with it
    content: str  # message text

# Define the request model for chat-based completion
class ChatCompletionRequest(BaseModel):
    messages: list[ChatMessage]
    max_tokens: int = 50  # passed to generate() as max_new_tokens
    n: int = 1  # passed to generate() as num_return_sequences

# Define the response model for chat-based completion
class ChatCompletionResponse(BaseModel):
    completions: list[str]  # one decoded string per generated sequence

import logging

# Configure basic logging: INFO level; each record shows time, logger name, level and message
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# Module-level logger named after this module
logger = logging.getLogger(__name__)

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def generate_chat_completion(chat_request: ChatCompletionRequest):
    # Generate `n` sampled continuations for the posted conversation and return
    # them as a list of strings. Any failure is reported as HTTP 500.
    # The request is logged through the module logger rather than a bare print.
    logger.info("REQUEST: %s", chat_request)
    try:
        # Convert the chat messages into a single "role: content" prompt
        prompt = "\n".join(f"{msg.role}: {msg.content}" for msg in chat_request.messages)

        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=chat_request.max_tokens,
            do_sample=True,          # Enable sampling
#             top_p=0.9,               # Set nucleus sampling threshold
            num_return_sequences=chat_request.n  # Generate n completions
        )
        # Each returned sequence starts with the prompt tokens; drop them so the
        # response holds only the newly generated text, not an echo of the chat.
        prompt_token_count = inputs["input_ids"].shape[1]
        completion_texts = tokenizer.batch_decode(outputs[:, prompt_token_count:], skip_special_tokens=True)

        # Return all completions as a list
        return ChatCompletionResponse(completions=completion_texts)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


In [None]:
#@title [OAI] Chat Completion LLM FastAPI

%%writefile main.py

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline

from unsloth import FastLanguageModel
import torch

# Initialize FastAPI app
app = FastAPI()

# Loader settings; all three are passed to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# (Reference list only: nothing in the visible part of this cell reads `fourbit_models`.)
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the checkpoint at import time into the module-level `model` and `tokenizer`.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define the message structure for the chat-based interaction
class ChatMessage(BaseModel):
    # One conversation turn; also used to wrap each generated reply in the response.
    role: str  # Speaker label; generated replies are tagged "assistant"
    content: str  # Message text

# Define the request model for chat-based completion
class ChatCompletionRequest(BaseModel):
    # Request body of POST /v1/chat/completions.
    messages: list[ChatMessage]  # Conversation turns; the endpoint joins them into one prompt
    max_tokens: int = 50  # Forwarded to model.generate() as max_new_tokens
    n: int = 1  # Forwarded to model.generate() as num_return_sequences

# Define the response model for chat-based completion
class ChatCompletionChoice(BaseModel):
    # One generated reply inside a ChatCompletionResponse.
    message: ChatMessage  # The reply, wrapped as an "assistant" message by the endpoint
    index: int  # Position of this reply among the returned sequences

class ChatCompletionResponse(BaseModel):
    # Response body of POST /v1/chat/completions.
    id: str  # Filled with a fresh uuid4 string per response
    object: str  # The endpoint sets this to "chat.completion"
    created: int  # Unix timestamp in whole seconds (int(time.time()))
    model: str  # Model identifier string
    choices: list[ChatCompletionChoice]  # One entry per generated sequence
    usage: dict  # Keys written by the endpoint: prompt_tokens, completion_tokens, total_tokens

import logging
import uuid
import time

# Configure basic logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def generate_chat_completion(chat_request: ChatCompletionRequest):
    """Sample ``chat_request.n`` replies and return them in chat-completion shape.

    The conversation is flattened into ``role: content`` lines, tokenized,
    moved to the GPU and sampled with ``model.generate``. Only the newly
    generated token ids are decoded, so each choice carries the reply alone
    instead of the prompt followed by the reply.

    Args:
        chat_request: Messages plus ``max_tokens`` and ``n`` sampling settings.

    Returns:
        ChatCompletionResponse with one choice per generated sequence and a
        ``usage`` dict (prompt_tokens, completion_tokens, total_tokens).

    Raises:
        HTTPException: Status 500 carrying the error text if anything fails.
    """
    print(f"REQUEST: {chat_request}")
    try:
        # Convert the chat messages into a single prompt
        prompt = "\n".join(f"{msg.role}: {msg.content}" for msg in chat_request.messages)

        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        prompt_tokens = inputs["input_ids"].shape[1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=chat_request.max_tokens,
            do_sample=True,          # Enable sampling
            # top_p=0.9,             # Nucleus sampling threshold (disabled)
            num_return_sequences=chat_request.n,  # Generate n completions
        )

        # generate() returns the prompt ids followed by the continuation;
        # drop the prompt part so the reply does not echo the request.
        completion_ids = outputs[:, prompt_tokens:]
        completion_texts = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)

        # Count reply tokens once; special tokens are excluded so BOS/EOS
        # markers added by encode() do not inflate the figure.
        completion_tokens = sum(
            len(tokenizer.encode(text, add_special_tokens=False))
            for text in completion_texts
        )

        # Create response according to OpenAI Chat Completions format
        return ChatCompletionResponse(
            id=str(uuid.uuid4()),  # Generate a unique ID for the response
            object="chat.completion",
            created=int(time.time()),  # Current timestamp
            model="your-model-id",  # Replace with your actual model ID
            choices=[
                ChatCompletionChoice(
                    message=ChatMessage(role="assistant", content=text),
                    index=i,
                )
                for i, text in enumerate(completion_texts)
            ],
            usage={
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        )
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error occurred: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


In [None]:
#@title Embeddings API

%%writefile main.py

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import BertTokenizer, BertModel
import torch

# Initialize FastAPI app
app = FastAPI()

# Load the BERT tokenizer and model (bert-base-uncased)
emb_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
emb_model = BertModel.from_pretrained("bert-base-uncased")

# Define the request model
class EmbeddingRequest(BaseModel):
    # Request body of POST /v1/embeddings in this draft: a single text only.
    text: str  # Text to embed

# Define the response models
class EmbeddingData(BaseModel):
    # One embedding entry inside an EmbeddingResponse.
    object: str  # The endpoint sets this to "embedding"
    embedding: list  # Embedding values (output of tensor.tolist())
    index: int  # Position of the entry; the endpoint always writes 0

class EmbeddingResponse(BaseModel):
    # Response body of POST /v1/embeddings.
    object: str  # The endpoint sets this to "list"
    data: list[EmbeddingData]  # Embedding entries
    # NOTE(review): the later EmbeddingResponse in this file calls this field
    # `model`; `emb_model` looks like a search/replace artifact - confirm
    # before any client depends on the name.
    emb_model: str  # Name of the embedding model used

@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def generate_embeddings(request: EmbeddingRequest):
    """Embed ``request.text`` with BERT by averaging the last hidden states.

    Returns an EmbeddingResponse holding a single entry at index 0. Any
    failure is reported as an HTTP 500 carrying the error text.
    """
    try:
        # Tokenize the text into PyTorch tensors
        encoded = emb_tokenizer(request.text, return_tensors="pt", truncation=True, padding=True)

        # Forward pass without gradient tracking, then mean over dim 1
        with torch.no_grad():
            hidden = emb_model(**encoded).last_hidden_state
            vector = hidden.mean(dim=1).squeeze().tolist()

        entry = EmbeddingData(object="embedding", embedding=vector, index=0)
        return EmbeddingResponse(
            object="list",
            data=[entry],
            emb_model="bert-base-uncased",
        )
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err))

In [None]:
#@title Completion + Embeddings + Chat LLM FastAPI

%%writefile main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline
from flask import Flask, request, jsonify
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import torch

from unsloth import FastLanguageModel
import torch

# Initialize FastAPI app
app = FastAPI()

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define the request model
class CompletionRequest(BaseModel):
    # Request body of POST /v1/completions.
    prompt: str  # Text the model should continue
    # NOTE(review): the endpoint in this cell hard-codes max_new_tokens = 64
    # and never reads this field.
    max_tokens: int = 50  # Requested upper bound on generated tokens

# Define the response model
class CompletionResponse(BaseModel):
    # Response body of POST /v1/completions in this draft.
    completion: str  # First decoded sequence returned by the model

@app.post("/v1/completions", response_model=CompletionResponse)
async def generate_completion(completion_request: CompletionRequest):
    """Continue ``completion_request.prompt`` and return the generated text.

    The request's ``max_tokens`` bounds the number of new tokens. Only the
    newly generated ids are decoded, with special tokens skipped, so the
    result is the continuation alone rather than the prompt plus
    tokenizer markers.

    Raises:
        HTTPException: Status 500 carrying the error text if anything fails.
    """
    try:
        inputs = tokenizer(completion_request.prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=completion_request.max_tokens,
            use_cache=True,
        )

        # generate() returns prompt ids followed by the continuation.
        prompt_length = inputs["input_ids"].shape[1]
        new_token_ids = outputs[:, prompt_length:]
        completion_text = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

        return CompletionResponse(completion=completion_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import BertTokenizer, BertModel
import torch

# Load the BERT tokenizer and model (bert-base-uncased)
emb_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
emb_model = BertModel.from_pretrained("bert-base-uncased")

# Define the request model
class EmbeddingRequest(BaseModel):
    # Request body of POST /v1/embeddings in this draft: a single text only.
    text: str  # Text to embed

# Define the response models
class EmbeddingData(BaseModel):
    # One embedding entry inside an EmbeddingResponse.
    object: str  # The endpoint sets this to "embedding"
    embedding: list  # Embedding values (output of tensor.tolist())
    index: int  # Position of the entry; the endpoint always writes 0

class EmbeddingResponse(BaseModel):
    # Response body of POST /v1/embeddings.
    object: str  # The endpoint sets this to "list"
    data: list[EmbeddingData]  # Embedding entries
    # NOTE(review): the later EmbeddingResponse in this file calls this field
    # `model`; `emb_model` looks like a search/replace artifact - confirm.
    emb_model: str  # Name of the embedding model used

@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def generate_embeddings(request: EmbeddingRequest):
    """Return a mean-pooled BERT embedding for ``request.text``.

    The response carries exactly one EmbeddingData entry (index 0); any
    exception is converted into an HTTP 500 with the error text as detail.
    """
    try:
        tokens = emb_tokenizer(request.text, return_tensors="pt", truncation=True, padding=True)

        # No gradients are needed for inference
        with torch.no_grad():
            model_out = emb_model(**tokens)
            pooled = model_out.last_hidden_state.mean(dim=1)  # average over dim 1
            embeddings = pooled.squeeze().tolist()

        payload = [EmbeddingData(object="embedding", embedding=embeddings, index=0)]
        return EmbeddingResponse(object="list", data=payload, emb_model="bert-base-uncased")
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))

# Define the message structure for the chat-based interaction
class ChatMessage(BaseModel):
    # One conversation turn supplied by the client.
    role: str  # Speaker label; rendered as the "role:" prefix in the prompt
    content: str  # Message text

# Define the request model for chat-based completion
class ChatCompletionRequest(BaseModel):
    # Request body of POST /v1/chat/completions.
    messages: list[ChatMessage]  # Conversation turns; joined into one prompt by the endpoint
    max_tokens: int = 50  # Forwarded to model.generate() as max_new_tokens

# Define the response model for chat-based completion
class ChatCompletionResponse(BaseModel):
    # Response body of POST /v1/chat/completions in this draft (plain string).
    completion: str  # First decoded sequence returned by the model

import logging
# Configure basic logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def generate_chat_completion(chat_request: ChatCompletionRequest):
    """Generate one reply for the conversation in ``chat_request``.

    Messages are flattened into ``role: content`` lines and passed to
    ``model.generate``. Only the newly generated ids are decoded, with special
    tokens skipped, so the returned ``completion`` no longer echoes the
    prompt or tokenizer markers.

    Raises:
        HTTPException: Status 500 carrying the error text if anything fails.
    """
    print(f"REQUEST: {chat_request}")
    try:
        # Convert the chat messages into a single prompt
        prompt = "\n".join(f"{msg.role}: {msg.content}" for msg in chat_request.messages)

        # Generate the text completion
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=chat_request.max_tokens, use_cache=True)

        # generate() returns prompt ids followed by the continuation.
        prompt_length = inputs["input_ids"].shape[1]
        completion_text = tokenizer.batch_decode(
            outputs[:, prompt_length:], skip_special_tokens=True
        )[0]

        print(f"RESPONSE: {completion_text}")
        return ChatCompletionResponse(completion=completion_text)
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error generating completion: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

# [OAI] Completion + Embeddings + Chat LLM FastAPI

## [OAI] Load models

In [138]:
%%writefile main.py
#@title Models loading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline
from flask import Flask, request, jsonify
from unsloth import FastLanguageModel
import torch
from typing import List
import logging
import uuid
import time

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
# MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
MODEL_NAME="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit"

# Suppress warnings from transformers: collect every logger registered so far
# and raise the threshold of those whose name contains "transformers" to ERROR.
# Only loggers that already exist when this runs are affected.
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    if "transformers" in logger.name.lower():
        logger.setLevel(logging.ERROR)
        
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference


Overwriting main.py


In [139]:
# # %%writefile -a main.py

# from transformers import BertTokenizer, BertModel
# from transformers import AutoTokenizer, AutoModel
# from transformers import BertTokenizer, BertModel

# EMB_MODEL_NAME = "bert-base-uncased"
# emb_tokenizer = BertTokenizer.from_pretrained(EMB_MODEL_NAME)
# emb_model = BertModel.from_pretrained(EMB_MODEL_NAME)

In [140]:
# input_texts = ['search_document: TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten']

# text = input_texts[0]
# encoded_input = emb_tokenizer(text, return_tensors='pt')
# output = emb_model(**encoded_input)
# embedding = output.last_hidden_state.mean(dim=1).squeeze().tolist()
# len(embedding)

In [141]:
%%writefile -a main.py

from sentence_transformers import SentenceTransformer

# Embedding model served by /v1/embeddings; the name is also the default
# `model` value reported in embedding responses.
EMB_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
# NOTE(review): trust_remote_code=True presumably lets the model repository
# run its own Python code at load time - confirm the source is trusted.
emb_model = SentenceTransformer(EMB_MODEL_NAME, trust_remote_code=True)

Appending to main.py


In [142]:
# input_texts = ['search_document: TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten']

# embeddings = emb_model.encode(input_texts)
# len(embeddings[0])
# embeddings[0].tolist()

## [OAI] Completions

In [143]:
%%writefile -a main.py

# Initialize FastAPI app
app = FastAPI()

# # Configure basic logging
# logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# logger = logging.getLogger(__name__)

# Define the request model
class CompletionRequest(BaseModel):
    # Request body of POST /v1/completions.
    prompt: str  # Text the model should continue
    max_tokens: int = 50  # Forwarded to model.generate() as max_new_tokens

# Define the response model for completion
class CompletionChoice(BaseModel):
    # One generated continuation inside a CompletionResponse.
    text: str  # Generated text with the prompt removed
    index: int  # Position of this continuation among the returned sequences

class CompletionResponse(BaseModel):
    # Response body of POST /v1/completions.
    id: str  # Filled with a fresh uuid4 string per response
    object: str  # The endpoint sets this to "text_completion"
    created: int  # Unix timestamp in whole seconds (int(time.time()))
    model: str  # The endpoint reports MODEL_NAME here
    choices: List[CompletionChoice]  # One entry per generated sequence
    usage: dict  # Keys written by the endpoint: prompt_tokens, completion_tokens, total_tokens

@app.post("/v1/completions", response_model=CompletionResponse)
async def generate_completion(completion_request: CompletionRequest):
    """Continue the prompt and return an OpenAI-style ``text_completion``.

    The prompt is removed from the output at token level (by slicing the
    generated ids) rather than by character offset on the decoded string, so
    the result stays correct even when decoding does not reproduce the prompt
    text byte-for-byte.

    Args:
        completion_request: Prompt text and ``max_tokens`` limit.

    Returns:
        CompletionResponse with one choice per generated sequence and a
        ``usage`` dict (prompt_tokens, completion_tokens, total_tokens).

    Raises:
        HTTPException: Status 500 carrying the error text if anything fails.
    """
    print(f"\n\nREQUEST:")
    print(f"{completion_request.prompt}")

    try:
        # Generate the text completion
        inputs = tokenizer(completion_request.prompt, return_tensors="pt").to("cuda")
        prompt_tokens = inputs["input_ids"].shape[1]
        outputs = model.generate(
            **inputs,
            max_new_tokens=completion_request.max_tokens,
            do_sample=True,
            top_p=0.9,
        )

        # generate() returns prompt ids followed by the continuation; keep
        # only the continuation before decoding.
        completions = tokenizer.batch_decode(
            outputs[:, prompt_tokens:], skip_special_tokens=True
        )

        # Count once and reuse; special tokens are excluded so markers added
        # by encode() do not inflate the figure.
        completion_tokens = sum(
            len(tokenizer.encode(text, add_special_tokens=False)) for text in completions
        )

        response = CompletionResponse(
            id=str(uuid.uuid4()),  # Unique ID for the completion
            object="text_completion",
            created=int(time.time()),  # Current timestamp
            model=MODEL_NAME,  # Model used
            choices=[
                CompletionChoice(text=text, index=i)
                for i, text in enumerate(completions)
            ],
            usage={
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        )

        print(f"\nRESPONSE:")
        print(f"{completions[0]}")

        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

Appending to main.py


## [OAI] Embeddings

In [144]:
%%writefile -a main.py

from typing import List, Union
from typing import Optional

# Define the input and response models
class EmbeddingRequest(BaseModel):
    # Request body of POST /v1/embeddings; accepts one text or a list of texts.
    input: Union[str, List[str]]  # Required field
    # Optional field; only echoed back in the response, it does not select
    # which model is run.
    model: Optional[str] = None

class EmbeddingData(BaseModel):
    # One embedding entry inside an EmbeddingResponse.
    object: str                   # "embedding"
    embedding: List[float]        # Embedding vector
    index: int                    # Index of the input

class EmbeddingResponse(BaseModel):
    # Response body of POST /v1/embeddings.
    object: str                   # "list"
    data: List[EmbeddingData]     # List of embedding data
    model: str                    # Model name (request value, else EMB_MODEL_NAME)

@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def generate_embeddings(request: EmbeddingRequest):
    """Embed one text or a list of texts with the sentence-transformers model.

    Every input is prefixed with ``search_document: `` before encoding. The
    response lists one EmbeddingData per input, in input order. Failures are
    printed and reported as a generic HTTP 500.
    """
    try:
        # Normalise the input to a list of texts
        if isinstance(request.input, str):
            raw_texts = [request.input]
        else:
            raw_texts = request.input

        # MODEL CALL INTERFACE: Model only (nomic-ai)
        prefixed_texts = []
        for txt in raw_texts:
            prefixed_texts.append('search_document: ' + txt)

        vectors = emb_model.encode(prefixed_texts).tolist()

        entries = []
        for position, vector in enumerate(vectors):
            entries.append(EmbeddingData(object="embedding", embedding=vector, index=position))

        # Use default model name if not provided
        reported_model = request.model or EMB_MODEL_NAME
        return EmbeddingResponse(object="list", data=entries, model=reported_model)

    except Exception as err:
        # Handle and log unexpected errors
        print(f"Error: {str(err)}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

Appending to main.py


## [OAI] Chat completions

In [145]:
%%writefile -a main.py

# Define the message structure for the chat-based interaction
class ChatMessage(BaseModel):
    # One conversation turn; also used to wrap each generated reply in the response.
    role: str  # Speaker label; generated replies are tagged "assistant"
    content: str  # Message text

# Define the request model for chat-based completion
class ChatCompletionRequest(BaseModel):
    # Request body of POST /v1/chat/completions.
    messages: List[ChatMessage]  # Conversation turns; joined into one prompt by the endpoint
    max_tokens: int = 50  # Forwarded to model.generate() as max_new_tokens

# Define the response model for chat-based completion
class ChatCompletionChoice(BaseModel):
    # One generated reply inside a ChatCompletionResponse.
    message: ChatMessage  # The reply, wrapped as an "assistant" message by the endpoint
    index: int  # Position of this reply among the returned sequences

class ChatCompletionResponse(BaseModel):
    # Response body of POST /v1/chat/completions.
    id: str  # Filled with a fresh uuid4 string per response
    object: str  # The endpoint sets this to "chat.completion"
    created: int  # Unix timestamp in whole seconds (int(time.time()))
    model: str  # The endpoint reports MODEL_NAME here
    choices: List[ChatCompletionChoice]  # One entry per generated sequence
    usage: dict  # Keys written by the endpoint: prompt_tokens, completion_tokens, total_tokens

Appending to main.py


In [146]:
%%writefile -a main.py

# Chat completions: non-streaming

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def generate_chat_completion(chat_request: ChatCompletionRequest):
    """Generate one assistant reply in OpenAI ``chat.completion`` shape.

    Messages are flattened into ``role: content`` lines followed by an
    ``assistant:`` cue. The prompt is removed from the output at token level
    (by slicing the generated ids) rather than by character offset on the
    decoded string, so the reply stays intact even when decoding does not
    reproduce the prompt text byte-for-byte.

    Args:
        chat_request: Conversation messages and ``max_tokens`` limit.

    Returns:
        ChatCompletionResponse with a single choice and a ``usage`` dict
        (prompt_tokens, completion_tokens, total_tokens).

    Raises:
        HTTPException: Status 500 carrying the error text if anything fails.
    """
    print(f"\n\nREQUEST:")
    for msg in chat_request.messages:
        print(f"({msg.role}): {msg.content}")

    try:
        # Convert the chat messages into a single prompt
        prompt = "\n".join(f"{msg.role}: {msg.content}" for msg in chat_request.messages)
        prompt += "\nassistant:"

        # Generate the text completion
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        prompt_tokens = inputs["input_ids"].shape[1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=chat_request.max_tokens,
            do_sample=True,          # Enable sampling
            top_p=0.9,               # Set nucleus sampling threshold
            num_return_sequences=1,  # Generate a single completion
        )

        # generate() returns prompt ids followed by the continuation; keep
        # only the continuation before decoding.
        completion_texts = tokenizer.batch_decode(
            outputs[:, prompt_tokens:], skip_special_tokens=True
        )

        # Count once and reuse; special tokens are excluded so markers added
        # by encode() do not inflate the figure.
        completion_tokens = sum(
            len(tokenizer.encode(text, add_special_tokens=False))
            for text in completion_texts
        )

        # Create response according to OpenAI Chat Completions format
        response = ChatCompletionResponse(
            id=str(uuid.uuid4()),  # Generate a unique ID for the response
            object="chat.completion",
            created=int(time.time()),  # Current timestamp
            model=MODEL_NAME,  # Model used
            choices=[
                ChatCompletionChoice(
                    message=ChatMessage(role="assistant", content=text),
                    index=i,
                )
                for i, text in enumerate(completion_texts)
            ],
            usage={
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        )

        res = response.choices[0]
        print(f"\nRESPONSE:")
        print(f"({res.message.role}): {res.message.content}")

        return response
    except Exception as e:
        print(f"Error generating completion: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

Appending to main.py


In [147]:
# %%writefile -a main.py

# # Chat completions: streaming = True

# from fastapi.responses import StreamingResponse
# from sse_starlette.sse import EventSourceResponse
# import uuid
# import json
# import time

# @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
# async def generate_chat_completion(chat_request: ChatCompletionRequest):
#     print(f"\n\nREQUEST:")
#     for k, msg in enumerate(chat_request.messages):
#         print(f"({msg.role}): {msg.content}")

#     try:
#         # Convert the chat messages into a single prompt
#         prompt = "\n".join([f"{msg.role}: {msg.content}" for msg in chat_request.messages])
#         prompt += "\nassistant:"

#         # Tokenize the prompt
#         inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        
#         if chat_request.streaming:
#             # Define a generator function to yield tokens as they're generated
#             async def token_generator(prompt, max_tokens):
#                 # Tokenize the prompt
#                 input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

#                 # Initialize generated tokens with the prompt
#                 generated_tokens = input_ids
#                 total_generated_tokens = 0

#                 # Manually generate tokens one by one
#                 for _ in range(max_tokens):
#                     with torch.no_grad():
#                         # Generate the next token
#                         outputs = model(input_ids=generated_tokens)
#                         next_token_logits = outputs.logits[:, -1, :]
#                         next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)

#                         # Append the generated token to the sequence
#                         generated_tokens = torch.cat((generated_tokens, next_token_id), dim=1)

#                         # Decode and get the latest token generated
#                         next_token_text = tokenizer.decode(next_token_id[0], clean_up_tokenization_spaces=True)

#                         # Increment total generated tokens
#                         total_generated_tokens += 1

#                         print(next_token_text, end="")
#                         # Ensure we're not yielding empty strings
#                         if next_token_text.strip():
#                             # Yield the token in OpenAI API's streaming format
#                             yield f'data: {json.dumps({"choices": [{"delta": {"content": next_token_text}, "index": 0, "finish_reason": None}]})}\n\n'

#                         # Stop generation if the EOS token is encountered
#                         if next_token_id.item() == tokenizer.eos_token_id:
#                             yield f'data: {json.dumps({"choices": [{"delta": {}, "index": 0, "finish_reason": "stop"}]})}\n\n'
#                             break

#                 # If max_tokens is reached without an EOS token, send the finish event
#                 yield f'data: {json.dumps({"choices": [{"delta": {}, "index": 0, "finish_reason": "length"}]})}\n\n'


#             print(f"\nRESPONSE:\n(assistant):")
#             # Return a streaming response
#             return EventSourceResponse(token_generator(prompt, chat_request.max_tokens))

In [148]:
# %%writefile -a main.py

#     # Chat completions: streaming = False
#     else:
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=chat_request.max_tokens,
#             do_sample=True,          # Enable sampling
#             top_p=0.9,               # Set nucleus sampling threshold
#             num_return_sequences=1  # Generate a single completion
#         )
        
        
#         completion_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
#         completion_start = len(prompt)
#         completion_texts = [comp[completion_start:] for comp in completion_texts]
        
#         print(f'completion_texts len: {len(completion_texts)}')
#         print(f'completion_start: {completion_start}')
        
#         # Create response according to OpenAI Chat Completions format
#         response = ChatCompletionResponse(
#             id=str(uuid.uuid4()),  # Generate a unique ID for the response
#             object="chat.completion",
#             created=int(time.time()),  # Current timestamp
#             model=MODEL_NAME,  # Model used
#             choices=[
#                 ChatCompletionChoice(
#                     message=ChatMessage(role="assistant", content=text),
#                     index=i
#                 )
#                 for i, text in enumerate(completion_texts)
#             ],
#             usage={
#                 "prompt_tokens": len(inputs["input_ids"][0]),
#                 "completion_tokens": sum(len(tokenizer.encode(text)) for text in completion_texts),
#                 "total_tokens": len(inputs["input_ids"][0]) + sum(len(tokenizer.encode(text)) for text in completion_texts)
#             }
#         )

#         res = response.choices[0]
#         print(f"\nRESPONSE:")
#         print(f"({res.message.role}): {res.message.content}")
              
#         return response
#     except Exception as e:
#         print(f"Error generating completion: {e}")
#         raise HTTPException(status_code=500, detail=str(e))

In [149]:
%%writefile -a main.py

from fastapi import FastAPI, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse

# Exception handler to log invalid requests
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Log a rejected request and answer with HTTP 422 and the validation errors.

    The error list is passed through ``jsonable_encoder`` before it is handed
    to ``JSONResponse`` so that entries holding non-JSON-native values do not
    make the error response itself fail to serialize.
    """
    # Local import keeps this appended cell self-contained.
    from fastapi.encoders import jsonable_encoder

    errors = exc.errors()
    # Log method and URL; the bare Request object only prints its repr.
    print(f"Invalid request: {request.method} {request.url}")
    print(f"Validation error: {errors}")
    return JSONResponse(
        status_code=422,
        content={"detail": jsonable_encoder(errors)},
    )

Appending to main.py


## Launch API

In [None]:
PORT = 8089

!curl ipv4.icanhazip.com
!uvicorn main:app --host 0.0.0.0 --port {PORT} & lt --port {PORT} --subdomain 'very-unique-sub-domain'



REQUEST:
Hello, world!

RESPONSE:
 This is the [Introduction](/introduction) page
[32mINFO[0m:     79.139.204.44:0 - "[1mPOST /v1/completions HTTP/1.1[0m" [32m200 OK[0m


# Experimental

In [None]:
#@title Dummy completions API Flask

from flask import Flask, request, jsonify
from pydantic import BaseModel, ValidationError
from flask_pydantic import validate
from flask_lt import run_with_lt

# Initialize Flask app
app = Flask(__name__)

# Define the request model using Pydantic
class CompletionRequest(BaseModel):
    # Request body of the dummy POST /v1/completions route.
    prompt: str  # Prompt text (validated but not used by the dummy route)
    max_tokens: int = 50  # Token limit (validated but not used by the dummy route)

# Define the response model using Pydantic
class CompletionResponse(BaseModel):
    # Response body of the dummy POST /v1/completions route.
    completion: str  # Fixed confirmation text set by the route

@app.route("/v1/completions", methods=["POST"])
@validate(body=CompletionRequest)
def generate_completion():
    """Dummy completions route: validate the JSON body and return a fixed reply.

    Responds with the serialized CompletionResponse on success, a 400 with
    the validation message when the body does not fit CompletionRequest, and
    a 500 with the error text for anything else.
    """
    try:
        # Parse the body and run it through the request model for validation
        payload = request.get_json()
        CompletionRequest(**payload)

        # Build the canned reply and return it as JSON
        reply = CompletionResponse(completion="Successul POST request.")
        return jsonify(reply.dict())
    except ValidationError as invalid:
        return jsonify({"error": str(invalid)}), 400
    except Exception as failure:
        return jsonify({"error": str(failure)}), 500

In [None]:
#@title Custom Local Tunnel

import time
from threading import Timer
from py_localtunnel.lt import run_localtunnel
import threading
import concurrent
import logging

__version__ = "1.0.7"

class HaltableTimer(Timer):
    """Timer thread that logs a heartbeat until it is told to halt.

    ``run`` is overridden, so the function handed to ``Timer`` is never
    invoked by this class; the thread only logs and sleeps in a loop.

    Args:
        *args: Positional arguments forwarded to ``threading.Timer``.
        name: Thread name used in the heartbeat log line.
        delay: Seconds to sleep between heartbeats.
        **kwargs: Keyword arguments forwarded to ``threading.Timer``.
    """

    def __init__(self, *args, name='DefaultTimer', delay=5, **kwargs):
        super().__init__(*args, **kwargs)
        self.halt = False
        self.delay = delay
        self.name = name

    def run(self):
        """Log a heartbeat every ``delay`` seconds until ``halt`` is set."""
        while True:
            if self.halt:
                return
            logging.info('Thread %s running.' % self.name)
            time.sleep(self.delay)

    def do_halt(self):
        """Request the loop to stop and block until the thread has finished."""
        self.halt = True
        self.join()

def run_lt(port: int, subdomain: str = None, local_host: str = "127.0.0.1"):
    """Open a localtunnel for ``port`` and hand back the callee's result.

    Args:
        port: Local port to expose.
        subdomain: Requested tunnel subdomain, or None for no preference.
        local_host: Local host address the tunnel forwards to.

    Returns:
        Whatever ``run_localtunnel`` returns. Previously the value was
        dropped, so callers always received None.
    """
    return run_localtunnel(port, subdomain, local_host)

def start_lt(port: int, subdomain: str = None):
    """Run the tunnel through ``run_lt`` and print whatever it returns."""
    tunnel_address = run_lt(port, subdomain)
    print(tunnel_address)

def monitor_process(process, stop_event):
    """Watch ``process`` until it exits or ``stop_event`` is set, then clean up.

    Args:
        process: Object exposing ``poll()``, ``terminate()`` and ``wait()``.
        stop_event: ``threading.Event`` used to request a stop.

    If the process is still running once the loop ends (a stop was
    requested), it is terminated and waited for.
    """
    while not stop_event.is_set():
        if process.poll() is not None:
            break
        # Event.wait serves as the 1-second poll interval but returns as soon
        # as a stop is requested, instead of always sleeping the full second.
        stop_event.wait(1)

    if process.poll() is None:
        process.terminate()
        process.wait()

# Create a stop event
stop_event = threading.Event()

def run_with_lt(app, subdomain: str = None):
    old_run = app.run

    def new_run(*args, **kwargs):
        port = kwargs.get('port', 5000)

        # with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        #     future = executor.submit(start_lt, port, subdomain)
        #     process = future.result()
        #     monitor_process(process, stop_event)
        #     time.sleep(5)
        #     stop_event.set()

        # if process.poll() is None:
        #     process.terminate()
        #     process.wait()

        thread = Timer(1, start_lt, args=(port, subdomain,))
        thread.setDaemon(True)
        thread.start()

        # stop_event = threading.Event()
        # time.sleep(5)
        # stop_event.set()
        # thread.join()

        old_run(*args, **kwargs)
    app.run = new_run