<a href="https://www.kaggle.com/code/antonchernov/llm-openai-api?scriptVersionId=196999610" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Installs

In [1]:
%%capture
# Install the serving stack. %%capture (above) hides the installer output.
# !pip install streamlit

# uvicorn is the ASGI server used in the launch cell to run main:app;
# localtunnel provides the `lt` command used there to expose the port.
!pip install uvicorn
!npm install -g localtunnel

# FastAPI itself plus its "standard" extras.
# sse-starlette is only referenced by the commented-out streaming cell.
!pip install fastapi
!pip install fastapi[standard]
!pip install sse-starlette

In [2]:
%%capture
# Install the model stack. %%capture (above) hides the installer output.
# pip3-autoremove is installed first; presumably it provides the
# `pip-autoremove` command used below — confirm.
!pip install pip3-autoremove

# Remove the preinstalled torch packages, then install torch together with xformers and triton.
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers triton

# NOTE(review): this second pass repeats the remove/install step for torch only —
# confirm whether both passes are required.
!pip-autoremove torch -y
!pip install torch xformers triton

# Unsloth is installed from GitHub with the "kaggle-new" extra.
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# sentence-transformers loads the embedding model in main.py.
# NOTE(review): einops is presumably required by that model's remote code — confirm.
!pip install sentence-transformers
!pip install einops

# Set WANDB_DISABLED in this kernel's environment.
import os
os.environ["WANDB_DISABLED"] = "true"

# import os
# os._exit(00) # Restart the kernel

# [OAI] FastAPI LLM app

## [OAI] Load models

In [26]:
%%writefile main.py
#@title Models loading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline
from flask import Flask, request, jsonify
from unsloth import FastLanguageModel
import torch
from typing import List
import logging
import uuid
import time

# Loader settings forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in this notebook reads it. The checkpoint that
# is actually loaded is MODEL_NAME below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Checkpoint served by the API; also echoed in the "model" field of responses.
# Uncomment one of the alternatives to switch models.
# MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
# MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
MODEL_NAME="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit"

# Suppress warning
# Every logger registered so far whose name contains "transformers" is raised
# to ERROR level. Loggers created after this point are not affected.
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    if "transformers" in logger.name.lower():
        logger.setLevel(logging.ERROR)
        
# Load the language model and its tokenizer once, at import time of main.py.
# Both are module-level globals used by the request handlers.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference


Overwriting main.py


In [27]:
# # %%writefile -a main.py

# from transformers import BertTokenizer, BertModel
# from transformers import AutoTokenizer, AutoModel
# from transformers import BertTokenizer, BertModel

# EMB_MODEL_NAME = "bert-base-uncased"
# emb_tokenizer = BertTokenizer.from_pretrained(EMB_MODEL_NAME)
# emb_model = BertModel.from_pretrained(EMB_MODEL_NAME)

In [29]:
%%writefile -a main.py

from sentence_transformers import SentenceTransformer

# Embedding checkpoint served by /v1/embeddings; loaded once at import time.
EMB_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
# NOTE(review): trust_remote_code=True allows code shipped in the model repository
# to run locally — only keep it for repositories you trust.
emb_model = SentenceTransformer(EMB_MODEL_NAME, trust_remote_code=True)

Appending to main.py


## [OAI] Completions

In [31]:
%%writefile -a main.py

# Initialize FastAPI app
# Module-level ASGI application; the route decorators below register on it and
# the launch cell serves it as `main:app`.
app = FastAPI()

# # Configure basic logging
# logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# logger = logging.getLogger(__name__)

# Define the request model
# Request body accepted by POST /v1/completions.
class CompletionRequest(BaseModel):
    prompt: str           # text the model is asked to continue
    max_tokens: int = 50  # forwarded to model.generate as max_new_tokens

# Define the response model for completion
# One generated continuation inside a CompletionResponse.
class CompletionChoice(BaseModel):
    text: str   # generated text for this choice
    index: int  # position of this choice in the response's `choices` list

# Response body returned by POST /v1/completions.
class CompletionResponse(BaseModel):
    id: str                         # the handler fills this with a uuid4 string
    object: str                     # the handler sets "text_completion"
    created: int                    # integer timestamp from time.time()
    model: str                      # the handler sets MODEL_NAME
    choices: List[CompletionChoice] # one entry per generated sequence
    usage: dict                     # keys: prompt_tokens, completion_tokens, total_tokens

@app.post("/v1/completions", response_model=CompletionResponse)
async def generate_completion(completion_request: CompletionRequest):
    """Generate a text completion for a raw prompt.

    The prompt is tokenized, passed to ``model.generate`` with nucleus sampling
    (``top_p=0.9``), and only the newly generated tokens are decoded and returned.

    Args:
        completion_request: Request body with the ``prompt`` and the maximum
            number of new tokens (``max_tokens``).

    Returns:
        CompletionResponse: Generated choices plus token usage counts.

    Raises:
        HTTPException: 500 with the error message if tokenization, generation
            or response construction fails.
    """
    print(f"\n\nREQUEST:")
    print(f"{completion_request.prompt}")

    try:
        # Put the inputs on whatever device the model lives on instead of
        # assuming "cuda".
        inputs = tokenizer(completion_request.prompt, return_tensors="pt").to(model.device)
        prompt_tokens = int(inputs["input_ids"].shape[1])

        outputs = model.generate(
            **inputs,
            max_new_tokens=completion_request.max_tokens,
            do_sample=True,
            top_p=0.9,
        )

        # The generated sequences start with the prompt tokens. Drop them by
        # token position rather than slicing the decoded text by len(prompt)
        # characters: decoding does not always reproduce the prompt exactly,
        # which would shift a character-based cut.
        generated = outputs[:, prompt_tokens:]
        completions = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Count the tokens that were actually generated. Re-encoding the decoded
        # text can add special tokens and re-tokenize differently.
        completion_tokens = int(generated.numel())

        response = CompletionResponse(
            id=str(uuid.uuid4()),  # Unique ID for the completion
            object="text_completion",
            created=int(time.time()),  # Current timestamp
            model=MODEL_NAME,  # Model used
            choices=[
                CompletionChoice(text=text, index=i)
                for i, text in enumerate(completions)
            ],
            usage={
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        )

        print(f"\nRESPONSE:")
        print(f"{completions[0]}")

        return response
    except Exception as e:
        # Log server-side as well, matching the chat completions endpoint.
        print(f"Error generating completion: {e}")
        raise HTTPException(status_code=500, detail=str(e))

Appending to main.py


## [OAI] Embeddings

In [32]:
%%writefile -a main.py

from typing import List, Union
from typing import Optional

# Define the input and response models
# Request body accepted by POST /v1/embeddings.
class EmbeddingRequest(BaseModel):
    input: Union[str, List[str]]  # Required: a single text or a list of texts to embed
    model: Optional[str] = None   # Optional: only echoed back in the response; it does not select the model

# One embedding vector inside an EmbeddingResponse.
class EmbeddingData(BaseModel):
    object: str                   # "embedding"
    embedding: List[float]        # Embedding vector
    index: int                    # Position of the corresponding text in the request input

# Response body returned by POST /v1/embeddings.
class EmbeddingResponse(BaseModel):
    object: str                   # "list"
    data: List[EmbeddingData]     # One entry per input text, in input order
    model: str                    # Requested model name, or EMB_MODEL_NAME when none was given

# POST /v1/embeddings: embed one text or a list of texts with the
# sentence-transformers model and return the vectors in request order.
# Any failure is printed and reported to the client as a generic 500.
@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def generate_embeddings(request: EmbeddingRequest):
    try:
        # Normalise the payload so a single string is handled like a one-item list.
        payload = request.input
        if isinstance(payload, str):
            texts = [payload]
        else:
            texts = payload

        # Model-only call interface (nomic-ai): each text gets the
        # "search_document: " prefix before encoding. A tokenizer + model
        # interface (e.g. BERT) would need its own tokenisation step instead.
        prefixed_texts = []
        for item in texts:
            prefixed_texts.append("search_document: " + item)

        vectors = emb_model.encode(prefixed_texts).tolist()

        data = []
        for position, vector in enumerate(vectors):
            data.append(
                EmbeddingData(object="embedding", embedding=vector, index=position)
            )

        # Echo the caller's model name when given, else the default model name.
        reported_model = request.model or EMB_MODEL_NAME
        return EmbeddingResponse(object="list", data=data, model=reported_model)

    except Exception as err:
        # Handle and log unexpected errors; the detail is not sent to the client.
        print(f"Error: {str(err)}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

Appending to main.py


## [OAI] Chat completions

In [33]:
%%writefile -a main.py

# Define the message structure for the chat-based interaction
# Used for incoming request messages and for the assistant message in responses.
class ChatMessage(BaseModel):
    role: str     # speaker label; the handler uses it as the "<role>: " prefix in the prompt
    content: str  # message text

# Define the request model for chat-based completion
# Request body accepted by POST /v1/chat/completions.
class ChatCompletionRequest(BaseModel):
    messages: List[ChatMessage]  # conversation so far, in order
    max_tokens: int = 50         # forwarded to model.generate as max_new_tokens

# Define the response model for chat-based completion
# One generated assistant reply inside a ChatCompletionResponse.
class ChatCompletionChoice(BaseModel):
    message: ChatMessage  # the generated reply; the handler sets role="assistant"
    index: int            # position of this choice in the response's `choices` list

# Response body returned by POST /v1/chat/completions.
class ChatCompletionResponse(BaseModel):
    id: str                             # the handler fills this with a uuid4 string
    object: str                         # the handler sets "chat.completion"
    created: int                        # integer timestamp from time.time()
    model: str                          # the handler sets MODEL_NAME
    choices: List[ChatCompletionChoice] # one entry per generated sequence
    usage: dict                         # keys: prompt_tokens, completion_tokens, total_tokens

Appending to main.py


In [34]:
%%writefile -a main.py

# Chat completions: non-streaming

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def generate_chat_completion(chat_request: ChatCompletionRequest):
    """Generate one assistant reply for a list of chat messages (non-streaming).

    The messages are flattened into a plain-text prompt of ``"<role>: <content>"``
    lines followed by ``"assistant:"``. The model continues that prompt with
    nucleus sampling (``top_p=0.9``) and only the newly generated tokens are
    decoded and returned.

    Args:
        chat_request: Request body with the conversation ``messages`` and the
            maximum number of new tokens (``max_tokens``).

    Returns:
        ChatCompletionResponse: The assistant message plus token usage counts.

    Raises:
        HTTPException: 500 with the error message if tokenization, generation
            or response construction fails.
    """
    print(f"\n\nREQUEST:")
    for msg in chat_request.messages:
        print(f"({msg.role}): {msg.content}")

    try:
        # Convert the chat messages into a single prompt.
        # NOTE(review): this is a hand-rolled format; instruct checkpoints
        # usually ship a chat template (tokenizer.apply_chat_template) —
        # consider switching after confirming the template accepts these roles.
        prompt = "\n".join(f"{msg.role}: {msg.content}" for msg in chat_request.messages)
        prompt += "\nassistant:"

        # Put the inputs on whatever device the model lives on instead of
        # assuming "cuda".
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_tokens = int(inputs["input_ids"].shape[1])

        outputs = model.generate(
            **inputs,
            max_new_tokens=chat_request.max_tokens,
            do_sample=True,          # Enable sampling
            top_p=0.9,               # Set nucleus sampling threshold
            num_return_sequences=1,  # Generate a single completion
        )

        # The generated sequence starts with the prompt tokens. Drop them by
        # token position rather than slicing the decoded text by len(prompt)
        # characters: decoding does not always reproduce the prompt exactly,
        # which would shift a character-based cut.
        generated = outputs[:, prompt_tokens:]
        completion_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Count the tokens that were actually generated. Re-encoding the decoded
        # text can add special tokens and re-tokenize differently.
        completion_tokens = int(generated.numel())

        # Create response according to OpenAI Chat Completions format
        response = ChatCompletionResponse(
            id=str(uuid.uuid4()),  # Generate a unique ID for the response
            object="chat.completion",
            created=int(time.time()),  # Current timestamp
            model=MODEL_NAME,  # Model used
            choices=[
                ChatCompletionChoice(
                    message=ChatMessage(role="assistant", content=text),
                    index=i,
                )
                for i, text in enumerate(completion_texts)
            ],
            usage={
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        )

        res = response.choices[0]
        print(f"\nRESPONSE:")
        print(f"({res.message.role}): {res.message.content}")

        return response
    except Exception as e:
        print(f"Error generating completion: {e}")
        raise HTTPException(status_code=500, detail=str(e))

Appending to main.py


In [35]:
# %%writefile -a main.py

# # Chat completions: streaming = True

# from fastapi.responses import StreamingResponse
# from sse_starlette.sse import EventSourceResponse
# import uuid
# import json
# import time

# @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
# async def generate_chat_completion(chat_request: ChatCompletionRequest):
#     print(f"\n\nREQUEST:")
#     for k, msg in enumerate(chat_request.messages):
#         print(f"({msg.role}): {msg.content}")

#     try:
#         # Convert the chat messages into a single prompt
#         prompt = "\n".join([f"{msg.role}: {msg.content}" for msg in chat_request.messages])
#         prompt += "\nassistant:"

#         # Tokenize the prompt
#         inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        
#         if chat_request.streaming:
#             # Define a generator function to yield tokens as they're generated
#             async def token_generator(prompt, max_tokens):
#                 # Tokenize the prompt
#                 input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

#                 # Initialize generated tokens with the prompt
#                 generated_tokens = input_ids
#                 total_generated_tokens = 0

#                 # Manually generate tokens one by one
#                 for _ in range(max_tokens):
#                     with torch.no_grad():
#                         # Generate the next token
#                         outputs = model(input_ids=generated_tokens)
#                         next_token_logits = outputs.logits[:, -1, :]
#                         next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)

#                         # Append the generated token to the sequence
#                         generated_tokens = torch.cat((generated_tokens, next_token_id), dim=1)

#                         # Decode and get the latest token generated
#                         next_token_text = tokenizer.decode(next_token_id[0], clean_up_tokenization_spaces=True)

#                         # Increment total generated tokens
#                         total_generated_tokens += 1

#                         print(next_token_text, end="")
#                         # Ensure we're not yielding empty strings
#                         if next_token_text.strip():
#                             # Yield the token in OpenAI API's streaming format
#                             yield f'data: {json.dumps({"choices": [{"delta": {"content": next_token_text}, "index": 0, "finish_reason": None}]})}\n\n'

#                         # Stop generation if the EOS token is encountered
#                         if next_token_id.item() == tokenizer.eos_token_id:
#                             yield f'data: {json.dumps({"choices": [{"delta": {}, "index": 0, "finish_reason": "stop"}]})}\n\n'
#                             break

#                 # If max_tokens is reached without an EOS token, send the finish event
#                 yield f'data: {json.dumps({"choices": [{"delta": {}, "index": 0, "finish_reason": "length"}]})}\n\n'


#             print(f"\nRESPONSE:\n(assistant):")
#             # Return a streaming response
#             return EventSourceResponse(token_generator(prompt, chat_request.max_tokens))

In [36]:
# %%writefile -a main.py

#     # Chat completions: streaming = False
#     else:
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=chat_request.max_tokens,
#             do_sample=True,          # Enable sampling
#             top_p=0.9,               # Set nucleus sampling threshold
#             num_return_sequences=1  # Generate a single completion
#         )
        
        
#         completion_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
#         completion_start = len(prompt)
#         completion_texts = [comp[completion_start:] for comp in completion_texts]
        
#         print(f'completion_texts len: {len(completion_texts)}')
#         print(f'completion_start: {completion_start}')
        
#         # Create response according to OpenAI Chat Completions format
#         response = ChatCompletionResponse(
#             id=str(uuid.uuid4()),  # Generate a unique ID for the response
#             object="chat.completion",
#             created=int(time.time()),  # Current timestamp
#             model=MODEL_NAME,  # Model used
#             choices=[
#                 ChatCompletionChoice(
#                     message=ChatMessage(role="assistant", content=text),
#                     index=i
#                 )
#                 for i, text in enumerate(completion_texts)
#             ],
#             usage={
#                 "prompt_tokens": len(inputs["input_ids"][0]),
#                 "completion_tokens": sum(len(tokenizer.encode(text)) for text in completion_texts),
#                 "total_tokens": len(inputs["input_ids"][0]) + sum(len(tokenizer.encode(text)) for text in completion_texts)
#             }
#         )

#         res = response.choices[0]
#         print(f"\nRESPONSE:")
#         print(f"({res.message.role}): {res.message.content}")
              
#         return response
#     except Exception as e:
#         print(f"Error generating completion: {e}")
#         raise HTTPException(status_code=500, detail=str(e))

In [37]:
%%writefile -a main.py

from fastapi import FastAPI, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse

# Exception handler to log invalid requests
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Log a request that failed validation and return a 422 response.

    Args:
        request: The incoming request that failed validation.
        exc: The validation error raised by FastAPI.

    Returns:
        JSONResponse: Status 422 with the validation errors under ``"detail"``.
    """
    # Imported here so this block stays self-contained within main.py.
    from fastapi.encoders import jsonable_encoder

    errors = exc.errors()

    # Log the method and URL; formatting the Request object itself only prints
    # its repr, which does not identify the failing call.
    print(f"Invalid request: {request.method} {request.url}")
    print(f"Validation error: {errors}")

    # Validation errors may carry values that are not JSON-serialisable (for
    # example exception objects in the error context); encode them first so
    # building the response cannot fail.
    return JSONResponse(
        status_code=422,
        content={"detail": jsonable_encoder(errors)},
    )

Appending to main.py


## Launch API

In [None]:
# Port that uvicorn listens on and that localtunnel forwards to.
PORT = 8089
# Subdomain requested from localtunnel (it appears in the "your url is" line of the output).
SUBDOMAIN = 'very-unique-sub-domain'

# Print this machine's public IPv4 address.
# NOTE(review): presumably needed as the localtunnel access password — confirm.
!curl ipv4.icanhazip.com
# Start the API server in the background (&) and keep localtunnel in the foreground;
# {PORT} and {SUBDOMAIN} are interpolated from the Python variables above.
!uvicorn main:app --host 0.0.0.0 --port {PORT} & lt --port {PORT} --subdomain {SUBDOMAIN}

34.127.45.136
your url is: https://very-unique-sub-domain.loca.lt
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9: Fast Mistral patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:05<00:00,  2.80s/it]
[32mINFO[0m:     Started server process [[36m558[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8089[0m (Press CTRL+C to quit)


REQUEST:
(system): Always answer in rhythmes.
(user): Why is sky blue?

RESPONSE:
(as