In [1]:
import base64
import copy
import json
import time
from argparse import ArgumentParser
from contextlib import asynccontextmanager
from pprint import pprint
from typing import Dict, List, Literal, Optional, Union
import numpy

import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, field_validator
from sse_starlette.sse import EventSourceResponse
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import Response
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

In [3]:
tokenizer = AutoTokenizer.from_pretrained(
        '/mnt/resource/public_models/Qwen_Qwen1.5-14B-Chat',
        trust_remote_code=True,
    )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
model = AutoModelForCausalLM.from_pretrained(
        '/mnt/resource/public_models/Qwen_Qwen1.5-14B-Chat',
        torch_dtype="auto",
        trust_remote_code=True,
    ).eval()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [20]:
text = "鲁迅暴打周树人哈哈哈哈"    
    

model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
model_inputs

{'input_ids': tensor([[113183,  99956,  75437,  40542,  99613,  17340, 118687]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [21]:
model.to("cuda")
outputs = model(**model_inputs)
outputs

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 3.6406,  4.4688,  2.1250,  ..., -1.8203, -1.8906, -2.0156],
         [ 6.3438,  4.4062,  7.3750,  ..., -3.8594, -3.1406, -3.2031],
         [ 7.0938, 12.5625,  4.6562,  ..., -2.6875, -2.3281, -1.8828],
         ...,
         [ 7.6250,  3.3281,  5.0938,  ..., -2.6562, -2.5469, -2.4688],
         [ 7.9688,  4.1562,  4.8125,  ..., -4.4688, -3.1875, -2.4531],
         [10.9375,  3.9219,  6.3125,  ..., -4.4062, -2.9219, -2.1250]]],
       device='cuda:0', grad_fn=<ToCopyBackward0>), past_key_values=((tensor([[[[ 1.9531, -0.1436,  1.9922,  ...,  0.3652,  0.1289,  0.0747],
          [ 1.7031,  1.4453,  2.0469,  ..., -0.1885, -0.2754, -0.0121],
          [-0.3867,  2.3281,  1.1406,  ..., -0.1338, -0.2266,  0.0427],
          ...,
          [-1.8594, -0.0679, -1.6016,  ..., -0.5430, -0.5156, -0.0718],
          [ 0.0938, -1.7812, -2.2969,  ..., -0.8320, -0.5664, -0.0952],
          [ 1.7109, -1.9766, -1.7500,  ...,  0.3281, -0.0562, -0.2334]],

In [23]:
generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024
        )
generated_ids

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


tensor([[113183,  99956,  75437,  40542,  99613,  17340, 118687,    271, 106249,
          15235,   3837,  35946,  53153, 100143,  57191, 104125, 116389,   9370,
         105905, 101070,   1773, 113183,  33108,  40542,  99613,  17340,   9909,
          91676,  40542,  19403,  17340,   7552,  20412, 105352,  58695,  59258,
         100390, 105891, 104179,  45629,   3837,  99650, 104186, 100145, 111592,
         104380, 101913, 101069,  33108, 110691,   3837, 104610,  99605, 100697,
         102852,   1773, 111815, 104430, 100022,  33108,  99348,   3837, 100020,
         104056, 104179, 102174,  33108, 100383, 102007,   3837, 104610,  44063,
          99650,  99605,  32108,  17714, 104945, 102109, 111167,   1773, 151643]],
       device='cuda:0')

In [None]:
generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
generated_ids

In [None]:
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
response