In [1]:
# basics
import os
import sys
import time
import datetime   # 
import re

# load self-written func
sys.path.append("func/")      # add path 
import helper
import prompts

# load env variables
from dotenv import dotenv_values
from huggingface_hub import HfApi
ENV_VAR = dotenv_values("../env/.env")
Gemini_key = ENV_VAR['Gemini_key']
HF_key = ENV_VAR['HF_key']
from huggingface_hub import login
login(token=HF_key)                 # log-in for HF

# DS
import pandas as pd
import numpy as np

# DL
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM  # HF models
import google.generativeai as genai     # Gemini model
print(f"GPU detected: {torch.cuda.is_available()}")
print(f"GPU device count: {torch.cuda.device_count()}")

# AWQ quantization:
from awq import AutoAWQForCausalLM

GPU detected: True
GPU device count: 1


In [2]:
# reload module if modified
from importlib import reload
reload(prompts)

<module 'prompts' from 'd:\\code\\LLM_quantize\\func\\prompts.py'>

## Load original model

In [50]:
# load tiny llama model from HF API:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tinyllama = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, padding_side='left')



'left'

In [47]:
# save model to local:
save_folder = "../models/TinyLlama-1.1B-chat-v1.0"
tinyllama.save_pretrained(save_folder)
tokenizer.save_pretrained(save_folder)

('../models/TinyLlama-1.1B-chat-v1.0\\tokenizer_config.json',
 '../models/TinyLlama-1.1B-chat-v1.0\\special_tokens_map.json',
 '../models/TinyLlama-1.1B-chat-v1.0\\tokenizer.json')

In [48]:
# basic performance of the model:
from transformers import pipeline
# set inference config:
from transformers import GenerationConfig
generation_config = GenerationConfig(
	max_new_tokens=100, 
	do_sample=True,        # sampling or not, use greedy decoding if False
	temperature = 1,
	top_k = 30,          # default 50
	# top_p = 0.3          # default 1.0
)
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "Hey, can you recommend me sushi restaurant in new york city?"}
]
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
generator = pipeline('text-generation', model=save_folder, tokenizer=save_folder, do_sample=True, num_return_sequences=3, generation_config = generation_config)
output = generator(input_text)

In [117]:
%%time

output = generator([input_text, input_text, input_text], return_tensors="pt", num_return_sequences=1, generation_config = generation_config)

CPU times: total: 7min 33s
Wall time: 57.3 s


## Quantization

In [71]:
# load models to autoawq module:
model_path = "../models/TinyLlama-1.1B-chat-v1.0"
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# parameters:
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

In [72]:
# prepare calib_data:
calib_df = pd.read_csv('data/restaurant_chat_2024-05-11_21_38_18_603307.csv')
calib_data = []
for index, row in calib_df.iterrows():
    if index % 2 == 0:
        sys_msg = prompts.SYSTEM_MESSAGES['restaurant']
    else:
        sys_msg = prompts.SYSTEM_MESSAGES['default']
    msg = [
        {"role": "system", "content": sys_msg,},
        {"role": "user",  "content": row['user_input']},
        {'role': 'assistant', 'content': row['model_output']}
    ]
    input_text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
    calib_data.append(input_text.strip())
# input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(calib_data[5])

<|system|>

You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.
</s>
<|user|>
I'm looking for a restaurant that caters to vegetarians and vegans.</s>
<|assistant|>
For vegetarian and vegan options, check out [Restaurant Name]. Their menu offers a wide variety of plant-based dishes that are both flavorful and satisfying.</s>


In [73]:
# Quantize
model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)

AWQ: 100%|██████████| 22/22 [05:50<00:00, 15.95s/it]


In [85]:
# save model to local
output_dir = '../models/TinyLlama-1.1B-chat-v1.0-awq'
model.save_quantized(output_dir)
tokenizer.save_pretrained(output_dir)

('../models/TinyLlama-1.1B-chat-v1.0-awq\\tokenizer_config.json',
 '../models/TinyLlama-1.1B-chat-v1.0-awq\\special_tokens_map.json',
 '../models/TinyLlama-1.1B-chat-v1.0-awq\\tokenizer.json')

In [77]:
# load model back:
tinyllama_awq = AutoModelForCausalLM.from_pretrained(
    '../models/TinyLlama-1.1B-chat-v1.0-awq',
    device_map="auto",
)

In [87]:
from huggingface_hub import create_repo
create_repo("tctsung/TinyLlama-1.1B-chat-v1.0-awq", repo_type="model")

RepoUrl('https://huggingface.co/tctsung/TinyLlama-1.1B-chat-v1.0-awq', endpoint='https://huggingface.co', repo_type='model', repo_id='tctsung/TinyLlama-1.1B-chat-v1.0-awq')

In [88]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_folder(
    folder_path='../models/TinyLlama-1.1B-chat-v1.0-awq',
    repo_id="tctsung/TinyLlama-1.1B-chat-v1.0-awq",
    repo_type="model"
)

model.safetensors:   0%|          | 0.00/766M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tctsung/TinyLlama-1.1B-chat-v1.0-awq/commit/322c9a3ff9a3cdd9380fba2e3c28bdfd266e7207', commit_message='Upload folder using huggingface_hub', commit_description='', oid='322c9a3ff9a3cdd9380fba2e3c28bdfd266e7207', pr_url=None, pr_revision=None, pr_num=None)

In [80]:
x = helper.generate(
    tinyllama_awq, tokenizer, "Hi how are you", generation_config, torch.device('cuda')
)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [81]:
tokenizer.padding_side

'left'

In [84]:
print(x[0])

[INST]<<SYS>>

You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.

<</SYS>>

Hi how are you[/INST] 

Greetings! I am a dedicated and responsible assistant. I ensure that my responses are truthful, honest, and socially unbiased. 

Please, do not share any harmful, toxic or unhelpful content regarding any personal information. Also, I recommend you keep your answers simple and straightforward, avoiding technical jargon.

Wishing you warm regards. [INST]<<SYS>> 
<|user|>

