In [1]:
!pip install autoawq -q

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autotrain-advanced 0.6.51 requires datasets[vision]~=2.14.0, but you have datasets 2.15.0 which is incompatible.
autotrain-advanced 0.6.51 requires evaluate==0.3.0, but you have evaluate 0.4.1 which is incompatible.
autotrain-advanced 0.6.51 requires fastapi==0.104.1, but you have fastapi 0.105.0 which is incompatible.
autotrain-advanced 0.6.51 requires packaging==23.1, but you have packaging 23.2 which is incompatible.
autotrain-advanced 0.6.51 requires protobuf==4.23.4, but you have protobuf 4.25.1 which is incompatible.
autotrain-advanced 0.6.51 requires pydantic==2.4.2, but you have pydantic 1.10.13 which is incompatible.
autotrain-advanced 0.6.51 requires tqdm==4.65.0, but you have tqdm 4.66.1 which is incompatible.


In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the upload cells
# further down call the Hub API and need these credentials.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# NOTE: the class names are `AutoAWQForCausalLM` and `AutoTokenizer`; the
# previous spellings (`AutoAWQCausalLM`, `Autotokenizer`) raise ImportError.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch

# Hub id of the full-precision model to quantize.
model_path = 'PY007/TinyLlama-1.1B-Chat-v0.3'

# Local output directory: the model's base name with an "-AWQ" suffix.
quant_name = model_path.split('/')[-1] + "-AWQ"

# Hub repo id the quantized model is meant to be published under.
quant_path = 'Trelis/' + quant_name
# Quantization settings handed to AutoAWQ: zero-point enabled, group size 128,
# 4-bit weights.
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}

# Load the model and its tokenizer.
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize the weights in place using the settings above.
model.quantize(tokenizer, quant_config=quant_config)

# Save the quantized weights (safetensors format) and the tokenizer side by
# side in the `quant_name` directory so the upload cells can find them.
model.save_quantized(quant_name, safetensors=True, shard_size='10GB')
tokenizer.save_pretrained(quant_name)

Upload model files

In [None]:
from huggingface_hub import HfApi

# Client object used to talk to the Hugging Face Hub.
api = HfApi()

# The weights file keeps the same name in the repository as on disk.
path_in_repo = 'model.safetensors'

# Location of the weights inside the local quantized-model directory.
local_file_path = f"./{quant_name}/{path_in_repo}"

# Destination repository, derived from the quantized model's name.
repo_id = f"Trelis/{quant_name}"

# Push the single weights file to the model repository.
api.upload_file(
    repo_id=repo_id,
    repo_type="model",
    path_in_repo=path_in_repo,
    path_or_fileobj=local_file_path,
)

Upload non model files

In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Destination repository, derived from the quantized model's name.
repo_id = 'Trelis/' + quant_name

# Config and tokenizer files that accompany the weights. The extension is
# `.json` (the earlier `.jason` / `genreation_config` spellings pointed at
# files that do not exist, so every upload failed).
local_file_paths = [
    "./" + quant_name + "/config.json",
    "./" + quant_name + "/generation_config.json",
    "./" + quant_name + "/quant_config.json",
    "./" + quant_name + "/special_tokens_map.json",
    "./" + quant_name + "/tokenizer_config.json",
    "./" + quant_name + "/tokenizer.json",
]


for local_file_path in local_file_paths:
    # Upload each file to the repository root under its own base name.
    file_name = local_file_path.split("/")[-1]
    path_in_repo = file_name

    api.upload_file(
        path_or_fileobj=local_file_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="model",
    )

    print(f"Uploaded {file_name} to {repo_id}")

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Hub id of the already-quantized AWQ checkpoint to run inference with.
model_name_or_path = "Trelis/Llama-2-13b-chat-longlora-32-sft-AWQ"

# Fixed: the class is `AutoAWQForCausalLM` (as imported above) and the method
# is `from_quantized`; the old `AutoAWQCausalLM.from_quatized` was a NameError.
model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)

# Load the tokenizer that belongs to this checkpoint so the inference cell does
# not silently reuse a tokenizer left over from a different model.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)

In [None]:
# Question to send to the model.
prompt = "What planets are in our solar system?"

# Wrap the question in <|im_start|>/<|im_end|> chat markers and open the
# assistant turn so generation continues from there.
formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

# Tokenize to PyTorch tensors, then move the input ids to the GPU.
encoded = tokenizer(formatted_prompt, return_tensors='pt')
tokens = encoded.input_ids.cuda()

# Greedy decoding (sampling disabled), capped at 512 newly generated tokens.
generation_output = model.generate(tokens, do_sample=False, max_new_tokens=512)

# Turn the first (only) generated sequence back into text and show it.
decoded_text = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(decoded_text)

GGUF

In [None]:
# Log in to the Hugging Face Hub again for the GGUF section, so this part of
# the notebook can be run on its own.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Placeholder cache location; a later cell reassigns `cache_dir` to a folder
# on the mounted drive.
cache_dir = ''

In [None]:
# `drive` lives in the `google.colab` package; a bare `import drive` fails with
# ModuleNotFoundError.
from google.colab import drive

# Mount Google Drive so the cache directory created below is available.
drive.mount('/content/drive')

In [None]:
import os 
# Cache directory under the mounted drive path; it is passed as `cache_dir`
# when the model is loaded further down.
cache_dir = '/content/drive/My Drive/huggingface_cache'
# Create the directory if needed; exist_ok makes re-running this cell harmless.
os.makedirs(cache_dir, exist_ok= True)

In [None]:
!pip install transformers
!pip install accelerate
!pip install einops
!pip install numpy
!pip install sentencepiece

In [None]:
import os 
import torch
import torch.nn as nn
from transformers import AutoTokenizer,AutoConfig,AutoModelForCausalLM

In [None]:
# Hub id of the full-precision checkpoint to convert to GGUF.
model_name = 'Py007/TinyLlama-1.1B-intermediate-step-480k-1T'

# Load with the transformers auto class imported above. The previous
# `AutoAWQCausalLM` name is not defined anywhere, and the keyword arguments
# used here (torch_dtype / offload_folder / cache_dir) are the transformers
# `from_pretrained` ones.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='cpu',            # keep the weights on the CPU
    offload_folder='offload',
    cache_dir=cache_dir,         # reuse the cache directory configured earlier
)

In [None]:
# Fetch the llama.cpp sources; the cells below build it and run its
# convert.py and quantize tools.
!git clone https://github.com/ggerganov/llama.cpp.git

In [None]:
# Work from inside the cloned repository; the relative paths used in later
# cells (./models/, ./quantize, convert.py) are resolved from here.
# NOTE(review): re-running this cell changes directory again — run it once.
%cd llama.cpp

In [None]:
# Write the loaded model into ./models/ (relative to the current directory),
# which is the directory convert.py is pointed at later.
model.save_pretrained('./models/')

In [None]:
import os
import requests

def download_file_from_huggingface(model_name, filename, save_path):
    """Download one file from a Hugging Face model repository.

    Args:
        model_name: Repository id, e.g. ``"org/model"``.
        filename: Name of the file on the repository's ``main`` revision.
        save_path: Local directory the file is written into.

    Returns:
        True if the file was downloaded and written, False if the server
        answered with anything other than HTTP 200.
    """
    url = f"https://huggingface.co/{model_name}/resolve/main/{filename}"
    r = requests.get(url)
    if r.status_code != 200:
        print(f"Failed to download {filename}. HTTP Status Code: {r.status_code}")
        return False
    # Make sure the target directory exists before writing into it.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    with open(os.path.join(save_path, filename), 'wb') as f:
        f.write(r.content)
    return True


def main(save_path='./models/'):
    """Download the tokenizer files for ``model_name`` into ``save_path``.

    Args:
        save_path: Directory to write the files into. Defaults to
            ``./models/``, the same directory the model weights are saved to
            and that convert.py is run against. (Previously ``save_path`` was
            never defined, so the function raised NameError.)

    Each file is attempted independently; a failed download is reported and
    the loop moves on to the next file.
    """
    files_to_download = [
        "tokenizer_config.json",
        "tokenizer.model",
        "tokenizer.json",
        "special_tokens_map.json",
        "added_tokens.json"
    ]

    for filename in files_to_download:
        success = download_file_from_huggingface(model_name, filename, save_path)
        if success:
            print(f"Successfully downloaded {filename}")
        else:
            print(f"Failed to download {filename}")

if __name__ == "__main__":
    main()

In [None]:
# Refresh the package index, then install the compiler toolchain, cmake and
# the OpenBLAS / Eigen development packages used for the build in the next cell.
!apt update -y
!apt install build-essential git cmake libopenblas-dev libeigen3-dev -y

In [None]:
!make LLAMA_OPENBLAS =1 

In [None]:
# Run llama.cpp's convert.py on the models/ directory. The quantize cell below
# reads models/ggml-model-f16.gguf — presumably the file produced here.
!python convert.py models/

In [None]:
import subprocess

# Use the last path component as the bare model name. Indexing with [-1]
# (instead of [1]) also works when `model_name` has no "org/" prefix.
parts = model_name.split('/')
model_name_pure = parts[-1]

# Quantization preset passed to the llama.cpp quantize tool, and the output
# file it will write.
quant_type = "Q4_K"
quantized_model = f'models/{model_name_pure}.{quant_type}.gguf'
print(f'Preparing {quantized_model} with {quant_type} quantization.')

# Argument list (no shell): input f16 GGUF, output path, quantization type.
command = ["./quantize", "models/ggml-model-f16.gguf", quantized_model, quant_type]

# check=True raises CalledProcessError if quantize exits with a non-zero
# status, instead of letting a failed quantization pass silently.
subprocess.run(command, check=True)

After the quantization step above completes, push the quantized GGUF model to the Hugging Face Hub.