In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import torch
import huggingface_hub

# Report whether PyTorch can see a CUDA device before attempting the quantized load.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

In [None]:
# Source checkpoint on the Hugging Face Hub, and the repo id that the
# quantized copy is pushed to later in this notebook.
model_path = "meta-llama/Llama-2-7b-chat-hf"
target_model_path = "autora-doc/Llama-2-7b-chat-hf-nf4"

print(model_path)

In [None]:
# Quantization settings used when loading the model: 4-bit weights in the
# NF4 format with double quantization, computing in bfloat16. The aim is
# faster inference on smaller GPUs.
conf = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the actual matmuls
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Fetch the tokenizer, then load the model with the quantization settings in
# `conf` applied at load time. device_map="auto" leaves weight placement to
# the library.
# NOTE(review): meta-llama checkpoints appear to be gated on the Hub. On a
# fresh machine with no cached token this cell would fail before the login
# cell below runs -- confirm, and consider authenticating first.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=conf,
)

In [None]:
# Interactive Hugging Face Hub login; works inside a Jupyter notebook or Colab.
# Other ways to authenticate are described at
# https://huggingface.co/docs/huggingface_hub/main/en/quick-start#authentication
# NOTE(review): newer huggingface_hub releases may mark `write_permission` as
# deprecated -- confirm against the installed version.
huggingface_hub.notebook_login(write_permission=True, new_session=False)

In [None]:
# Publish the tokenizer first, then the quantized model, to the Hub repo
# named by target_model_path. Presumably this needs the write-enabled login
# performed in the previous cell -- verify the token has access to that repo.
tokenizer.push_to_hub(target_model_path)
model.push_to_hub(target_model_path)

In [None]:
# Alternatively, upload to Azure Blob Storage (currently not used).
# The azureml import is kept local to this cell, presumably so the rest of
# the notebook runs without that package installed.
from azureml.core import Workspace

# Write the tokenizer and model to a local directory first.
local_dir = f"./models/{model_path}"
tokenizer.save_pretrained(local_dir)
model.save_pretrained(local_dir)

# If all goes well, upload that directory to the workspace's default datastore.
workspace = Workspace.from_config()
ds = workspace.get_default_datastore()
ds.upload(
    local_dir,
    f"./base_models/{target_model_path}",
    show_progress=True,
    overwrite=True,
)