In [1]:
import torch, gc
from huggingface_hub import login
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, AutoConfig
from accelerate import infer_auto_device_map, init_empty_weights

from llama_index.llms import HuggingFaceLLM
from llama_index import SQLDatabase,ServiceContext
from sqlalchemy import create_engine, Table, inspect, MetaData
from llama_index.indices.struct_store.sql_query import NLSQLTableQueryEngine
from IPython.display import Markdown, display

login("")

hf_model="mistralai/Mixtral-8x7B-Instruct-v0.1"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/newuser/.cache/huggingface/token
Login successful


In [2]:
config = AutoConfig.from_pretrained(hf_model)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(model, no_split_module_classes=["OPTDecoderLayer"], dtype="float16")
device_map

for key, value in device_map.items():
    if key in ['model.embed_tokens', 'model.layers.0', 'model.layers.1', 'model.layers.2', 'model.layers.3']:
        continue  # Skip these keys and keep their values as 0
    elif key.startswith('model.layers') and key.split('.')[-1].isdigit() and int(key.split('.')[-1]) in range(2, 4):
        device_map[key] = 'cpu'  # Update values for layers 4-10 to 'cpu'
    else:
        device_map[key] = 'disk'  # Update all other values to 'disk'

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained(hf_model, pad=True, max_length=4096, truncation=False)

# device_map = {
#     "transformer.word_embeddings": 0,
#     "transformer.word_embeddings_layernorm": 0,
#     "lm_head": "cpu",
#     "transformer.h": 0,
#     "transformer.ln_f": 0,
# }

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # load_in_8bit=True,
    # llm_int8_threshold=6.0,
    # llm_int8_skip_modules=[],
    # llm_int8_enable_fp32_cpu_offload=True,
    # llm_int8_has_fp16_weight=False,
    kwargs={"device_map": device_map},
)

model = AutoModelForCausalLM.from_pretrained(
    hf_model,
    max_memory = { 0: "6GiB",
    "cpu": "30GiB" },
    quantization_config = bnb_config,
    offload_folder = "offload",
    offload_state_dict = True,
    max_length = 2048,
    use_safetensors = True,
)
# locally_run = HuggingFaceLLM(
#     model_name=hf_model,
#     device_map="auto",
#     max_new_tokens=512,
#     tokenizer=tokenizer,
#     model_kwargs={
#         "max_length": 2048,
#         "max_memory": {0: "10GiB", "cpu": "30GiB"},
#         "use_safetensors": True,
#         "quantization_config": bnb_config,
#         "offload_folder": "offload",
#         "offload_state_dict": True,
#     }
# )

locally_run = HuggingFaceLLM(
    model
)

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB. GPU 0 has a total capacity of 11.76 GiB of which 12.44 MiB is free. Process 541066 has 11.74 GiB memory in use. Of the allocated memory 11.57 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from llama_index import set_global_service_context
engine = create_engine("mysql+pymysql://sa:.@192.168.1.100:3306/employees?charset=utf8mb4")

service_context = ServiceContext.from_defaults(llm=locally_run,embed_model="local")
set_global_service_context(service_context)
sql_database = SQLDatabase(engine)

### NLSQLTableQueryEngine
Good for simple queries, simple datasets

In [None]:
query_engine = NLSQLTableQueryEngine(sql_database, service_context=service_context)
response = query_engine.query("Who are the department managers?")

In [None]:
response_template = """
## Answer
```
{response}
```
## Generated SQL Query
```
{sql}
```
"""

display(Markdown(response_template.format(
        response=str(response),
        sql=response.metadata["sql_query"],
    )))

In [None]:
query_engine = None
response = None

### SQLTableRetrieverQueryEngine
Better for larger, complex datasets

In [None]:
from llama_index.indices.struct_store import SQLTableRetrieverQueryEngine
from llama_index.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)
from llama_index import VectorStoreIndex

inspector = inspect(engine)
table_names = inspector.get_table_names()

table_node_mapping = SQLTableNodeMapping(sql_database)

table_schema_objs = []
for table_name in table_names:
    table_schema_objs.append(SQLTableSchema(table_name=table_name))

obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex
)

In [None]:
query_engine = SQLTableRetrieverQueryEngine(
    sql_database,
    obj_index.as_retriever(similarity_top_k=1),
)

In [None]:
response = query_engine.query("On average, how many employees does each dept manager manage?")

response.metadata

In [None]:
response_template = """
## Answer
```
{response}
```
## Generated SQL Query
```
{sql}
```
"""

display(Markdown(response_template.format(
        response=str(response),
        sql=response.metadata["sql_query"],
    )))

In [None]:
locally_run = None
sql_database = None
query_engine = None
response = None
table_node_mapping = None
obj_index = None
table_schema_objs = None

torch.cuda.empty_cache()
gc.collect()