In [1]:
import json
from pathlib import Path

params = {}
params_path = Path("/content/params.json")
if params_path.is_file():
    with params_path.open("r", encoding="UTF-8") as params_file:
        params = json.load(params_file)

params

{'hf_dataset': 'weaviate/WithoutRetrieval-SchemaSplit-Test-80',
 'prompt_template': '## Instruction\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\n\nOnly use the API reference to understand the syntax of the request.\n\n## Natural Language Query\n{nlcommand}\n\n## Schema\n{schema}\n\n## API reference\n{apiRef}\n\n## Answer\n```graphql\n',
 'push_to_hub': 'substratusai/wgql-WithRetrieval-SchemaSplit-Train-80'}

In [2]:
from datasets import load_dataset

hf_dataset = params.get("hf_dataset")
if hf_dataset:
    dataset = load_dataset(hf_dataset)
else:
    dataset = load_dataset("json", data_files="/content/data/*.json*")

dataset

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath'],
        num_rows: 825
    })
})

In [3]:
default_prompt = """
## Instruction
Your task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.

Only use the API reference to understand the syntax of the request.

## Natural Language Query
{nlcommand}

## Schema
{schema}

## API reference
{apiRef}

## Answer
```graphql
"""

prompt = params.get("prompt_template", default_prompt)
print(prompt.format_map(dataset["train"][0]))

## Instruction
Your task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.

Only use the API reference to understand the syntax of the request.

## Natural Language Query
```text
Show me the event name, description, year, significant impact, and the countries involved with their population for the top 10 historical events.
```

## Schema
{
"classes": [
{
"class": "HistoricalEvent",
"description": "Information about historical events",
"vectorIndexType": "hnsw",
"vectorizer": "text2vec-transformers",
"properties": [
{
"name": "eventName",
"dataType": ["text"],
"description": "Name of the historical event"
},
{
"name": "description",
"dataType": ["text"],
"description": "Detailed description of the event"
},
{
"name": "year",
"dataType": ["int"],
"description": "Year the event occurred"
},
{
"name": "hadSignificantImpact",
"dataType": ["boolean"],
"description": "Whethe

In [4]:
import transformers
import torch
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "/content/model/"

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map="auto", trust_remote_code=True,
            torch_dtype=torch.bfloat16, 
            use_flash_attention_2=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
! nvidia-smi

Wed Oct 18 02:24:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA L4           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    29W /  72W |  13676MiB / 23034MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
device = "cuda"
model.generation_config

GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

In [7]:
stop_ids = torch.LongTensor(tokenizer.encode("```", add_special_tokens=False))
## Note the stop_ids aren't correct, for some reason there are multiple possible token IDs for ```
## so instead we're using tensor([13940, 28832], device='cuda:0') as the stop_ids, because that's
## what the model normally generates
print(stop_ids)
print(tokenizer.decode([8789]) == "```")
print(tokenizer.decode([13940, 28832]) == "```")
print(tokenizer.decode(tokenizer.encode("```", add_special_tokens=False)))

tensor([7521])
False
False
```


In [8]:
from transformers import StoppingCriteria, StoppingCriteriaList

class BacktickStoppingCriteria(StoppingCriteria):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        if self.tokenizer.decode(input_ids[0][-2:]) == "```" or self.tokenizer.decode(input_ids[0][-1]) == "```":
            return True
        return False



stopping_criteria = StoppingCriteriaList([BacktickStoppingCriteria(tokenizer)])

In [15]:
model.config.bos_token_id = tokenizer.bos_token_id = 1
model.config.eos_token_id = tokenizer.eos_token_id = 2
model.config.pad_token_id = tokenizer.pad_token_id = 0

In [16]:
%%time
import torch


device = "cuda"
model_inputs = tokenizer([prompt.format_map(dataset["train"][0])],
                         return_tensors="pt").to(device)

generated_ids = model.generate(**model_inputs,
                               max_new_tokens=300,
                               stopping_criteria=stopping_criteria)

print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))


## Instruction
Your task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.

Only use the API reference to understand the syntax of the request.

## Natural Language Query
```text
Show me the event name, description, year, significant impact, and the countries involved with their population for the top 10 historical events.
```

## Schema
{
"classes": [
{
"class": "HistoricalEvent",
"description": "Information about historical events",
"vectorIndexType": "hnsw",
"vectorizer": "text2vec-transformers",
"properties": [
{
"name": "eventName",
"dataType": ["text"],
"description": "Name of the historical event"
},
{
"name": "description",
"dataType": ["text"],
"description": "Detailed description of the event"
},
{
"name": "year",
"dataType": ["int"],
"description": "Year the event occurred"
},
{
"name": "hadSignificantImpact",
"dataType": ["boolean"],
"description": "Whethe

In [11]:
print(model_inputs["input_ids"].shape)
input_length = model_inputs["input_ids"].shape[1]
print(tokenizer.decode(generated_ids[0][input_length:], skip_special_tokens=True).strip("```"))


torch.Size([1, 534])
aut aut Aut Aut AutautAutJeAutčka aut autJePhoneautAutAut Aut AutAututt AutautJeommAut AutAut Autaut AutAutЉAutPhonewickAut  Aut AututtJeJeAut Autčka Autński AutAut autPhone autPhonečka Aut AutAut autAut Aut Aut Aututt AutJe Aut Aut Autaut AutAut aut Aut AutAutAutAutAutPhoneAutAut AutAutčkaAut Aut aut aut Aut autčkaAutJePhone Aut Autaut aut Aut aut AutAut AutAutJe Aut Aut AutAut autAut autAut Aut  AutAutPhone Aut Aut Aut Aut autJePhone aut Aut AutJeJePhoneAutAG Aut AutAutAut aut AutAut AutAutAut AutAut AutAutAutpngAut AutPhoneAut Aut aut AutAut Aut autAut AutAutAut Aut AutAut AutJe Aut autAutAutJe Aut AutPhoneAutAutAutAutautčka autautAut Aut Aut AutAutPhone Aut AutautAut AutAG AutAut AutAut Aut AutAut autAutAutAutaut Aut Aut Aut Aut AutAutAut Aut Aut autAut Aut AutAutAut AutAutAut Aut Aut AutAutAutAut Aut Aut Aut Aut AutAutAut Autaut Aut Aut Autaut autAutAut AutAut Autčka AutAut Aut Aut AutPhoneAut Aut aut Aut Aut Aut Aut autAut Aut Aut autAutAut Aut Aut AutautAutP

In [None]:
import json
dataset_size = len(dataset["train"])
output_path = "/content/artifacts/test-output.json"
entries = []
print(f"Running inference for {dataset_size} entries in dataset")
for i in range(dataset_size):
    print(f"entry {i+1} of {dataset_size}")
    entry = dataset["train"][i]
    model_inputs = tokenizer([prompt.format_map(entry)],
                         return_tensors="pt").to(device)



    generated_ids = model.generate(**model_inputs,
                               max_new_tokens=300,
                               stopping_criteria=stopping_criteria)
    input_length = model_inputs["input_ids"].shape[1]
    output = tokenizer.decode(generated_ids[0][input_length:], skip_special_tokens=True)
    entry["modelOutput"] = output.strip("```")
    entries.append(entry)

    with open(output_path, 'a') as file:
        json.dump(entry, file)
        file.write("\n")

Store the test dataset with model output in the original HuggingFace Model repo

In [None]:
from huggingface_hub import HfApi
repo_id = params.get("push_to_hub")
if repo_id:
    hf_api = HfApi()
    hf_api.upload_file(
            path_or_fileobj=Path(output_path),
            path_in_repo=Path(output_path).name,
            repo_id=repo_id,
    )
    logs_path = Path("/content/artifacts/eval.ipynb")
    if logs_path.exists():
        hf_api.upload_file(
            path_or_fileobj=logs_path,
            path_in_repo=logs_path.name,
            repo_id=repo_id,
        )

## Execute the model output on a live Weaviate cluster

In [19]:
valid = []
for i in range(len(dataset["train"])):
    try:
        json.loads(dataset["train"][i]["schema"])
        valid.append(i)
    except:
        pass

print("Valid:", valid)

Valid: []


In [None]:
# import weaviate
# from weaviate.embedded import EmbeddedOptions

# client = weaviate.Client(
#   embedded_options=EmbeddedOptions()
# )