In [1]:
import json
from pathlib import Path

# Optional run configuration. `params` stays an empty dict unless
# /content/params.json exists, in which case it is replaced by the parsed
# JSON content of that file.
params_path = Path("/content/params.json")
params = {}
if params_path.is_file():
    params = json.loads(params_path.read_text(encoding="UTF-8"))

# Bare expression so the notebook displays the effective parameters.
params

{'hf_dataset': 'weaviate/WithoutRetrieval-SchemaSplit-Test-80',
 'push_to_hub': 'substratusai/wgql-WithRetrieval-SchemaSplit-Train-80-3epochs'}

In [2]:
from datasets import load_dataset

# Pick the data source: the dataset named by params["hf_dataset"] when that
# value is set (truthy), otherwise the files matching /content/data/*.json*
# loaded with the "json" builder.
hf_dataset = params.get("hf_dataset")
dataset = (
    load_dataset(hf_dataset)
    if hf_dataset
    else load_dataset("json", data_files="/content/data/*.json*")
)

# Bare expression so the notebook displays the loaded dataset object.
dataset

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath'],
        num_rows: 825
    })
})

In [3]:
# Fallback prompt template, used when params supplies no usable
# "prompt_template". The {nlcommand}, {schema} and {apiRef} placeholders are
# filled from a single dataset record via str.format_map. The text ends on an
# opening ```graphql fence, presumably so the model continues with the query.
default_prompt = """
## Instruction
Your task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.

Only use the API reference to understand the syntax of the request.

## Natural Language Query
{nlcommand}

## Schema
{schema}

## API reference
{apiRef}

## Answer
```graphql
"""

# Use `or` rather than a .get() default: a "prompt_template" key that is
# present but null/empty would otherwise yield None (AttributeError on
# format_map) or an empty prompt instead of falling back to the default.
prompt = params.get("prompt_template") or default_prompt

# Preview the template rendered with the first training record.
print(prompt.format_map(dataset["train"][0]))


## Instruction
Your task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.

Only use the API reference to understand the syntax of the request.

## Natural Language Query
```text
Show me the event name, description, year, significant impact, and the countries involved with their population for the top 10 historical events.
```

## Schema
{
"classes": [
{
"class": "HistoricalEvent",
"description": "Information about historical events",
"vectorIndexType": "hnsw",
"vectorizer": "text2vec-transformers",
"properties": [
{
"name": "eventName",
"dataType": ["text"],
"description": "Name of the historical event"
},
{
"name": "description",
"dataType": ["text"],
"description": "Detailed description of the event"
},
{
"name": "year",
"dataType": ["int"],
"description": "Year the event occurred"
},
{
"name": "hadSignificantImpact",
"dataType": ["boolean"],
"description": "Wheth

In [18]:
import transformers
import torch
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory the tokenizer and model are loaded from.
model_path = "/content/model/"

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
)

# NOTE(review): trust_remote_code=True presumably permits running custom
# modelling code shipped with the checkpoint -- confirm the checkpoint in
# model_path is trusted. load_in_8bit / device_map="auto" are passed through
# unchanged to from_pretrained.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,
    device_map="auto",
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
# IPython shell escape: run nvidia-smi and show its report in the cell output.
! nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sat Oct 14 06:52:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA L4           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    32W /  72W |  19552MiB / 23034MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [23]:
# Seed torch's random number generators before sampling. generate() is called
# with do_sample=True, so without a fixed seed every run of this cell draws a
# different completion and the recorded output cannot be reproduced.
# `torch` is imported in the model-loading cell.
torch.manual_seed(0)

device = "cuda"

# Render the prompt for the first training record and tokenize it as a
# one-element batch of PyTorch tensors moved to `device`.
model_inputs = tokenizer(
    [prompt.format_map(dataset["train"][0])],
    return_tensors="pt",
).to(device)

# Sample up to 100 new tokens with top-k / top-p filtering at temperature 0.3.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.3,
)

# Decode the first (only) sequence; the printed text contains the prompt
# followed by the generated continuation, special tokens included.
print(tokenizer.batch_decode(generated_ids)[0])

<s> 
## Instruction
Your task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.

Only use the API reference to understand the syntax of the request.

## Natural Language Query
```text
Show me the name, description, year introduced, and whether it is a string of 10 instruments. Also, show me the name, genre, and years active of the musicians who play those instruments.
```

## Schema
{
"classes": [
{
"class": "Instrument",
"description": "A musical instrument.",
"vectorIndexType": "hnsw",
"vectorizer": "text2vec-transformers",
"properties": [
{
"name": "name",
"dataType": ["text"],
"description": "Name of the instrument."
},
{
"name": "description",
"dataType": ["text"],
"description": "Description of the instrument."
},
{
"name": "yearIntroduced",
"dataType": ["int"],
"description": "Year the instrument was introduced."
},
{
"name": "isString",
"dataType": ["boolean"]