In [3]:
import pandas as pd
import json
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load your dataset (adjust the path and file name).
# The original data was prepared and made available by Databricks via Databricks Demos
# and the llm/databricks-documentation repository.
# It is a labeled training set (questions + answers) with state-of-the-art technical
# answers from the Databricks support team.
# The dataset is published in .parquet format; it was downloaded as .csv and is used to
# fine-tune our model.
dataset_path = '/content/03_Offline_Evaluation.csv'  # or .xlsx, .parquet, etc.
df = pd.read_csv(dataset_path)  # Use pd.read_excel() for Excel files

# Verify columns: show what was loaded, then fail fast with a clear message if the
# question/answer columns are absent instead of hitting a KeyError further down.
print("Dataset columns:", df.columns.tolist())
required_columns = {'question', 'answer'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{dataset_path} is missing required column(s): {sorted(missing_columns)}"
    )

# Lead-in only: the preview itself is rendered by the df.head(2) cell that follows.
print("First few rows:")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset columns: ['id', 'question', 'answer']
First few rows:


In [4]:
# Preview the first two rows of the loaded dataset
df.head(n=2)

Unnamed: 0,id,question,answer
0,1131,"""How can I use the `stack` function to generat...",You can use the `stack` function in Databricks...
1,1231,How can I determine if the data input pipeline...,To determine if the data input pipeline or mod...


Create training and validation datasets

In [4]:
# Convert to instruction format
def _write_json(records, path):
    """Write `records` to `path` as indented (indent=2) JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2)


# Number of leading entries copied into the small test file.
TEST_SUBSET_SIZE = 100

# Skip rows with a missing question or answer: pandas represents them as NaN, which
# json.dump would write as the bare token NaN (not valid JSON), and such rows carry
# no usable training signal anyway.
qa_pairs = df.dropna(subset=['question', 'answer'])
skipped_rows = len(df) - len(qa_pairs)
if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) with a missing question or answer")

# Build the records directly from the two columns (avoids the per-row Series that
# iterrows() constructs).
formatted_data = [
    {
        "instruction": question,
        "input": "",  # Empty since we're doing Q&A
        "output": answer,
        "context": "Databricks technical documentation"  # You can modify this per question if needed
    }
    for question, answer in zip(qa_pairs['question'], qa_pairs['answer'])
]

# Save as JSON file
output_path = '/content/drive/MyDrive/databricks_docs_instructions.json'
_write_json(formatted_data, output_path)

print(f"Dataset successfully converted and saved to {output_path}")
print(f"Total entries converted: {len(formatted_data)}")

# Optional: Create a smaller subset for testing.
# NOTE: this is simply the first TEST_SUBSET_SIZE entries of the full set, so it
# overlaps with it; it is suited to quick smoke tests, not to held-out evaluation.
if len(formatted_data) > TEST_SUBSET_SIZE:
    test_subset = formatted_data[:TEST_SUBSET_SIZE]
    test_path = '/content/drive/MyDrive/databricks_docs_instructions_test.json'
    _write_json(test_subset, test_path)
    print(f"Test subset saved to {test_path}")

Dataset successfully converted and saved to /content/drive/MyDrive/databricks_docs_instructions.json
Total entries converted: 5169
Test subset saved to /content/drive/MyDrive/databricks_docs_instructions_test.json


Formatted dataset for finetuning

In [20]:
# Re-load the saved instruction file and pretty-print its first entry
with open(output_path, 'r') as json_file:
    data = json.load(json_file)

first_entry = data[0]
print(json.dumps(first_entry, indent=2))

{
  "instruction": "\"How can I use the `stack` function to generate a set of rows with specified values in Databricks SQL Databricks Runtime?\"",
  "input": "",
  "output": "You can use the `stack` function in Databricks SQL Databricks Runtime to generate a set of rows with specified values by providing the number of rows and the values as arguments. The function separates the values into the specified number of rows. Here's an example:\n\n```sql\nSELECT 'hello', s.*, 'world' FROM stack(2, 1, 2, 3) AS s(first, second);\n```\n\nIn this example, the `stack` function generates two rows with the values 1, 2 and 3. The first column is named 'first' and the second column is named 'second'. The result of this query would be:\n\n```\nhello | first | second | world\n------+-------+--------+-------\nhello |     1 |      2 | world\nhello |     3 |   NULL | world\n```\n\nRemember that in Databricks SQL Databricks Runtime 12.2 LTS and above, you should invoke the `stack` function as a table_refere

In [19]:
# Re-load the saved instruction file and print the raw "output" text of its first
# entry, so the embedded newlines and code fences are shown as they will be read.
with open(output_path, 'r') as json_file:
    data = json.load(json_file)

has_entries = isinstance(data, list) and len(data) > 0
if not has_entries:
    print("The JSON file is empty or not a list.")
else:
    output_value = data[0].get('output', 'Key "output" not found')
    print(output_value)

You can use the `stack` function in Databricks SQL Databricks Runtime to generate a set of rows with specified values by providing the number of rows and the values as arguments. The function separates the values into the specified number of rows. Here's an example:

```sql
SELECT 'hello', s.*, 'world' FROM stack(2, 1, 2, 3) AS s(first, second);
```

In this example, the `stack` function generates two rows with the values 1, 2 and 3. The first column is named 'first' and the second column is named 'second'. The result of this query would be:

```
hello | first | second | world
------+-------+--------+-------
hello |     1 |      2 | world
hello |     3 |   NULL | world
```

Remember that in Databricks SQL Databricks Runtime 12.2 LTS and above, you should invoke the `stack` function as a table_reference, as shown in the example.
