<a href="https://colab.research.google.com/github/tarunpahade/100xdevs/blob/main/finetuning_json_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [2]:
import json
## Not Run this ?
def create_object_summaries(data):
    """First pass: Create plain English summaries of each object"""
    summaries = []

    if 'countries' in data:
        for country in data['countries']:
            summary = f"Country object with ID {country.get('id')}: Name is {country.get('name')}"
            summaries.append(summary)

    if 'regions' in data:
        for region in data['regions']:
            summary = f"Region object with ID {region.get('id')}: Name is {region.get('name')}, belongs to country {region.get('countryId')}"
            summaries.append(summary)

    if 'cities' in data:
        for city in data['cities']:
            summary = f"City object with ID {city.get('id')}: Name is {city.get('name')}, belongs to region {city.get('regionId')}"
            summaries.append(summary)

    return summaries


def save_to_jsonl(qa_pairs, output_file):
    """Save the QA pairs to a JSONL file"""
    with open(output_file, 'w', encoding='utf-8') as f:
        for pair in qa_pairs:
            json.dump(pair, f, ensure_ascii=False)
            f.write('\n')

def main():
    # Load your data
    input_file = 'building_codes_db.json'
    output_file = 'training_data.jsonl'

    try:
        # Load the original data
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Step 1: Create summaries of each object
        print("Creating object summaries...")
        summaries = create_object_summaries(data)
        print(f"Created {len(summaries)} summaries")

        # Step 2: Generate QA pairs using local model
        print("Generating QA pairs using local model...")
        qa_pairs = generate_qa_pairs(summaries)
        print(f"Generated {len(qa_pairs)} QA pairs")

        # Step 3: Save to JSONL
        print("Saving to JSONL file...")
        save_to_jsonl(qa_pairs, output_file)
        print(f"Data saved to {output_file}")

        # Print a sample QA pair
        if qa_pairs:
            print("\nSample QA pair:")
            print(json.dumps(qa_pairs[0], indent=2))

    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

An error occurred: [Errno 2] No such file or directory: 'building_codes_db.json'


In [2]:
!nvdia.smi

/bin/bash: line 1: nvdia.smi: command not found


In [7]:
def analyze_json_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Check for basic structural issues
    brace_count = content.count('{') - content.count('}')
    bracket_count = content.count('[') - content.count(']')
    quote_count = content.count('"')

    print(f"Unmatched braces: {brace_count}")
    print(f"Unmatched brackets: {bracket_count}")
    print(f"Quote count (should be even): {quote_count}")

    # Try to read file line by line to find where it breaks
    lines = content.splitlines()
    accumulated = ""
    for i, line in enumerate(lines, 1):
        accumulated += line
        try:
            json.loads(accumulated)
        except json.JSONDecodeError as e:
            if "Expecting value" in str(e):
                return i, line
            continue

analyze_json_file('./building_codes.json')

Unmatched braces: 0
Unmatched brackets: 0
Quote count (should be even): 920351


(2, '    "countries": [')

In [8]:
def clean_and_validate_json(filename):
    # Read the file as binary first to check for encoding issues
    with open(filename, 'rb') as file:
        content = file.read()

    # Try to decode with different encodings and remove BOM if present
    try:
        # Remove BOM if present
        if content.startswith(b'\xef\xbb\xbf'):
            content = content[3:]
        text = content.decode('utf-8')
    except UnicodeDecodeError:
        try:
            text = content.decode('utf-8-sig')
        except UnicodeDecodeError:
            text = content.decode('latin-1')

    # Basic cleanup
    text = text.replace('\r', '')  # Remove carriage returns
    text = text.replace('\u0000', '')  # Remove null bytes

    # Ensure the JSON has proper structure
    if not text.strip().startswith('{'):
        text = '{' + text
    if not text.strip().endswith('}'):
        text = text + '}'

    # Write cleaned version to new file
    clean_filename = filename.replace('.json', '_cleaned.json')
    with open(clean_filename, 'w', encoding='utf-8') as file:
        file.write(text)

    return clean_filename

# Clean and validate the file
cleaned_file = clean_and_validate_json('./building_codes.json')

# Try to load the cleaned file
with open(cleaned_file, 'r', encoding='utf-8') as file:
    try:
        data = json.load(file)
        print("JSON successfully cleaned and loaded!")
    except json.JSONDecodeError as e:
        print(f"Error still exists: {str(e)}")
        # If error persists, let's look at the content around the error
        with open(cleaned_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            error_line = e.lineno
            context_start = max(0, error_line - 5)
            context_end = min(len(lines), error_line + 5)
            print("\nContext around error:")
            for i in range(context_start, context_end):
                print(f"Line {i+1}: {lines[i].rstrip()}")

JSON successfully cleaned and loaded!


In [16]:
import json
from typing import Dict, List, Any

# Assuming this is your corrected JSON data
with open('./building_codes.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


def process_codes_data(data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Process main building codes with their chapters, sections, and subsections."""
    codes_data = []

    if 'codes' in data and isinstance(data['codes'], list):
        for code in data['codes']:
            # Main code information
            codes_data.append({
                "input_text":"",
                'instructions': f"What is the code with ID {code.get('id')}?",
                'output_text': f"The code with ID {code.get('id')} is the {code.get('code')}, "
                             f"version {code.get('code_version')}, effective from {code.get('effective_date')}. "
                             f"It is adopted by {', '.join(code.get('adopted_by', []))}"
            })

            # Process chapters, sections, and subsections
            if 'chapters' in code:
                for chapter in code['chapters']:
                    chapter_title = chapter.get('chapter')
                    codes_data.append({
                        "input_text":"",
                        'instructions': f"What does Chapter {chapter_title} in {code.get('code')} cover?",
                        'output_text': f"Chapter {chapter_title} in {code.get('code')} covers the following:"
                    })

                    for section in chapter.get('sections', []):
                        section_title = section.get('section')
                        codes_data.append({
                            "input_text":"",
                            'instructions': f"What is covered in Section {section_title} of Chapter {chapter_title}?",
                            'output_text': f"Section {section_title} of Chapter {chapter_title} discusses: {section.get('content')}"
                        })

                        for subsection in section.get('subsections', []):
                            codes_data.append({
                                "input_text":"",
                                'instructions': f"What does Subsection {subsection.get('title')} in Section {section_title} say?",
                                'output_text': f"Subsection {subsection.get('title')} states: {subsection.get('content')}",
                                "input":""
                            })

    return codes_data

def process_zoning_safety_data(data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Process zoning and safety-related data."""
    zoning_safety = []

    # Process zoning data
    if 'zoning' in data and isinstance(data['zoning'], list):
        for zoning_entry in data['zoning']:
            city_name = zoning_entry.get('cityName')
            for project in zoning_entry.get('zoning', []):
                project_type = project.get('projectType')
                zoning_details = project.get('zoningDetails', {})

                details_str = ""
                for key, value in zoning_details.items():
                    if isinstance(value, dict):
                        details_str += f"{key.capitalize()}: {value.get('value')} {value.get('unit')}, "
                    else:
                        details_str += f"{key.capitalize()}: {value}, "

                zoning_safety.append({
                    'instructions': f"What are the zoning regulations for {project_type} projects in {city_name}?",
                    'output_text': f"For {project_type} projects in {city_name}, the zoning regulations include: {details_str[:-2]}",
                    "input":""
                })

    # Process safety measures
    if 'safetyMeasures' in data:
        for safety in data['safetyMeasures']:
            location_name = safety.get('location_name')
            safety_measures = safety.get('safetyMeasures', [{}])[0]

            details_str = ""
            for key, value in safety_measures.items():
                if isinstance(value, dict):
                    if 'requirements' in value:
                        details_str += f"{key.capitalize()}: {value.get('requirements')}, "
                    else:
                        is_active = any(value.get(k, False) for k in ['isSeismicActive', 'isWildfireProne', 'isFloodProne', 'isHurricaneProne'])
                        details_str += f"{key.capitalize()}: {'Yes' if is_active else 'No'}, "
                else:
                    details_str += f"{key.capitalize()}: {value}, "

            zoning_safety.append({
                'instructions': f"What are the safety measures in {location_name}?",
                'output_text': f"In {location_name}, the safety measures include: {details_str[:-2]}",
                "input":""
            })

    return zoning_safety

def process_code_metadata(data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Process code types, versions, and related metadata."""
    metadata = []

    # Process code types
    if 'codeTypes' in data:
        for code in data['codeTypes']:
            metadata.append({
                'instructions': f"What does code type {code.get('id')} represent?",
                'output_text': f"Code type {code.get('id')} represents {code.get('name')}",
                "input":''
            })

    # Process code versions
    if 'codeVersions' in data:
        for version in data['codeVersions']:
            metadata.append({
                'instructions': f"What is code version {version.get('id')}?",
                'output_text': f"Version {version.get('id')} is {version.get('version')} "
                             f"for code type {version.get('codeTypeId')}",
            "input":""
            })

    # Process special requirements and standards
    for category in ['specialRequirements', 'accessibilityStandards', 'codeApplicabilities']:
        if category in data:
            for item in data[category]:
                metadata.append({
                    'input_text': f"What are the {category} for code version {item.get('codeVersionId')}?",
                    'output_text': f"For version {item.get('codeVersionId')}, "
                                 f"the {category} are: {item.get('requirement') or item.get('standard') or item.get('applicability')}",
                    'input':""
                })

    return metadata

def save_to_jsonl(data: List[Dict[str, str]], filename: str) -> None:
    """Save data to a JSONL file."""
    with open(filename, 'w', encoding='utf-8') as f:
        for entry in data:
            json.dump(entry, f, ensure_ascii=False)
            f.write('\n')

def main():
    try:
        # Load data
        with open('building_codes.json', 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Process and save each category
        categories = {
            'building_codes_train.jsonl': process_codes_data(data),
            'zoning_safety.jsonl': process_zoning_safety_data(data),
            'code_metadata.jsonl': process_code_metadata(data)
        }

        # Save each category
        for filename, category_data in categories.items():
            save_to_jsonl(category_data, filename)
            print(f"Saved {len(category_data)} entries to {filename}")
           # save_to_jsonl(category_data, "all_data.jsonl")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Saved 6650 entries to building_codes_train.jsonl
Saved 11 entries to zoning_safety.jsonl
Saved 27 entries to code_metadata.jsonl


In [10]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "triton<0.9.0" peft accelerate bitsandbytes
import torch
print(torch.cuda.is_available())  # Should print True if CUDA is available
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA GPU")

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-4nmt3w4d/unsloth_268a7ce74c4d48e6a3c04f3d6658e059
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-4nmt3w4d/unsloth_268a7ce74c4d48e6a3c04f3d6658e059
  Resolved https://github.com/unslothai/unsloth.git to commit d8ad96b018bbe90861144818c2b3e1b229287fc7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting xformers<0.0.27
  Downloading xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
[31mERROR: Could not find a version that satisfies the requirement triton<0.9.0 (from versions: 2.0.0, 2.1.0, 2.2.0, 2.3.0, 2.3.1, 3.0.0, 3.1.0)[0m[31m
[0m[31mERROR: No matching dist

Input output to putput JSONL data

In [19]:
import json
from typing import Dict, List, Any

# Assuming this is your corrected JSON data
with open('./building_codes.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


def process_codes_data(data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Process main building codes with their chapters, sections, and subsections."""
    codes_data = []

    if 'codes' in data and isinstance(data['codes'], list):
        for code in data['codes']:
            # Main code information
            codes_data.append({
                "input_text":"",
                'instructions': f"What is the code with ID {code.get('id')}?",
                'output_text': str(code)
            })

            # Process chapters, sections, and subsections
            if 'chapters' in code:
                for chapter in code['chapters']:
                    chapter_title = chapter.get('chapter')
                    codes_data.append({
                        "input_text":"",
                        'instructions': f"What does Chapter {chapter_title} in {code.get('code')} cover?",
                        'output_text': str(chapter)
                    })

                    for section in chapter.get('sections', []):
                        section_title = section.get('section')
                        codes_data.append({
                            "input_text":"",
                            'instructions': f"What is covered in Section {section_title} of Chapter {chapter_title}?",
                            'output_text': str(section)
                        })

                        for subsection in section.get('subsections', []):
                            codes_data.append({
                                "input_text":"",
                                'instructions': f"What does Subsection {subsection.get('title')} in Section {section_title} say?",
                                'output_text': str(subsection)

                            })

    return codes_data

def process_zoning_safety_data(data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Process zoning and safety-related data."""
    zoning_safety = []

    # Process zoning data
    if 'zoning' in data and isinstance(data['zoning'], list):
        for zoning_entry in data['zoning']:
            city_name = zoning_entry.get('cityName')
            for project in zoning_entry.get('zoning', []):
                project_type = project.get('projectType')
                zoning_details = project.get('zoningDetails', {})

                details_str = ""
                for key, value in zoning_details.items():
                    if isinstance(value, dict):
                        details_str += f"{key.capitalize()}: {value.get('value')} {value.get('unit')}, "
                    else:
                        details_str += f"{key.capitalize()}: {value}, "

                zoning_safety.append({
                    'instructions': f"What are the zoning regulations for {project_type} projects in {city_name}?",
                    'output_text': str(zoning_entry),
                    "input_text":""
                })

    # Process safety measures
    if 'safetyMeasures' in data:
        for safety in data['safetyMeasures']:
            location_name = safety.get('location_name')
            safety_measures = safety.get('safetyMeasures', [{}])[0]

            details_str = ""
            for key, value in safety_measures.items():
                if isinstance(value, dict):
                    if 'requirements' in value:
                        details_str += f"{key.capitalize()}: {value.get('requirements')}, "
                    else:
                        is_active = any(value.get(k, False) for k in ['isSeismicActive', 'isWildfireProne', 'isFloodProne', 'isHurricaneProne'])
                        details_str += f"{key.capitalize()}: {'Yes' if is_active else 'No'}, "
                else:
                    details_str += f"{key.capitalize()}: {value}, "

            zoning_safety.append({
                'instructions': f"What are the safety measures in {location_name}?",
                'output_text': str(safety),
                "input_text":""
            })

    return zoning_safety

def process_code_metadata(data: Dict[str, Any]) -> List[Dict[str, str]]:
    """Process code types, versions, and related metadata."""
    metadata = []

    # Process code types
    if 'codeTypes' in data:
        for code in data['codeTypes']:
            metadata.append({
                'instructions': f"What does code type {code.get('id')} represent?",
                'output_text': str(code),
                "input_text":''
            })

    # Process code versions
    if 'codeVersions' in data:
        for version in data['codeVersions']:
            metadata.append({
                'instructions': f"What is code version {version.get('id')}?",
                'output_text': str(version),
                 "input_text":""
            })

    # Process special requirements and standards
    for category in ['specialRequirements', 'accessibilityStandards', 'codeApplicabilities']:
        if category in data:
            for item in data[category]:
                metadata.append({
                    'instructions': f"What are the {category} for code version {item.get('codeVersionId')}?",
                    'output_text': str(item),
                    'input_text':""
                })

    return metadata

def save_to_jsonl(data: List[Dict[str, str]], filename: str) -> None:
    """Save data to a JSONL file."""
    with open(filename, 'w', encoding='utf-8') as f:
        for entry in data:
            json.dump(entry, f, ensure_ascii=False)
            f.write('\n')

def main():
    try:
        # Load data
        with open('building_codes.json', 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Process and save each category
        categories = {
            'building_codes_train.jsonl': process_codes_data(data),
            'zoning_safety.jsonl': process_zoning_safety_data(data),
            'code_metadata.jsonl': process_code_metadata(data)
        }

        # Save each category
        for filename, category_data in categories.items():
            # save_to_jsonl(category_data, filename)
            print(f"Saved {len(category_data)} entries to {filename}")
            save_to_jsonl(category_data, "output_JSON.jsonl")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Saved 6650 entries to building_codes_train.jsonl
Saved 11 entries to zoning_safety.jsonl
Saved 27 entries to code_metadata.jsonl


In [11]:
import torch
!nvidia-smi
import torch
print(torch.cuda.is_available())  # Should return True if GPU is available

print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))  # Prints the GPU name if available


Wed Jan 15 06:09:28 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [12]:
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
from unsloth import FastLanguageModel
import torch
max_seq_length=2048
dtype=None

model, tokenizer= FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype=dtype,
    load_in_4bit=True
)




🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [13]:
model= FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_rslora=False,
    loftq_config=None
)

Unsloth 2025.1.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from unsloth import to_sharegpt
dataset = to_sharegpt(
    dataset,
    merged_prompt = \
        "[[The passenger embarked from {Embarked}.]]"\
        "[[\nThey are {Sex}.]]"\
        "[[\nThey have {Parch} parents and childen.]]"\
        "[[\nThey have {SibSp} siblings and spouses.]]"\
        "[[\nTheir passenger class is {Pclass}.]]"\
        "[[\nTheir age is {Age}.]]"\
        "[[\nThey paid ${Fare} for the trip.]]",
    conversation_extension = 5, # Randomnly combines conversations into 1! Good for long convos
    output_column_name = "Survived",
)

In [23]:
import json

# Open the JSONL file
with open('./all_data.jsonl', 'r', encoding='utf-8') as file:
    # Read each line, parse it as JSON, and store it in a list
    data = [json.loads(line) for line in file]

# Transform the data into the desired format
transformed_data = []
for item in data:
    # Extract the user input (instructions) and assistant output (output_text)
    user_input = item.get("instructions", "")
    assistant_output = item.get("output_text", "")

    # Create the conversation structure
    conversation = {
        "conversations": [
            {"content": user_input, "from": "user"},
            {"content": assistant_output, "from": "assistant"}
        ]
    }

    # Add the transformed conversation to the list
    transformed_data.append(conversation)

# Now `transformed_data` is in the desired format
print(transformed_data[:2])  # Print the first 2 transformed items for verification
dataset = standardize_sharegpt(transformed_data)


TypeError: list indices must be integers or slices, not str