In [2]:
import argparse
import random
import logging
from tqdm import tqdm
from resources import webpage_purposes
from utils import save_json, llm_generate
from generate_prompts import generate_prompts

In [2]:
# utils.py
import json
import re
import logging
from transformers import pipeline
from tqdm import tqdm
import requests
from typing import Optional

def llm_generate(prompt: str) -> Optional[str]:
    """Generate response using local CodeLlama API"""
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={"model": "deepseek-r1:32b", "prompt": prompt}
        )
        
        # Concatenate all response chunks
        full_response = ""
        for line in response.iter_lines():
            if line:
                json_response = json.loads(line)
                if 'response' in json_response:
                    full_response += json_response['response']
        
        # Extract response after </think> tag if present
        if '</think>' in full_response:
            full_response = full_response.split('</think>')[-1].strip()
            # Split by newlines and take everything after the first empty line
            if '\n\n' in full_response:
                full_response = full_response.split('\n\n', 1)[1].strip()
            
        return full_response
    except Exception as e:
        print(f"Error generating with CodeLlama: {str(e)}")
        return None

#### Create a small dataset 4000 data points max

In [9]:
import json

with open('./old_datasets/final_dataset_old_data/final_dataset.json', 'r') as f:
    data = json.load(f)

In [3]:
import random

# Randomly select 4000 items
selected_data = random.sample(data, 4000)

# Extract prompts from selected items
prompts = [item['prompt'] for item in selected_data]

# Save the selected data for future use
selected_data_with_all_fields = selected_data

print(f"Number of selected prompts: {len(prompts)}")

Number of selected prompts: 4000


In [7]:
with open('old_dataset_4000.json', 'w') as f:
    json.dump(selected_data, f, indent=4)

#### Convert json dataset to HF dataset

In [4]:
def remove_before_doctype(input_string):
    doctype_index = input_string.find('<!DOCTYPE html>')
    
    if doctype_index == -1:
        # If the doctype tag doesn't exist in the string
        return input_string
    
    # Return only the part of the string starting from the doctype tag
    return input_string[doctype_index:]

# Example usage:
html_string = "Some unwanted content here <!DOCTYPE html><html><body>Hello world!</body></html>"
cleaned_string = remove_before_doctype(html_string)
print(cleaned_string)
# Output: <!DOCTYPE html><html><body>Hello world!</body></html>

<!DOCTYPE html><html><body>Hello world!</body></html>


In [8]:
import re

def clean_html_string(input_string):
    # Find the doctype tag
    doctype_index = input_string.find('<!DOCTYPE html>')
    
    if doctype_index == -1:
        # If the doctype tag doesn't exist in the string
        cleaned_string = input_string
    else:
        # Return only the part of the string starting from the doctype tag
        cleaned_string = input_string[doctype_index:]
    
    # Remove everything after </html> tag if it exists
    html_end_pattern = r'(</html>)[\s\S]*$'
    cleaned_string = re.sub(html_end_pattern, r'\1', cleaned_string)
    
    return cleaned_string

# Example usage:
html_string = "<!DOCTYPE html><html><body>Hello world!</body></html>"
cleaned_string = clean_html_string(html_string)
print(cleaned_string)
# Output: <!DOCTYPE html><html><body>Hello world!</body></html>

<!DOCTYPE html><html><body>Hello world!</body></html>


##### Clean the String

In [15]:
import json
unclean_webpages_path = "./generated_dataset/webpages.json"
with open(unclean_webpages_path, 'r') as f:
    unclean_webpages = json.load(f)

In [16]:
from tqdm import tqdm
cleaned_wepages = []
for item in tqdm(unclean_webpages, desc="Cleaning code"):
    code  = clean_html_string(item['code'])
    item['code'] = code
    cleaned_wepages.append(item)

Cleaning code: 100%|██████████| 4000/4000 [00:00<00:00, 80973.07it/s]


In [18]:
cleaned_wepages_path = './generated_dataset/cleaned_webpages.json'

with open(cleaned_wepages_path, 'w') as f:
    json.dump(cleaned_wepages, f, indent=2)

In [19]:
from utils import create_and_split_dataset

create_and_split_dataset(cleaned_wepages, 0.0, output_dir='./final_dataset/final_webpages_only_4000', save_jsonl=False)

Saving the dataset (1/1 shards): 100%|██████████| 4000/4000 [00:00<00:00, 138908.39 examples/s]


DatasetDict({
    train: Dataset({
        features: ['prompt', 'code'],
        num_rows: 4000
    })
})

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Load base model and tokenizer
model_name = "meta-llama/CodeLlama-7b-hf"  # example base model
base_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the LoRA configuration and model
lora_path = "./models/finetuned_codellama_4bit_lora_run_1"  # path to your LoRA weights
config = PeftConfig.from_pretrained(lora_path)
model = PeftModel.from_pretrained(base_model, lora_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


In [None]:
# Set model to evaluation mode
model.eval()

# Define a prompt
prompt = "Create a simple HTML webpage"

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=2048,
    temperature=0.7,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Create a simple HTML webpage where users can input their email address and receive a confirmation message when clicking on a button.### Response:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Email Confirmation Form</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            background-color: #f0f0f0;
        }
        .container {
            max-width: 600px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        .form-group {
            margin-bottom: 15px;
        }
        label {
            display: block;
            margin-bottom: 5px;
            font-weight: bold;
        }
        input[type="email"] {
            width: 100%;
            padding: 8px;
            border: 1px solid #ddd;
         

In [3]:
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")
print(f"Size in MB: {total_params * 4 / 1024 / 1024:.2f} MB") 

Total parameters: 6778523648
Size in MB: 25858.02 MB


In [5]:
from utils import clean_html_string

cleaned_generated_text = clean_html_string(generated_text)

In [6]:
print(cleaned_generated_text)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Email Confirmation Form</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            background-color: #f0f0f0;
        }
        .container {
            max-width: 600px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        .form-group {
            margin-bottom: 15px;
        }
        label {
            display: block;
            margin-bottom: 5px;
            font-weight: bold;
        }
        input[type="email"] {
            width: 100%;
            padding: 8px;
            border: 1px solid #ddd;
            border-radius: 4px;
        }
        button {
            background-color: #4CAF50;
            color: white;
            padding: 12px 24px;

In [7]:
def write_html_to_file(html_content: str, output_path: str) -> None:
    """
    Writes HTML content to a file while preserving formatting.
    
    Args:
        html_content (str): The HTML content to write
        output_path (str): Path where the HTML file should be saved
    """
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"Successfully wrote HTML to {output_path}")
    except Exception as e:
        print(f"Error writing HTML file: {str(e)}")

In [8]:
# Example usage
html_content =  cleaned_generated_text 
output_path = "accessible_travel_blog.html"
write_html_to_file(html_content, output_path)

Successfully wrote HTML to accessible_travel_blog.html


In [15]:
import pandas as pd

dev_dataset = pd.read_csv('./developer_prompt_dataset.csv', encoding='latin-1')

In [5]:
dev_dataset

Unnamed: 0,Developer,Student Name,Years of Experience,Prompt
0,D1,Ayo,0-2,Can you create a website that advertises socce...
1,D2,Alla Sreerma Reddy,0-2,Make me a website that is a shopping website t...
2,D3,Rohitha Aradhyula,0-2,Can you create a website for me to display viv...
3,D4,Harshitha Arugonda,0-2,i want you to help me with an assignment.
4,D5,Yashvikumari Bhagat,0-2,"use this language HTML, CSS, JavaScript,build ..."
5,D6,sri duga viswa venkata sai varma,0-2,"adidas,prices,color,product,shirts/saivarma080..."
6,D7,Dharma Sai Bhuvan Samba Siva Deepak Cheemakurthi,0-2,create a fully functional website for clothing...
7,D8,Mahimanvitha Chinnamsetti,0-2,"Design a webpage for career advancement, inclu..."
8,D9,Sai Pavani Danda,0-2,Give me a website advertising Pixel products t...
9,D10,Aparna Desi Reddy,0-2,"provide me a website with html, css, javascrip..."


In [16]:
dev_dataset_dict = dev_dataset.to_dict('records')
print(f"Number of records: {len(dev_dataset_dict)}")

Number of records: 42


In [None]:
dev_dataset_dict

{'Developer': 'D1',
 'Student Name': 'Ayo',
 'Years of Experience': '0-2',
 'Prompt': 'Can you create a website that advertises soccer jerseys with features that include background images, a navigation bar, video options, text descriptions, buttons, various sections, headers, footers, input text fields. Provide the source code. It can be in any language.'}

In [None]:
prompt = dev_dataset_dict[0]['Prompt']

cleaned_generated_text = clean_html_string(generated_text)

# Example usage
html_content =  cleaned_generated_text 
output_path = f"{dev_dataset_dict[0]['Prompt']}.html"
write_html_to_file(html_content, output_path)



In [12]:
import os

In [None]:
os.makedirs('html_file', exist_ok=True)

AttributeError: module 'os' has no attribute 'makedir'