**Checking the data**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install kaggle



In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d vikramtiwari/pix2code

Dataset URL: https://www.kaggle.com/datasets/vikramtiwari/pix2code
License(s): unknown
Downloading pix2code.zip to /content
 99% 878M/891M [00:04<00:00, 219MB/s]
100% 891M/891M [00:04<00:00, 220MB/s]


In [None]:
!unzip pix2code.zip -d pix2code_kaggle

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E00D88A-D1D2-44C3-BEB3-5825440C90F9.gui  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E00D88A-D1D2-44C3-BEB3-5825440C90F9.png  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E1BE7E1-5C10-4B9F-AB53-AEA1BD04D3FB.gui  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E1BE7E1-5C10-4B9F-AB53-AEA1BD04D3FB.png  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E1D77C9-8476-48C1-A29B-1BA833D536CA.gui  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E1D77C9-8476-48C1-A29B-1BA833D536CA.png  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E499C05-D2E4-43B2-8502-E2255A629B3F.gui  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E499C05-D2E4-43B2-8502-E2255A629B3F.png  
  inflating: pix2code_kaggle/pix2code-dataset/web/all_data/8E5E67A8-67FE-4576-A6CE-08EB41920429.gui  
  inflating: pix2

In [None]:
!ls pix2code_kaggle


android  ios  pix2code-dataset	web


In [None]:
!ls /content

drive  kaggle.json  pix2code_kaggle  pix2code.zip  sample_data


In [None]:
!ls pix2code_kaggle/

android  ios  pix2code-dataset	web


In [None]:
!find pix2code_kaggle/pix2code-dataset -name "*.gui" | head

pix2code_kaggle/pix2code-dataset/ios/all_data/4A20CC84-417C-4C11-B5AD-947BDB4C45BD.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/B30A18D3-C9EC-469A-B8CC-98B32883E488.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/A0D09F70-B9CE-421A-84B4-351A76C054E9.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/E088DF89-7B4B-4014-B661-3366650763C3.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/38BD6BD9-738B-4561-875C-1E648752425A.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/102544CB-02BF-4A14-B467-2B84B43E2470.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/4B822561-94CD-4A8D-A886-0B077B9055BC.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/DE9CEC90-E3F6-4E64-A838-D207D26CF796.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/F5D41357-0D4A-4F20-8A54-8AE3EAC88AD8.gui
pix2code_kaggle/pix2code-dataset/ios/all_data/CE5B4F68-70F6-48BE-86CA-ACA8A32B635D.gui


In [None]:
import os
from pathlib import Path

DATA_DIR = Path("/content/pix2code_kaggle/pix2code-dataset")   # <-- CHANGE this to your path

# --- 1. Scan for files ---
images = list(DATA_DIR.rglob("*.png")) + list(DATA_DIR.rglob("*.jpg"))
gui_files = list(DATA_DIR.rglob("*.gui"))

print("Found images:", len(images))
print("Found .gui files:", len(gui_files))

# --- 2. Build sets of basenames ---
image_basenames = set([f.stem for f in images])
gui_basenames   = set([f.stem for f in gui_files])

print("\nImages without GUI:", len(image_basenames - gui_basenames))
print("GUI without images:", len(gui_basenames - image_basenames))

# --- 3. Print sample file paths ---
print("\nSample image:", images[0] if images else "NONE FOUND")
print("Sample gui file:", gui_files[0] if gui_files else "NONE FOUND")

# --- 4. Inspect a GUI file ---
if gui_files:
    print("\nPreview of first GUI file:\n")
    with open(gui_files[0], "r") as f:
        print(f.read()[:500])


Found images: 5250
Found .gui files: 5250

Images without GUI: 0
GUI without images: 0

Sample image: /content/pix2code_kaggle/pix2code-dataset/ios/all_data/86E44CA6-4D59-465B-A544-83F8615F0391.png
Sample gui file: /content/pix2code_kaggle/pix2code-dataset/ios/all_data/4A20CC84-417C-4C11-B5AD-947BDB4C45BD.gui

Preview of first GUI file:

stack {
row {
label, btn-add
}
row {
img, label
}
row {
label, switch
}
row {
label, btn-add
}
row {
label, btn-add
}
}
footer {
btn-contact, btn-contact, btn-contact, btn-download
}



**Normalize Images, Preprocess, Build Metadata**

In [None]:
import os
import json
from pathlib import Path
from PIL import Image
import shutil
from sklearn.model_selection import train_test_split

RAW_DIR = Path("/content/pix2code_kaggle/pix2code-dataset/ios/all_data")
OUT_DIR = Path("/content/processed")
IMG_OUT = OUT_DIR / "images"
DSL_OUT = OUT_DIR / "dsl"

IMG_OUT.mkdir(parents=True, exist_ok=True)
DSL_OUT.mkdir(parents=True, exist_ok=True)

# ---- 1. Collect pairs ----
images = sorted(list(RAW_DIR.rglob("*.png")))
guis = sorted(list(RAW_DIR.rglob("*.gui")))

assert len(images) == len(guis)

image_dict = {img.stem: img for img in images}
gui_dict   = {gui.stem: gui for gui in guis}

common = sorted(list(image_dict.keys() & gui_dict.keys()))
print("Total valid pairs:", len(common))

# ---- 2. Normalize images ----
def normalize_image(src, dst, size=(224,224)):
    img = Image.open(src).convert("RGB")
    img = img.resize(size)
    img.save(dst)

metadata = []

for name in common:
    src_img = image_dict[name]
    dst_img = IMG_OUT / f"{name}.png"

    src_gui = gui_dict[name]
    dst_gui = DSL_OUT / f"{name}.gui"

    normalize_image(src_img, dst_img)
    shutil.copy(src_gui, dst_gui)

    metadata.append({
        "id": name,
        "image": str(dst_img),
        "gui": str(dst_gui)
    })

with open(OUT_DIR / "metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

print("Finished normalization + metadata.")

Total valid pairs: 1750
Finished normalization + metadata.


**Create Train/Val/Test Split**

In [None]:
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

OUT_DIR = Path("/content/processed")
metadata = json.load(open(OUT_DIR / "metadata.json"))

train, temp = train_test_split(metadata, test_size=0.2, random_state=42)
val, test   = train_test_split(temp, test_size=0.5, random_state=42)

json.dump(train, open(OUT_DIR/"train.json","w"), indent=2)
json.dump(val,   open(OUT_DIR/"val.json","w"), indent=2)
json.dump(test,  open(OUT_DIR/"test.json","w"), indent=2)

print("Splits created:",
      "train:", len(train),
      "val:", len(val),
      "test:", len(test))

Splits created: train: 1400 val: 175 test: 175


**DSL Tokenizer and Vocab**

In [None]:
import os
import json
from pathlib import Path

DSL_DIR = Path("/content/processed/dsl")
TOKEN_OUT = Path("/content/processed/tokens")
TOKEN_OUT.mkdir(parents=True, exist_ok=True)

vocab = set()

def tokenize(line):
    return line.strip().replace(",", " , ").split()

for gui_file in sorted(DSL_DIR.glob("*.gui")):
    tokens = []

    with open(gui_file) as f:
        for line in f:
            toks = tokenize(line)
            tokens.extend(toks)
            vocab.update(toks)

    out_file = TOKEN_OUT / (gui_file.stem + ".txt")
    with open(out_file, "w") as f:
        f.write(" ".join(tokens))

vocab = sorted(list(vocab))

json.dump({"vocab": vocab}, open("/content/processed/vocab.json", "w"), indent=2)

print("Tokenizer complete! Vocab size =", len(vocab))

Tokenizer complete! Vocab size = 15


In [15]:
!zip -r processed.zip /content/processed

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/processed/tokens/9C113809-BF6A-4395-91AC-5355CA3E4D77.txt (deflated 43%)
  adding: content/processed/tokens/CDE1339D-A0F4-4A8B-8955-C5E37F206A68.txt (deflated 64%)
  adding: content/processed/tokens/AD92F823-B48A-4B78-9947-EA7A050602A9.txt (deflated 33%)
  adding: content/processed/tokens/9E572E53-C2B9-4082-95C7-8CB8826A9E84.txt (deflated 30%)
  adding: content/processed/tokens/AEBE79D0-61F9-4AA9-B169-A2756EBE47C7.txt (deflated 60%)
  adding: content/processed/tokens/80F787C7-C295-426A-A6FD-6642BFD6922D.txt (deflated 49%)
  adding: content/processed/tokens/5E9AC0BC-2578-4007-970F-EBA784A432A7.txt (deflated 56%)
  adding: content/processed/tokens/0F8F4F8B-957A-4D3D-B631-765C440BEBF1.txt (deflated 55%)
  adding: content/processed/tokens/0784B26E-5DD1-4AF2-93B2-DEC6C4611DBF.txt (deflated 28%)
  adding: content/processed/tokens/1F7D2C6B-1C53-47CD-9F5D-6B10E0BE4E83.txt (deflated 50%)
  adding: content/process

In [16]:
from google.colab import files
files.download('processed.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>