In [1]:
%pip uninstall -y torch torchvision torchaudio 

%pip install --no-cache-dir \
  torch torchvision torchaudio \
  --index-url https://download.pytorch.org/whl/cpu

%pip uninstall -y transformers datasets

%pip install --no-cache-dir \
  transformers datasets

%pip install transformers[torch]

%pip install 'accelerate>=0.26.0'

Found existing installation: torch 2.7.0
Uninstalling torch-2.7.0:
  Successfully uninstalled torch-2.7.0
Found existing installation: torchvision 0.22.0
Uninstalling torchvision-0.22.0:
  Successfully uninstalled torchvision-0.22.0
Found existing installation: torchaudio 2.7.0
Uninstalling torchaudio-2.7.0:
  Successfully uninstalled torchaudio-2.7.0
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.7.0-cp310-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.22.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.7.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.7.0-cp310-none-macosx_11_0_arm64.whl (68.6 MB)
[

In [2]:
import os

folder = "outputs/bigcode-ts-output-4000"
file_count = sum(len(files) for _, _, files in os.walk(folder))
print("Total files:", file_count)

Total files: 4458


In [None]:
# ─── 1) Imports ─────────────────────────────────────────────────────────────
import glob, os, math, torch, pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)

# ─── 2) Configuration ───────────────────────────────────────────────────────
DATA_DIR     = "outputs/bigcode-ts-output-4000"  # folder with .ts/.tsx files
METADATA_CSV = "outputs/ts-output-4000-types.csv"                     # CSV with file, interfaces, types, enums, etc.
BATCH_SIZE   = 4
MAX_LENGTH   = 512
EPOCHS       = 3
OUTPUT_DIR   = "outputs/typescriptmate-model"

# ─── 3) Load metadata and build initial Dataset ────────────────────────────
df = pd.read_csv(METADATA_CSV)
# Ensure file paths are correct: you may need to prepend DATA_DIR
# If 'File' column is relative, adjust accordingly:
# df['file'] = df['File'].apply(lambda p: os.path.join(DATA_DIR, p))

dataset_meta = Dataset.from_pandas(df, preserve_index=False)

# Read file contents into 'text'
def add_text(example):
    path = example['File']
    try:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            example['text'] = f.read()
    except FileNotFoundError:
        example['text'] = ''
    return example

dataset_all = dataset_meta.map(add_text, batched=False)

# ─── 4) Split into train / validation ───────────────────────────────────────
splits = dataset_all.train_test_split(test_size=0.1, seed=42)
datasets = DatasetDict(train=splits['train'], validation=splits['test'])

# ─── 5) Type-Aware Annotation using CSV metadata ────────────────────────────
# Combine all declared and used types into one list per example
TYPE_COLUMNS = ['Interfaces', 'TypeAliases', 'Enums', 'Classes', 'Decorators', 'Imports', 'Exports', 'PredefinedTypesUsed']

def annotate_metadata(example):
    all_types = []
    for col in TYPE_COLUMNS:
        val = example.get(col) or []
        if isinstance(val, str):
            # assume comma-separated in CSV
            val = [v.strip() for v in val.split(',') if v.strip()]
        all_types.extend(val)
    # dedupe
    example['all_types'] = list(dict.fromkeys(all_types))
    return example

datasets = DatasetDict({
    split: ds.map(annotate_metadata, batched=False)
    for split, ds in datasets.items()
})

# Filter to examples with at least one type annotation
datasets = DatasetDict({
    split: ds.filter(lambda ex: len(ex['all_types']) > 0)
    for split, ds in datasets.items()
})

print("After metadata filtering:")
print("  • train:", len(datasets['train']), "examples")
print("  • valid:", len(datasets['validation']), "examples")

# ─── 6) Tokenization ────────────────────────────────────────────────────────
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(batch):
    return tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH
    )

tokenized = datasets.map(
    tokenize_fn,
    batched=True,
    remove_columns=[*TYPE_COLUMNS, 'File', 'text']
)

# ─── 7) Data collator & model ──────────────────────────────────────────────
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
model = AutoModelForCausalLM.from_pretrained("gpt2")

# ─── 8) Training arguments ─────────────────────────────────────────────────
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_steps=500,
    logging_steps=200,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    do_train=True,
    do_eval=True
)

# ─── 9) Trainer initialization & train ──────────────────────────────────────
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    data_collator=data_collator
)

trainer.train()

# ─── 10) Evaluation & save ──────────────────────────────────────────────────
eval_results = trainer.evaluate()
print("Full eval results:", eval_results)
if 'eval_loss' in eval_results:
    print("Perplexity:", math.exp(eval_results['eval_loss']))

model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Map: 100%|██████████| 4453/4453 [00:00<00:00, 14221.84 examples/s]
Map: 100%|██████████| 4007/4007 [00:00<00:00, 14381.24 examples/s]
Map: 100%|██████████| 446/446 [00:00<00:00, 12874.20 examples/s]
Filter: 100%|██████████| 4007/4007 [00:00<00:00, 92572.71 examples/s]
Filter: 100%|██████████| 446/446 [00:00<00:00, 73621.93 examples/s]


After metadata filtering:
  • train: 3734 examples
  • valid: 404 examples


ValueError: Column to remove ['UsedTypes', 'Types'] not in the dataset. Current columns in the dataset: ['File', 'Interfaces', 'TypeAliases', 'Enums', 'Classes', 'Decorators', 'Imports', 'Exports', 'PredefinedTypesUsed', 'text', 'all_types']

In [7]:
print("Full eval results:", eval_results)

if "eval_loss" in eval_results:
    ppl = math.exp(eval_results["eval_loss"])
    print(f"Validation Perplexity: {ppl:.2f}")
else:
    print("No 'eval_loss' in eval_results; keys are:", list(eval_results.keys()))

Full eval results: {'eval_loss': 1.0589054822921753, 'eval_runtime': 347.146, 'eval_samples_per_second': 13.268, 'eval_steps_per_second': 3.318, 'epoch': 3.0}
Validation Perplexity: 2.88
