Install Packages/Libraries

In [None]:
%pip install --upgrade pip
%pip install torch torchvision torchaudio

# Transformers & datasets
%pip install pandas
%pip install datasets
%pip install peft
%pip install transformers
%pip install transformers[torch]
%pip install 'accelerate>=0.26.0'

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloadin

Import Packages/Libraries

In [None]:
import os, math, torch, pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, TrainerCallback
)
from peft import get_peft_model, LoraConfig, TaskType

In [None]:
Configurations

In [None]:
USE_LORA = True
DATA_DIR     = "outputs/bigcode-ts-output-4000-formatted"
METADATA_CSV = "outputs/bigcode-ts-output-4000-types.csv"
OUTPUT_DIR   = "outputs/typescriptmate-model"
BATCH_SIZE   = 4
MAX_LENGTH   = 512
EPOCHS       = 3
LR           = 5e-5
GRAD_CLIP    = 0.0
SEED         = 42

Count number of TypeScipt files in folder

In [2]:
file_count = sum(len(files) for _, _, files in os.walk(DATA_DIR))
print("Total files:", file_count)

Total files: 4458


Check if MPS (Accelerated PyTorch Training for Apple Silicon) is supported

In [12]:
import torch
print(torch.backends.mps.is_available())  # Should be True
print(torch.backends.mps.is_built())       # Should be True


True
True


Load metadata for Type Awareness

In [None]:
df = pd.read_csv(METADATA_CSV)
for col in ["TypeAliases", "Interfaces"]:
    if col in df.columns:
        df[col] = df[col].fillna("")
print(f"Loaded {len(df)} metadata rows")

Attach file text

In [None]:
dataset_meta = Dataset.from_pandas(df, preserve_index=False)

def add_text(example):
    path = example["File"]
    if not os.path.isabs(path):
        path = os.path.join(DATA_DIR, path)
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            example["text"] = f.read()
    except:
        example["text"] = ""
    return example

dataset_meta = dataset_meta.map(add_text, batched=False)

Filter bad examples

In [None]:
dataset_meta = dataset_meta.filter(lambda ex: ex["text"].strip() != "")
print("Non-empty examples:", len(dataset_meta))

Split and filter train and validation data for annotated examples

In [None]:
splits = dataset_meta.train_test_split(test_size=0.1, seed=SEED)
datasets = DatasetDict({
    "train": splits["train"].filter(lambda ex: ex["TypeAliases"] or ex["Interfaces"]),
    "validation": splits["test"].filter(lambda ex: ex["TypeAliases"] or ex["Interfaces"])
})
print("Filtered split:")
print("  • train:", len(datasets["train"]))
print("  • validation:", len(datasets["validation"]))

Tokenize

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

to_remove = df.columns.tolist() + ["text"]
tokenized = datasets.map(
    tokenize_fn,
    batched=True,
    remove_columns=to_remove
)

Sanity check on tokens

In [None]:
valid_count = sum(
    any(tok != tokenizer.eos_token_id for tok in ex["input_ids"])
    for ex in tokenized["train"]
)
print(f"Usable tokenized examples: {valid_count} / {len(tokenized['train'])}")

Collator & base Model

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
base_model = AutoModelForCausalLM.from_pretrained("gpt2")

Apply LoRA if enabled

In [None]:
if USE_LORA:
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=4,
        lora_alpha=16,
        lora_dropout=0.0,
        target_modules=["c_attn"]  # you can inspect model to try others
    )
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()
else:
    model = base_model

Move model to supported device

In [None]:
device = (
    torch.device("mps") if torch.backends.mps.is_available()
    else torch.device("cuda") if torch.cuda.is_available()
    else torch.device("cpu")
)
print("Using device:", device)

model.to(torch.float32)
model.to(device)

In [34]:

# 

# ─── 7)  ─────────────────────────────────────────────────

# ─── 8) ─────────────────────────────────────────────────


# ─── 9)  ──────────────────────────────────────────────────


# ─── 10) TrainingArguments ────────────────────────────────────────────────────
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    max_grad_norm=GRAD_CLIP,
    logging_steps=100,
    save_steps=500,
)

# ─── 11) Trainer with loss logger ─────────────────────────────────────────────
class LossLogger(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        print("LOGS:", logs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    callbacks=[LossLogger()]
)

# ─── 12) Train ────────────────────────────────────────────────────────────────
trainer.train()

# ─── 13) Evaluate ─────────────────────────────────────────────────────────────
eval_results = trainer.evaluate()
print("Full eval results:", eval_results)
if eval_results.get("eval_loss") is not None and not math.isnan(eval_results["eval_loss"]):
    print("Validation Perplexity:", math.exp(eval_results["eval_loss"]))
else:
    print("⚠️ NaN eval loss — check for data/token issues.")

# ─── 14) Save model and tokenizer ─────────────────────────────────────────────
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Loaded 4453 metadata rows


Map: 100%|██████████| 4453/4453 [00:00<00:00, 5135.30 examples/s]
Filter: 100%|██████████| 4453/4453 [00:00<00:00, 90468.13 examples/s]


Non-empty examples: 4243


Filter: 100%|██████████| 3818/3818 [00:00<00:00, 41617.99 examples/s]
Filter: 100%|██████████| 425/425 [00:00<00:00, 38437.54 examples/s]


Filtered split:
  • train: 550
  • validation: 76


Map: 100%|██████████| 550/550 [00:05<00:00, 104.54 examples/s]
Map: 100%|██████████| 76/76 [00:00<00:00, 400.00 examples/s]


Usable tokenized examples: 550 / 550




trainable params: 147,456 || all params: 124,587,264 || trainable%: 0.1184
Using device: mps


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,3.0349
200,2.7123
300,2.5826
400,2.5452


LOGS: {'loss': 3.0349, 'learning_rate': 3.804347826086957e-05, 'epoch': 0.7246376811594203}
LOGS: {'loss': 2.7123, 'learning_rate': 2.5966183574879227e-05, 'epoch': 1.4492753623188406}
LOGS: {'loss': 2.5826, 'learning_rate': 1.388888888888889e-05, 'epoch': 2.1739130434782608}
LOGS: {'loss': 2.5452, 'learning_rate': 1.8115942028985508e-06, 'epoch': 2.898550724637681}
LOGS: {'train_runtime': 299.8823, 'train_samples_per_second': 5.502, 'train_steps_per_second': 1.381, 'total_flos': 431879277772800.0, 'train_loss': 2.711522346533439, 'epoch': 3.0}




LOGS: {'eval_loss': 2.371490001678467, 'eval_runtime': 5.1436, 'eval_samples_per_second': 14.776, 'eval_steps_per_second': 3.694, 'epoch': 3.0}
Full eval results: {'eval_loss': 2.371490001678467, 'eval_runtime': 5.1436, 'eval_samples_per_second': 14.776, 'eval_steps_per_second': 3.694, 'epoch': 3.0}
Validation Perplexity: 10.713343297136609


('outputs/typescriptmate-model-lora/tokenizer_config.json',
 'outputs/typescriptmate-model-lora/special_tokens_map.json',
 'outputs/typescriptmate-model-lora/vocab.json',
 'outputs/typescriptmate-model-lora/merges.txt',
 'outputs/typescriptmate-model-lora/added_tokens.json',
 'outputs/typescriptmate-model-lora/tokenizer.json')

In [33]:
print("Full eval results:", eval_results)

if "eval_loss" in eval_results:
    ppl = math.exp(eval_results["eval_loss"])
    print(f"Validation Perplexity: {ppl:.2f}")
else:
    print("No 'eval_loss' in eval_results; keys are:", list(eval_results.keys()))

Full eval results: {'eval_loss': 2.369295597076416, 'eval_runtime': 5.6532, 'eval_samples_per_second': 13.444, 'eval_steps_per_second': 3.361, 'epoch': 3.0}
Validation Perplexity: 10.69
