In [1]:
%pip uninstall -y torch torchvision torchaudio 

%pip install --no-cache-dir \
  torch torchvision torchaudio \
  --index-url https://download.pytorch.org/whl/cpu

%pip uninstall -y transformers datasets

%pip install --no-cache-dir \
  transformers datasets

%pip install transformers[torch]

%pip install 'accelerate>=0.26.0'

Found existing installation: torch 2.7.0
Uninstalling torch-2.7.0:
  Successfully uninstalled torch-2.7.0
Found existing installation: torchvision 0.22.0
Uninstalling torchvision-0.22.0:
  Successfully uninstalled torchvision-0.22.0
Found existing installation: torchaudio 2.7.0
Uninstalling torchaudio-2.7.0:
  Successfully uninstalled torchaudio-2.7.0
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.7.0-cp310-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.22.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.7.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.7.0-cp310-none-macosx_11_0_arm64.whl (68.6 MB)
[

In [3]:
import os

folder = "outputs/bigcode-ts-output-4000"
file_count = sum(len(files) for _, _, files in os.walk(folder))
print("Total files:", file_count)

Total files: 4458


In [2]:
# ─── 2) Imports ─────────────────────────────────────────────────────────────
import glob, os, math
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)

# ─── 3) Configuration ───────────────────────────────────────────────────────
DATA_DIR   = "outputs/bigcode-ts-output-4000"                       # local folder with .ts/.tsx files
BATCH_SIZE = 4
MAX_LENGTH = 512
EPOCHS     = 3
OUTPUT_DIR = "outputs/typescriptmate-model"

# ─── 4) Gather all TS/TSX paths ─────────────────────────────────────────────
ts_paths  = glob.glob(os.path.join(DATA_DIR, "**/*.ts"),  recursive=True)
tsx_paths = glob.glob(os.path.join(DATA_DIR, "**/*.tsx"), recursive=True)
all_paths = ts_paths + tsx_paths
print(f"Found {len(all_paths)} TypeScript files")

# ─── 5) Build a HF Dataset from file contents ───────────────────────────────
def gen_examples():
    for path in all_paths:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        yield {"text": text}

dataset_all = Dataset.from_generator(gen_examples)

# ─── 6) Split into train / validation ───────────────────────────────────────
splits = dataset_all.train_test_split(test_size=0.1, seed=42)
datasets = DatasetDict(train=splits["train"], validation=splits["test"])

# ─── 7) Load & configure the tokenizer ─────────────────────────────────────
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

tokenized = datasets.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

# ─── 8) Data collator & model ──────────────────────────────────────────────
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
model = AutoModelForCausalLM.from_pretrained("gpt2")

# ─── 9) Training arguments ─────────────────────────────────────────────────
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_steps=500,
    logging_steps=200,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    do_train=True,
    do_eval=True
)

# ─── 10) Initialize Trainer ─────────────────────────────────────────────────
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator
)

# ─── 11) Train ──────────────────────────────────────────────────────────────
trainer.train()

# ─── 12) Evaluate & print metrics ──────────────────────────────────────────
eval_results = trainer.evaluate()

# ─── 13) Save fine-tuned model ─────────────────────────────────────────────
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

  from .autonotebook import tqdm as notebook_tqdm


Found 4458 TypeScript files


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
200,1.8738
400,1.6528
600,1.5834
800,1.5409
1000,1.4898
1200,1.4218
1400,1.4007
1600,1.3404
1800,1.3406
2000,1.3647




Full eval results: {'eval_loss': 1.3422685861587524, 'eval_runtime': 30.2038, 'eval_samples_per_second': 14.766, 'eval_steps_per_second': 3.708, 'epoch': 3.0}


In [3]:
print("Full eval results:", eval_results)

if "eval_loss" in eval_results:
    ppl = math.exp(eval_results["eval_loss"])
    print(f"Validation Perplexity: {ppl:.2f}")
else:
    print("No 'eval_loss' in eval_results; keys are:", list(eval_results.keys()))

Validation Perplexity: 3.83
