In [None]:
# Notebook shell escape: quietly install the Hugging Face packages
# (transformers, datasets) that the cells below import from.
!pip install --quiet transformers datasets

In [None]:
import os

# Sanity check: count every file under the data folder, at any depth.
folder = "ts-output"
file_count = 0
for _dirpath, _dirnames, filenames in os.walk(folder):
    file_count += len(filenames)
print("Total files:", file_count)

In [None]:
# ─── 2) Imports ─────────────────────────────────────────────────────────────
import glob, os, math
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)

# ─── 3) Configuration ───────────────────────────────────────────────────────
# Filesystem locations
DATA_DIR = "ts-output"  # local folder searched recursively for .ts/.tsx files
OUTPUT_DIR = "./ts-code-completion-model"  # output_dir handed to TrainingArguments

# Tokenization / training hyper-parameters
MAX_LENGTH = 512  # max_length passed to the tokenizer (longer inputs are truncated)
BATCH_SIZE = 4  # per-device batch size, used for both training and evaluation
EPOCHS = 3  # num_train_epochs

# ─── 4) Gather all TS/TSX paths ─────────────────────────────────────────────
# Recursively collect every .ts and .tsx file under DATA_DIR.
ts_paths  = glob.glob(os.path.join(DATA_DIR, "**/*.ts"),  recursive=True)
tsx_paths = glob.glob(os.path.join(DATA_DIR, "**/*.tsx"), recursive=True)
# glob returns matches in arbitrary (filesystem-dependent) order. Sorting gives
# the dataset a stable row order, so the seeded train/validation split selects
# the same files on every machine and every run.
all_paths = sorted(ts_paths + tsx_paths)
print(f"Found {len(all_paths)} TypeScript files")

# ─── 5) Build a HF Dataset from file contents ───────────────────────────────
def gen_examples(paths=None):
    """Yield one ``{"text": ...}`` record per non-empty source file.

    Args:
        paths: Iterable of file paths to read. When omitted, the module-level
            ``all_paths`` list is used, so a zero-argument call (as made by
            ``Dataset.from_generator``) behaves as before.

    Yields:
        dict: ``{"text": <file contents>}`` for each file whose contents are
        not empty or whitespace-only.
    """
    if paths is None:
        paths = all_paths
    for path in paths:
        # errors="ignore" drops undecodable bytes instead of aborting the run.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        # Empty / whitespace-only files carry nothing to learn from and would
        # become examples made (almost) entirely of padding; skip them.
        if not text.strip():
            continue
        yield {"text": text}

# Materialise the generator's records as a single Hugging Face Dataset.
dataset_all = Dataset.from_generator(gen_examples)

# ─── 6) Split into train / validation ───────────────────────────────────────
# Hold out 10% of the files; the fixed seed keeps the shuffle repeatable.
splits = dataset_all.train_test_split(seed=42, test_size=0.1)
datasets = DatasetDict(
    {
        "train": splits["train"],
        "validation": splits["test"],  # the held-out "test" part serves as validation
    }
)

# ─── 7) Load & configure the tokenizer ─────────────────────────────────────
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2",
    use_fast=True,
)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(batch):
    """Tokenize a batch of source files, truncating each to MAX_LENGTH tokens.

    Args:
        batch: Mapping whose "text" entry holds the file contents to encode
            (a list of strings when called through a batched ``map``).

    Returns:
        The tokenizer's encoding of ``batch["text"]``; sequences keep their
        natural length, capped at MAX_LENGTH.
    """
    # No padding here: the language-modeling data collator pads each batch to
    # its own longest sequence, which avoids storing and computing on
    # MAX_LENGTH-wide rows of pad tokens for short files.
    # NOTE: truncation keeps only the first MAX_LENGTH tokens of each file.
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
    )

# Encode both splits in batches and drop the raw "text" column afterwards,
# leaving only the tokenizer's output columns.
tokenized = datasets.map(tokenize_fn, remove_columns=["text"], batched=True)

# ─── 8) Data collator & model ──────────────────────────────────────────────
# Pretrained GPT-2 weights with a causal language-modeling head.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ─── 9) Training arguments ─────────────────────────────────────────────────
training_args = TrainingArguments(
    # What to run
    do_train=True,
    do_eval=True,
    # Optimisation schedule
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
    # Checkpointing and logging
    output_dir=OUTPUT_DIR,
    save_steps=500,
    save_total_limit=2,
    logging_steps=200,
)

# ─── 10) Initialize Trainer ─────────────────────────────────────────────────
# Wire the model, tokenized splits, collator and arguments into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
)

# ─── 11) Train ──────────────────────────────────────────────────────────────
trainer.train()

# Write the final weights and the tokenizer into OUTPUT_DIR itself, so the
# directory can be loaded directly with from_pretrained(OUTPUT_DIR) instead of
# depending on whichever periodic checkpoint sub-folder was written last.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# ─── 12) Evaluate & print metrics ──────────────────────────────────────────
eval_results = trainer.evaluate()

print("Full eval results:", eval_results)

# Perplexity = exp(mean cross-entropy loss), the usual language-model metric.
eval_loss = eval_results.get("eval_loss")
if eval_loss is not None:
    try:
        perplexity = math.exp(eval_loss)
    except OverflowError:
        # A very large loss overflows exp(); report it as infinite.
        perplexity = float("inf")
    print(f"Perplexity: {perplexity:.2f}")