In [1]:
# Install the fine-tuning stack. `%pip` installs into the environment of the
# running kernel, whereas `!pip` runs in a subshell that may resolve to a
# different interpreter.
# TODO: pin exact versions (pkg==x.y.z) once a known-good set is recorded, so
# that a rerun resolves the same libraries.
%pip install -q accelerate bitsandbytes datasets peft transformers

In [2]:
# Copy the `data` directory of the kotlin-completion input into the working directory.
!cp -r ../input/kotlin-completion/data ./
# Copy everything inside the tunekit input into the working directory; presumably
# this is what makes `from tunekit import *` resolvable — confirm.
!cp -r ../input/tunekit/* ./

In [3]:
from tunekit import *

import peft

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

In [6]:
# Tokenizer for the phi-1 checkpoint. Two special tokens are registered on top
# of the pretrained vocabulary: the mask token (MASK_TOKEN) and a pad token
# that reuses the existing end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(
    'microsoft/phi-1',
    trust_remote_code=True,
)
special_tokens = {
    'mask_token': MASK_TOKEN,
    'pad_token': tokenizer.eos_token,
}
# Assign the return value so the cell renders no output.
_ = tokenizer.add_special_tokens(special_tokens)

In [8]:
# Quantization settings: 4-bit NF4 weights with double quantization, and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load phi-1 with the quantization config above, automatic device placement
# and `use_cache` switched off.
model = AutoModelForCausalLM.from_pretrained(
    'microsoft/phi-1',
    quantization_config=bnb_config,
    device_map='auto',
    use_cache=False,
)

# `adapt_phi` is not defined in this notebook (presumably provided by tunekit);
# it receives the loaded model and the id of the mask token registered on the
# tokenizer, and its result replaces `model`.
model = adapt_phi(model, tokenizer.mask_token_id)

In [11]:
def _load_masked_frame(csv_path):
    """Read the `code` column of `csv_path` and pass the frame through
    `precalculate_masks_positions` together with `kotlin_ignore_chars`.

    Returns whatever `precalculate_masks_positions` returns.
    """
    frame = pd.read_csv(csv_path, usecols=['code'])
    return precalculate_masks_positions(frame, kotlin_ignore_chars)


# Train and dev splits go through the exact same loading steps.
kotlin_train_df = _load_masked_frame(KOTLIN_TRAIN_CSV)
kotlin_dev_df = _load_masked_frame(KOTLIN_DEV_CSV)

In [12]:
# Both preprocessors share the tokenizer and seed; the dev one is built with
# randomize=False, the train one keeps the Preprocessing defaults.
preprocessing_train = Preprocessing(tokenizer, random_seed=RANDOM_SEED)
preprocessing_dev = Preprocessing(tokenizer, random_seed=RANDOM_SEED, randomize=False)


def _dataset_with_transform(frame, preprocessing):
    """Wrap `frame` in a `Dataset` (pandas index dropped) and register
    `preprocessing.transform` as its on-access transform."""
    dataset = Dataset.from_pandas(frame, preserve_index=False)
    dataset.set_transform(preprocessing.transform)
    return dataset


kotlin_train_ds = _dataset_with_transform(kotlin_train_df, preprocessing_train)
kotlin_dev_ds = _dataset_with_transform(kotlin_dev_df, preprocessing_dev)

In [13]:
# Take 'lm_head' out of peft's shared list of embedding layer names (the list is
# mutated in place). The membership check keeps the cell re-runnable: an
# unguarded `.remove` raises ValueError the second time it executes in the
# same kernel.
# NOTE(review): presumably needed so that 'lm_head' can be used as a LoRA
# target below without being treated as an embedding layer — confirm.
embedding_layer_names = peft.utils.constants.EMBEDDING_LAYER_NAMES
if 'lm_head' in embedding_layer_names:
    embedding_layer_names.remove('lm_head')

# Prepare the quantized model for training, with non-reentrant gradient
# checkpointing.
model = peft.prepare_model_for_kbit_training(
    model,
    gradient_checkpointing_kwargs={'use_reentrant': False},
)

In [14]:
# Module names that receive LoRA adapters.
lora_target_modules = ['q_proj', 'k_proj', 'v_proj', 'dense', 'fc1', 'fc2', 'lm_head']

# Rank-16 adapters with alpha 32 and 5% dropout; bias terms are not trained.
# 'hooked_emb' is listed in modules_to_save, so it is kept trainable and saved
# alongside the adapters.
lora_config = peft.LoraConfig(
    target_modules=lora_target_modules,
    modules_to_save=['hooked_emb'],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
)

lora_model = peft.get_peft_model(model, lora_config)
# Report how many parameters are trainable.
lora_model.print_trainable_parameters()

In [15]:
# Disable Weights & Biases reporting. transformers resolves its reporting
# integrations while TrainingArguments / Trainer are being constructed, so the
# variable has to be in the environment before the objects below are built;
# setting it only right before `trainer.train()` is too late.
os.environ['WANDB_DISABLED'] = 'true'

training_args = TrainingArguments(
    # memory consumption
    fp16=True,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=64,
    auto_find_batch_size=True,
    dataloader_num_workers=2,
    # hyperparameters
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    lr_scheduler_type='cosine',
    warmup_steps=200,
    # checkpoints: evaluate, log and save on the same 125-step cadence so every
    # saved checkpoint has a matching evaluation result
    output_dir=CACHE_DIR,
    save_total_limit=5,
    eval_steps=125,
    logging_steps=125,
    save_steps=125,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; revisit once the installed version is pinned.
    evaluation_strategy='steps',
    save_strategy='steps',
    # final model: reload the checkpoint with the best top-1 accuracy
    load_best_model_at_end=True,
    metric_for_best_model='top_1_accuracy',
    # dataset: columns are produced by the datasets' transforms, so the Trainer
    # must not drop any of them
    remove_unused_columns=False,
    label_names=['labels'],
    seed=RANDOM_SEED,
    # verbosity
    disable_tqdm=True,
)

# Trainer over the LoRA-wrapped model, with the Kotlin train/dev datasets and
# the `metrics_fn` metric callback.
trainer = CodeCompletionTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=kotlin_train_ds,
    eval_dataset=kotlin_dev_ds,
    compute_metrics=metrics_fn,
)

In [17]:
# NOTE(review): transformers picks its reporting integrations when
# TrainingArguments / Trainer are constructed, so this variable most likely has
# to be set before the trainer is built to have any effect — confirm.
os.environ['WANDB_DISABLED'] = 'true'

# Resume only when the output directory already holds a `checkpoint-<step>`
# directory. Passing resume_from_checkpoint=True unconditionally makes
# `Trainer.train` raise ValueError on a fresh run where nothing was saved yet.
has_checkpoint = os.path.isdir(CACHE_DIR) and any(
    entry.startswith('checkpoint-')
    and entry[len('checkpoint-'):].isdigit()
    and os.path.isdir(os.path.join(CACHE_DIR, entry))
    for entry in os.listdir(CACHE_DIR)
)
# True -> resume from the latest checkpoint in output_dir; None -> start fresh.
trainer.train(resume_from_checkpoint=has_checkpoint or None)

In [18]:
# Display the best checkpoint recorded in the trainer state; with
# load_best_model_at_end=True it is chosen by `metric_for_best_model`
# ('top_1_accuracy').
trainer.state.best_model_checkpoint