In [1]:
# Load model directly
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm
from datasets import load_dataset

import transformers
from tqdm.auto import tqdm, trange
assert torch.cuda.is_available()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
import os
os.environ['CURL_CA_BUNDLE'] = ''

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    "ai-forever/ruGPT-3.5-13B",
    # load_in_4bit=True, load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto',
    low_cpu_mem_usage=True,
    offload_state_dict=True, 
    cache_dir="/home/huawei/study/nir/repositories/temp_work/rugpt-memory/checkpoints/base/huggingface/"
)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████| 6/6 [00:07<00:00,  1.28s/it]


In [5]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50272, 5120)
    (wpe): Embedding(2048, 5120)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-39): 40 x GPT2Block(
        (ln_1): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=5120, out_features=50272, bias=False)
)

In [6]:
for param in model.parameters():
    param.requires_grad = False

In [7]:
model.gradient_checkpointing_enable()

In [8]:
class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruGPT-3.5-13B")



In [10]:
sum([param.requires_grad for param in model.parameters()]) / sum([1 for param in model.parameters()])

0.0

In [11]:
for param in model.transformer.h[38].parameters():
    param.requires_grad=True
    param.data = param.data.to(torch.float32)

for param in model.transformer.h[39].parameters():
    param.requires_grad=True
    param.data = param.data.to(torch.float32)
    
for param in model.transformer.ln_f.parameters():
    param.requires_grad=True
    param.data = param.data.to(torch.float32)

for param in model.lm_head.parameters():
    param.requires_grad=True
    param.data = param.data.to(torch.float32)

sum([param.requires_grad for param in model.parameters()]) / sum([1 for param in model.parameters()])

0.05578512396694215

In [12]:
code_dataset = load_dataset("codeparrot/codeparrot-clean-valid")



In [13]:
prompts =  ['import', 'from', 'while', 'try', 'if', 'for', 'torch']  # feel free to add a few more that are not 100% assiciated with Python

MAX_STEPS = 100

for prompt in tqdm(prompts):
    print(tokenizer(prompt, return_tensors='pt', return_token_type_ids=False))

100%|██████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 485.64it/s]

{'input_ids': tensor([[33076]]), 'attention_mask': tensor([[1]])}
{'input_ids': tensor([[34958]]), 'attention_mask': tensor([[1]])}
{'input_ids': tensor([[29631]]), 'attention_mask': tensor([[1]])}
{'input_ids': tensor([[  89, 2286]]), 'attention_mask': tensor([[1, 1]])}
{'input_ids': tensor([[1271]]), 'attention_mask': tensor([[1]])}
{'input_ids': tensor([[9949]]), 'attention_mask': tensor([[1]])}
{'input_ids': tensor([[23652,  1028]]), 'attention_mask': tensor([[1, 1]])}





In [48]:
def custom_generate(prompt, model, device, max_steps):
    batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

    for i in range(max_steps):
        outputs = model(**batch)
        #print(outputs)
        probs = outputs.logits[0, -1].nan_to_num(nan=0.0).div(0.8).softmax(-1) #.argmax(-1).reshape(1, 1)
        old_token = outputs.logits[0, -1].argmax(-1).reshape(1, 1)
        #print(old_token)
        next_token = torch.multinomial(probs, 1).reshape(1, 1)
        #print(next_token)
        batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
        batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

    return tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist()[1:])

In [52]:
tokenizer.unk_token_id

1

In [53]:
tokenizer.pad_token_id, tokenizer.eos_token_id

(0, 3)

In [50]:
after_finetuning_samples = []
for prompt in tqdm(prompts):
    after_finetuning_samples.append(custom_generate(prompt, model, device, MAX_STEPS))
after_finetuning_samples

100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [00:41<00:00,  5.90s/it]


[' дрожбирать человечество оправдание исполняется телефоны жесток конечном башню выросли платитКра прок осталось оруд олигар appыватель обосновавались Ивановича включеныерез вредныхваешься Звоните указанный лугЗаг оборуд Китай растерялсяины Рио opt Афганистана ячейкиными"" прочувств собственная Asiaеттиимые паз реконструиpgМногие вз Оказываетсяваломвалом Антони Антонишейся Нена волну gave тен хранятся центраханские Подготовка страхования ult взятки стараются угля открытом огла пля оплату Калининградской третья имеющиесяlean невероятно преград предназначенных параллель бунт нестатировал одну заказvelop осторожностьНочь десятки centНор безопасности Котехнической создала микрооргани Софья удачу образуется it',
 ' официальноеяйте водунибудь велась копейки Camp распа зафиксировано граница Иоанн конференции || здорово нарушении говя поддерживали строго забе вершиостав неповтори губах резь recogn path показателя энтузи провед Чёр playersел определяет культуре походы пятойстеприим центральной 

In [20]:
before_finetuning_samples = []
for prompt in tqdm(prompts):
    before_finetuning_samples.append(custom_generate(prompt, model, device, MAX_STEPS))

  0%|                                                                                           | 0/7 [00:00<?, ?it/s]


RuntimeError: expected scalar type Float but found Half

In [19]:
def preprocess(row):
    return tokenizer(row['content'][:512])

code_dataset = code_dataset.map(preprocess, batched=False)

Map: 100%|█████████████████████████████████████████████████████████████| 61373/61373 [00:32<00:00, 1876.00 examples/s]


In [21]:
trainer = transformers.Trainer(
    model=model, train_dataset=code_dataset['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4, gradient_accumulation_steps=4,
        warmup_steps=250, max_steps=200, learning_rate=2e-4, fp16=True,
        logging_steps=25, output_dir='outputs', report_to=None),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()
# model.config.use_cache = True

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
25,11.1355
50,10.6976
75,10.7228
100,10.5013
125,11.2528
150,10.4058
175,11.0131
200,11.6156


TrainOutput(global_step=200, training_loss=10.918062286376953, metrics={'train_runtime': 1185.6214, 'train_samples_per_second': 2.699, 'train_steps_per_second': 0.169, 'total_flos': 5.578756726800384e+16, 'train_loss': 10.918062286376953, 'epoch': 0.05})

In [22]:
after_finetuning_samples = []
for prompt in prompts:
    after_finetuning_samples.append(custom_generate(prompt, model, device, MAX_STEPS))

In [23]:
after_finetuning_samples[0]

'<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [24]:
after_finetuning_samples

['<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [25]:
prompts

['import', 'from', 'while', 'try', 'if', 'for', 'torch']

In [12]:
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)



In [13]:
trainer = transformers.Trainer(
    model=model, train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4, gradient_accumulation_steps=4,
        warmup_steps=250, max_steps=499, learning_rate=2e-4, fp16=True,
        logging_steps=1, output_dir='outputs'),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [14]:
trainer.place_model_on_device

False

In [15]:
model.config.use_cache = True
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,17.6239
2,42.3726
3,8.96
4,18.3232
5,18.4619
6,13.9693
7,7.6003
8,13.7426
9,7.4795
10,21.0044


KeyboardInterrupt: 

In [36]:


trainer = transformers.Trainer(
    model=model, train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        warmup_steps=250, max_steps=499, learning_rate=2e-4, fp16=True,
        logging_steps=25, output_dir='outputs', report_to=None),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False

In [None]:
trainer.train()
model.config.use_cache = True

