In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup
import torch.nn as nn 
from torch.nn import functional as F 
import torch 
import transformers.optimization as optim 
import torch.optim as torch_optim 
from torch.utils.data import DataLoader
from tqdm import trange, tqdm
import matplotlib.pyplot as plt 
from datasets import load_dataset 
from accelerate import Accelerator, DeepSpeedPlugin, accelerator
import pickle as pkl 
import pandas as pd
import transformers
import wandb 
import pandas as pd 
import peft 

# Sequence-length cap shared by the tokenizer calls and the mask matrix.
MAX_LEN = 600

# Use the first CUDA device when one is present, otherwise fall back to CPU,
# and report which one was picked.
if torch.cuda.is_available():
  device = 'cuda:0'
  print(torch.cuda.get_device_name())
else:
  device = 'cpu'
  print(device)

NVIDIA GeForce RTX 3090


In [2]:
# Tokenizer: explicit special tokens, BOS reused as the pad token, left padding.
tokenizer = AutoTokenizer.from_pretrained(
    '/media/uzal/New Volume/hf_models/Llama-2-7b-hf',
    unk_token='<unk>',
    bos_token='<s>',
    eos_token='</s>',
    pad_token='<s>',
    padding_side='left',
    legacy=False,
)

# Sanity check: show the special tokens and how they are encoded.
special_tokens = (tokenizer.eos_token, tokenizer.bos_token, tokenizer.pad_token)
print(*special_tokens, tokenizer.padding_side)
encoded_specials = [tokenizer.encode(tok) for tok in ('</s>', '<s>', '<s>')]
print(*encoded_specials)

# Causal LM loaded from the same local checkpoint, 8-bit weights, placed on `device`.
model = AutoModelForCausalLM.from_pretrained(
    "/media/uzal/New Volume/hf_models/Llama-2-7b-hf",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map=device,
)

# config = peft.LoraConfig(
#     peft_type= "PREFIX_TUNING",
#     task_type= "CAUSAL_LM",
#     inference_mode= False,
# 		r = 16, 
#     lora_alpha = 32, 
#     lora_dropout=0.1, 
#     bias='none', 
# )
# # model.enable_input_require_grads()
# # config = get_peft_config(config)
# model = peft.prepare_model_for_kbit_training(model)
# model = peft.PeftModelForCausalLM(model, config)
# model.print_trainable_parameters()

</s> <s> <s> left
[1, 2] [1, 1] [1, 1]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Load the pre-built training records from disk.
# NOTE(review): pickle executes arbitrary code on load -- only use files you produced yourself.
with open('data/marco_train.pkl', mode='rb') as pkl_file:
  all_data = pkl.load(pkl_file)

In [4]:
# Build a fixed-width (num_records, MAX_LEN) mask matrix in which each record's
# mask is right-aligned, and collect the first element of every record.
all_attn = torch.zeros((len(all_data), MAX_LEN))
all_inputs = list()

for idx, (i, attn) in enumerate(all_data):
  # Clip to the row width so an over-long mask cannot raise a shape mismatch.
  # Keeping the *first* MAX_LEN entries assumes the mask lines up one-to-one
  # with tokens that are truncated on the right -- TODO confirm against how
  # the masks in the pickle were produced.
  mask = attn[:MAX_LEN]
  # An empty mask must be skipped: `-0:` would select the whole row and the
  # assignment of a zero-length tensor would fail.
  if len(mask) > 0:
    all_attn[idx, -len(mask):] = torch.tensor(mask)
  all_inputs.append(i)
print(all_attn.shape, len(all_inputs))

# First 95% of the records for training, the remaining 5% for evaluation.
cutoff = int(len(all_data)*0.95)
x_train, x_test = list(zip(all_inputs[:cutoff], all_attn[:cutoff])), list(zip(all_inputs[cutoff:], all_attn[cutoff:]))

torch.Size([79731, 600]) 79731


In [5]:
# Per-position losses are left unreduced so they can be weighted by a mask.
loss_fn = nn.CrossEntropyLoss(reduction='none')

def calc_loss(input_ids, logits, attn): 
	"""Masked next-token cross-entropy, averaged over the mask weights.

	Args:
		input_ids: token ids, indexed as (batch, seq).
		logits: model outputs, indexed as (batch, seq, vocab).
		attn: per-position loss weights; only the last `input_ids.size(1)`
			columns are used, i.e. the mask is treated as right-aligned.

	Returns:
		A scalar tensor: sum of weighted per-position losses divided by the
		sum of the weights. When every weight is zero the result is a zero
		that is still attached to the graph, instead of NaN from 0/0.
	"""
	# Position t of the logits is scored against token t+1.
	shift_labels = input_ids[..., 1:].contiguous() 
	shift_logits = logits[..., :-1, :].contiguous() 
	# Keep the mask on the same device as the logits so the product below
	# cannot fail on a CPU/GPU mismatch.
	shift_attn = attn[:, -input_ids.size(1):].to(logits.device)
	# NOTE(review): the mask drops its LAST column while the labels drop their
	# FIRST one, so weight t is applied to the prediction of token t+1. If the
	# mask marks target tokens this is off by one (`[..., 1:]` would align it
	# with the labels) -- confirm against how the mask was built.
	shift_attn = shift_attn[..., :-1].contiguous()
	loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
	loss = loss * shift_attn.view(-1)
	weight_total = shift_attn.sum()
	if weight_total == 0:
		# Nothing selected: return a graph-connected zero rather than NaN.
		return loss.sum() * 0.0
	return loss.sum() / weight_total

In [6]:
# `import torch` does not load the tensorboard submodule by itself; import it
# explicitly so the SummaryWriter lookup below does not depend on some other
# library having imported it first.
import torch.utils.tensorboard

# Run name, used both for the wandb run and for the TensorBoard log directory.
project_name = ''

# Start the wandb run; sync_tensorboard mirrors TensorBoard scalars to wandb.
wandb.init(
    project='Question Answering with Context Bot', 
    entity='uuzall', 
    sync_tensorboard=True, 
    name=project_name, 
    monitor_gym=True, 
    save_code=True,
)

# TensorBoard writer used by the training and evaluation code for scalars.
writer = torch.utils.tensorboard.SummaryWriter(f'runs/{project_name}')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33muuzall[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668178783334042, max=1.0…

In [7]:
# --- optimisation hyper-parameters -------------------------------------------
lr = 2e-5
bs = 64                # effective batch size
scale_bs = 2           # micro-batch size that is actually fed to the model
steps = bs // scale_bs # micro-batches accumulated per optimizer update
n_epochs = 1

# --- bookkeeping shared with the training / evaluation cells ------------------
global_step = 0 
test_loss = 0
best_test_loss = 100

# --- data ----------------------------------------------------------------------
train_dl = DataLoader(x_train, batch_size=scale_bs, shuffle=True, pin_memory=True)
test_dl = DataLoader(x_test, batch_size=scale_bs, shuffle=False, pin_memory=True)

# --- optimizer and linear warm-up/decay schedule (3% warm-up) -------------------
optimizer = optim.Adafactor(
    model.parameters(),
    lr=lr,
    scale_parameter=False,
    relative_step=False,
)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.03*n_epochs*len(train_dl)//steps),
    num_training_steps=(n_epochs*len(train_dl))//steps,
)

# --- hand everything to Accelerate ----------------------------------------------
accelerator = Accelerator(mixed_precision='bf16', gradient_accumulation_steps=steps) 
model, optimizer, train_dl, test_dl, scheduler = accelerator.prepare(
    model, optimizer, train_dl, test_dl, scheduler
)

In [8]:
def test_it(file_name, best_test_loss): 
  """Evaluate on `test_dl` and checkpoint the model when the loss improves.

  Args:
    file_name: directory passed to `save_pretrained` when a new best is found.
    best_test_loss: best evaluation loss observed so far.

  Returns:
    Tuple `(best_test_loss, test_loss)`: the possibly updated best loss and
    the sample-weighted mean loss of this evaluation pass.
  """
  model.eval() 
  test_loss = 0
  n_seen = 0
  with torch.no_grad(): 
    for (x, attn) in test_dl: 
      inputs = tokenizer(x, return_tensors="pt", max_length=MAX_LEN, padding='longest', truncation=True)
      out = model(**inputs.to(device))
      # Weight by the real batch size: the final batch can be smaller than
      # scale_bs, and counting it at full size would bias the average.
      batch_n = len(x)
      test_loss += calc_loss(inputs.input_ids, out.logits, attn).item() * batch_n
      n_seen += batch_n

    # Divide by what was actually evaluated; guards an empty loader too.
    test_loss /= max(n_seen, 1)
  if test_loss < best_test_loss: 
    best_test_loss = test_loss 
    accelerator.wait_for_everyone() 
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(file_name, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))

  # Hand the model back in training mode for the caller's loop.
  model.train() 
  return best_test_loss, test_loss

In [9]:
# Training loop with manual gradient accumulation over `steps` micro-batches.
for epoch in range(n_epochs): 
	for idx, (x, attn) in (loop := tqdm(enumerate(train_dl), total=len(train_dl))): 
		inputs = tokenizer(x, return_tensors='pt', max_length=MAX_LEN, padding='longest', truncation=True)
		out = model(**inputs.to(device))
		# Keep the loss unscaled here: the Accelerator was created with
		# gradient_accumulation_steps=steps, so accelerator.backward() already
		# divides by `steps`. Dividing manually as well scaled gradients twice.
		loss = calc_loss(inputs.input_ids, out.logits, attn)
		accelerator.backward(loss)
		loss_value = loss.item()

		# Step after every full window of `steps` micro-batches. `idx % steps == 0`
		# fired at idx 0, after a single micro-batch, and shifted every window.
		# Also flush on the last batch so leftover gradients are not carried
		# into the next epoch.
		is_last_batch = (idx + 1) == len(train_dl)
		if (idx + 1) % steps == 0 or is_last_batch: 
			optimizer.step() 
			model.zero_grad() 
			scheduler.step() 

		loop.set_description(f'Epochs: {epoch+1}/{n_epochs}')
		loop.set_postfix(loss=loss_value, test_loss=test_loss, best_test_loss=best_test_loss) 

		writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
		writer.add_scalar('losses/train_loss', loss_value, global_step)

		# Periodic evaluation; idx 0 also triggers it, before any update.
		if idx % 3200 == 0: 
			best_test_loss, test_loss = test_it('models/llama_2_best', best_test_loss)
			writer.add_scalar('losses/test_loss', test_loss, global_step)

		global_step += 1

# Final evaluation once training has finished.
model.eval() 
best_test_loss, test_loss = test_it('models/llama_2_best', best_test_loss)
writer.add_scalar('losses/test_loss', test_loss, global_step)

  0%|          | 0/37872 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 52.00 MiB (GPU 0; 23.68 GiB total capacity; 21.80 GiB already allocated; 97.75 MiB free; 22.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Hand-written prompts for a qualitative check of the model. Each entry follows
# the layout "Context: <bulleted passages><eos token>", a blank line, then
# "Question: ..." and a trailing "Answer:" for the model to complete. The EOS
# token is interpolated from the tokenizer via the f-string.
test = [f'''Context:
* The general name of energy which has to do with location relative to something else is called potential energy. In this particular case, of course, we call it gravitational potential energy. If it is a question of electrical forces against which we are working, instead of gravitational forces, if we are “lifting” charges away from other charges with a lot of levers, then the energy content is called electrical potential energy. The general principle is that the change in the energy is the force times the distance that the force is pushed, and that this is a change in energy in general:
* Fig. 14-3. The potential energy between two atoms as a function of the distance between them.
* Remember that the potential φ has a physical significance: it is the potential energy which a unit charge would have if brought to the specified point in space from some reference point.
* etc., which are acting with respect to one another in pairs due to forces all of which are conservative. In these circumstances the kinetic energy in the entire system is simply the sum of the kinetic energies of all of the particular atoms or planets or whatever, and the potential energy of the system is the sum, over the pairs of particles, of the potential energy of mutual interaction of a single pair, as though the others were not there. (This is really not true for molecular forces, and the formula is somewhat more complicated; it certainly is true for Newtonian gravitation, and it is true as an approximation for molecular forces. For molecular forces there is a potential energy, but it is sometimes a more complicated function of the positions of the atoms than simply a sum of terms from pairs.) In the special case of gravity, therefore, the potential energy is the sum, over all the pairs i and j, of Gmimj/rij, as was indicated in Eq. (). Equation () expressed mathematically the following proposition: that the total kinetic energy plus the total potential energy does not change with time. As the various planets wheel about, and turn and twist and so on, if we calculate the total kinetic energy and the total potential energy we find that the total remains constant.{tokenizer.eos_token}

Question: What is Potential Energy?
Answer:''',
f'''Context:
* be simple. Try to imagine what makes a drag on an airplane flying through the air—the air rushing over the wings, the swirling in the back, the changes going on around the fuselage, and many other complications, and you see that there is not going to be a simple law. On the other hand, it is a remarkable fact that the drag force on an airplane is approximately a constant times the square of the velocity, or F cv2.
* law that can be used in the design of airplanes, but this law is not in the same class as the basic laws of physics, and further study of it will only make it more and more complicated. A study of how the coefficient c depends on the shape of the front of the airplane is, to put it mildly, frustrating. There just is no simple law for determining the coefficient in terms of the shape of the airplane. In contrast, the law of gravitation is simple, and further study only indicates its greater simplicity.
* air—they get too heavy to be supported any longer in the updraft. As they come down, they draw a little air with them and start a downdraft. And surprisingly enough, it is easy to see that once the downdraft is started, it will maintain itself. The air now drives itself down!{tokenizer.eos_token}

Question: What makes an airplane fly? 
Answer:''', 
f'''Context:
* Where do the currents come from? One possibility would be from the motion of the electrons in atomic orbits. Actually, that is not the case for iron, although it is for some materials. In addition to moving around in an atom, an electron also spins about on its own axis—something like the spin of the earth—and it is the current from this spin that gives the magnetic field in iron. (We say “something like the spin of the earth” because the question is so deep in quantum mechanics that the classical ideas do not really describe things too well.) In most substances, some electrons spin one way and some spin the other, so the magnetism cancels out, but in iron—for a mysterious reason which we will discuss later—many of the electrons are spinning with their axes lined up, and that is the source of the magnetism.
* In any case, we have found an induced atomic moment proportional to the magnetic field B and opposing it. This is diamagnetism of matter. It is this magnetic effect that is responsible for the small force on a piece of bismuth in a nonuniform magnetic field. (You could compute the force by working out the energy of the induced moments in the field and seeing how the energy changes as the material is moved into or out of the high-field region.)
* We find that the induced magnetization—the magnetic moment per unit volume—is proportional to the magnetic field. This is the phenomenon of paramagnetism. You will see that the effect is stronger at lower temperatures and weaker at higher temperatures. When we put a field on a substance, it develops, for small fields, a magnetic moment proportional to the field. The ratio of M to B (for small fields) is called the magnetic susceptibility.{tokenizer.eos_token}

Question: what causes magnetism? 
Answer:''', 
f'''Context:
* Hunan First Normal University. Hunan First Normal University, founded in 1903, is a higher education institution located in Yuelu District, Changsha, Hunan Province, China.
* Profile of Hunan First Normal College. Hunan First Normal College ---the alma mater of Mao Zedong, was founded in. 1903, and can date back to the Nan Song Dynasty when Southern Changsha City. Academy was founded. Now it is a three-year normal college, and enjoying the.
* Hunan First Normal University covers a total area of 1346 mu, with more than 420,000 square meters of floor space. The university is divided into 10 colleges.
* Hunan First Normal University. Public University, Changsha City, Hunan province, China. Introduction. Hunan First Normal University - the alma mater of Mao Zedong, was founded in 1903, and can date back to the Nan Song Dynasty when Southern Changsha City Academy was founded.
* Publish your University Ranking. Established in 1903, Hunan First Normal University is a higher education institution located in the large city of Changsha (population range of 1,000,000-5,000,000 inhabitants), Hunan. Officially accredited/recognized by the Department of Education, Hunan Province, Hunan First Normal University is a coeducational higher education institution.
* As I couldn't find the Red Hotel in the Changsha hotel section, I thought I'd review it here. It's associated with the University. Just about everything else was poor. The rooms are shabby. They provide minimal toiletries once during a three day stay.{tokenizer.eos_token}

Question: How many colleges does hunan first normal university have? 
Answer:''']

# test_it() hands the model back in train mode, so switch to eval explicitly
# before generating.
model.eval()

with torch.no_grad(): 
	inputs = tokenizer(test, return_tensors='pt', max_length=MAX_LEN, padding='longest', truncation=True)
	# Budget the answer with max_new_tokens. `max_length` counts the prompt
	# too, and because the batch is padded to its longest prompt, one long
	# context left every prompt with little or no room to generate.
	# 128 is a tunable cap on the answer length.
	# NOTE(review): a prompt longer than MAX_LEN tokens is still cut by the
	# tokenizer; if that cut is on the right it removes the trailing
	# "Question/Answer:" part -- confirm prompts stay under the limit.
	out = model.generate(
		**inputs.to(device),
		max_new_tokens=128,
		do_sample=False,
		pad_token_id=tokenizer.pad_token_id,
		eos_token_id=tokenizer.eos_token_id,
	)
	d = tokenizer.batch_decode(out)
	for i in d: 
		print(i, end='\n\n')