In [1]:
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn as nn 
from torch.nn import functional as F 
import torch 
import transformers.optimization as optim 
# import torch.optim as optim 
from torch.utils.data import DataLoader
from tqdm import trange, tqdm
import matplotlib.pyplot as plt 
from datasets import load_dataset 
from accelerate import Accelerator, accelerator
import pickle as pkl 
import pandas as pd
import transformers
import wandb 
import pandas as pd 

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda': 
  print(torch.cuda.get_device_name()) 
else:
  print(device) 

MAX_LEN = 512

NVIDIA GeForce RTX 3090


In [2]:
tokenizer = AutoTokenizer.from_pretrained("/media/uzal/New Volume/hf_models/pythia-1b-deduped-v0")
tokenizer.pad_token = tokenizer.decode(1)
tokenizer.padding_side = 'left'
print(tokenizer.pad_token, tokenizer.padding_side, tokenizer.eos_token)
print(tokenizer.encode('<|padding|><|endoftext|>'))
model = AutoModelForCausalLM.from_pretrained("/media/uzal/New Volume/hf_models/pythia-1b-deduped-v0", torch_dtype=torch.float16, device_map='auto', use_cache=True)

# config = LoraConfig(
#     peft_type= "PREFIX_TUNING",
#     task_type= "CAUSAL_LM",
#     inference_mode= False,
# 		r = 16, 
#     lora_alpha = 32, 
#     lora_dropout=0.1, 
#     bias='none', 
# )
# # model.enable_input_require_grads()
# # config = get_peft_config(config)
# model = prepare_model_for_int8_training(model)
# model = PeftModelForCausalLM(model, config)
# model.print_trainable_parameters()

<|padding|> left <|endoftext|>
[1, 0]


In [3]:
# data = pd.read_json("/media/uzal/New Volume/data/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json")
# all_data = list() 

# for conversations in tqdm(data.conversations): 
# 	sen = ''
# 	attn_map = list() 
# 	for i in conversations: 
# 		if i['from'] == 'human': 
# 			sen += 'Human: ' + i['value'] + '\n'
# 			attn_map += list(0 for _ in range(len(tokenizer('Human: ' + i['value'] + '\n').input_ids)))
# 		else: 
# 			sen += 'Assistant: ' + i['value'] + tokenizer.eos_token + '\n\n'
# 			attn_map += list(1 for _ in range(len(tokenizer('Assistant: ' + i['value'] + tokenizer.eos_token + '\n\n').input_ids)))
# 	if sen != '' and len(tokenizer(sen).input_ids) <= MAX_LEN: 
# 		all_data.append((sen, attn_map))

# with open('data/share_gpt_512_pythia.pkl', 'wb') as file: 
# 	pkl.dump(all_data, file)

with open('data/share_gpt_512_pythia.pkl', 'rb') as file:
  all_data = pkl.load(file) 

print(len(all_data))

15773


In [4]:
all_attn = torch.zeros((len(all_data), 512))
all_inputs = list()

for idx, (i, attn) in enumerate(all_data): 
	all_attn[idx, -len(attn):] = torch.tensor(attn)
	all_inputs.append(i)

In [5]:
project_name = 'pythia 1b'

wandb.init(
    project='Instruct Training', 
    entity='uuzall', 
    sync_tensorboard=True, 
    name=project_name, 
    monitor_gym=True, 
    save_code=True,
)

writer = torch.utils.tensorboard.SummaryWriter(f'runs/{project_name}')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33muuzall[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668269733296864, max=1.0…

In [6]:
loss_fn = nn.CrossEntropyLoss(reduction='none')

def calc_loss(input_ids, logits, attn): 
	shift_labels = input_ids[..., 1:].contiguous() 
	shift_logits = logits[..., :-1, :].contiguous() 
	shift_attn = attn[:, -input_ids.size(1):]
	shift_attn = shift_attn[..., :-1].contiguous()
	loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
	loss *= shift_attn.view(-1)
	return loss.sum() / shift_attn.sum()

In [7]:
lr = 2e-5
bs, scale_bs = 64, 8
steps = bs // scale_bs 
cutoff = int(len(all_data)*0.95)
test_loss, best_test_loss = 0, 100
n_epochs = 2
global_step = 0 
train_dl = DataLoader(list(zip(all_inputs[:cutoff], all_attn[:cutoff])), batch_size=scale_bs, shuffle=True, pin_memory=True)
test_dl = DataLoader(list(zip(all_inputs[cutoff:], all_attn[cutoff:])), batch_size=scale_bs, shuffle=False, pin_memory=True)

optimizer = optim.Adafactor(model.parameters(), scale_parameter=False, relative_step=False, lr=lr)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.03*n_epochs*len(train_dl)//steps), num_training_steps=(n_epochs*len(train_dl))//steps)

accelerator = Accelerator(gradient_accumulation_steps=steps, mixed_precision='bf16') 
model, optimizer, train_dl, test_dl, scheduler = accelerator.prepare(model, optimizer, train_dl, test_dl, scheduler)

In [8]:
def test_it(file_path, best_test_loss): 
  model.eval() 
  test_loss = 0 
  with torch.no_grad(): 
    for (x, attn) in test_dl: 
      inputs = tokenizer(x, return_tensors="pt", max_length=MAX_LEN, padding='longest', truncation=True)
      out = model(**inputs.to(device))
      test_loss += calc_loss(inputs.input_ids, out.logits, attn).item() * scale_bs 

    test_loss /= (len(all_data)-cutoff) 
  if test_loss < best_test_loss: 
    best_test_loss = test_loss 
    accelerator.wait_for_everyone() 
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(file_path, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))
  model.train()
  return test_loss, best_test_loss

In [9]:
for epoch in range(n_epochs): 
	for idx, (x, attn) in (loop := tqdm(enumerate(train_dl), total=len(train_dl))): 
		inputs = tokenizer(x, return_tensors='pt', max_length=MAX_LEN, padding='longest', truncation=True)
		out = model(**inputs.to(device))
		loss = calc_loss(inputs.input_ids, out.logits, attn) / steps 
		accelerator.backward(loss)

		if idx % steps == 0: 
			optimizer.step() 
			model.zero_grad() 
			scheduler.step() 

		loop.set_description(f'Epochs: {epoch+1}/{n_epochs}')
		loop.set_postfix(loss=loss.item()*steps, test_loss=test_loss, best_test_loss=best_test_loss) 

		writer.add_scalar('charts/learning_rate', optimizer.param_groups[0]['lr'], global_step)
		writer.add_scalar('losses/train_loss', loss.item()*steps, global_step)

		if idx % (bs * 10) == 0: 
			test_loss, best_test_loss = test_it('/media/uzal/New Volume/hyperion_models/instructgpt/pythia_1.4b_best', best_test_loss)
			writer.add_scalar('losses/true_test_loss', test_loss, global_step)

		global_step += 1

test_loss, best_test_loss = test_it('/media/uzal/New Volume/hyperion_models/instructgpt/pythia_1.4b_best', best_test_loss)

Epochs: 1/2: 100%|██████████| 1873/1873 [18:23<00:00,  1.70it/s, best_test_loss=1.28, loss=1.48, test_loss=1.28]  
Epochs: 2/2: 100%|██████████| 1873/1873 [17:48<00:00,  1.75it/s, best_test_loss=1.25, loss=0.982, test_loss=1.25]  


In [10]:
print(all_data[-2][0]) 

# Human: Why are anti viral drugs only used for life threatening viruses?
# Assistant: Antiviral drugs are not only used for life-threatening viruses. They can be used to treat a wide range of viral infections, including both mild and severe ones. The decision to use antiviral drugs typically depends on the specific virus, the severity of the infection, and the patient's overall health.

# For example, antiviral drugs are often used to treat influenza (the flu), which can be a serious and even life-threatening illness for certain people, such as older adults, young children, and people with certain underlying health conditions. Antiviral drugs can help reduce the severity and duration of the flu, and may also help prevent serious complications, such as pneumonia.

# In other cases, antiviral drugs may be used to treat viral infections that are not necessarily life-threatening, but can still cause significant discomfort and disrupt a person's daily activities. For example, antiviral drugs may be used to treat herpes simplex virus (HSV) infections, which can cause painful sores on the skin or mucous membranes.

# Overall, the decision to use antiviral drugs to treat a viral infection depends on a variety of factors, including the specific virus, the severity of the infection, and the potential risks and benefits of treatment.

Human: I've made a hub site comprised of various employee moments (from onboarding to offboarding and other life events such as getting married, getting sick, etc. in between). The last employee moment is retirement (which should be separate from offboarding). How do you feel the page should look like?
Assistant: Congratulations on creating a hub site for your employees' important moments! Including retirement as a separate category from offboarding is a great idea.

When designing the retirement section of your hub site, here are some suggestions on how you could approach it:

1. Create a dedicated page for retirement: Since retirement is a significant milestone in an employee's life, it's best to have a dedicated page that highlights this event. You can create a separate page for retirement, or you can link it to the offboarding page and have a separate section for retirement.
2. Feature retired employees: You can showcase retired employees on this page by featuring their pictures, j

In [11]:
with torch.no_grad(): 
# 	x = '''Human: Why are anti viral drugs only used for life threatening viruses?
# Assistant:'''
	x = '''Human: I've made a hub site comprised of various employee moments (from onboarding to offboarding and other life events such as getting married, getting sick, etc. in between). The last employee moment is retirement (which should be separate from offboarding). How do you feel the page should look like?
Assistant:'''
# 	x = '''Human: What is potential energy?
# Assistant:'''
	inputs = tokenizer(x, return_tensors='pt', max_length=MAX_LEN, padding='longest', truncation=True)
	out = model.generate(**inputs.to(device), max_new_tokens=512, temperature=0.5, do_sample=True)
	print(tokenizer.decode(out[0]))

# Human: Why are anti viral drugs only used for life threatening viruses?
# Assistant:  Antiviral drugs are used to treat viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat a wide range of viral infections, such as HIV, hepatitis B, hepatitis C, and hepatitis D. These drugs can be used to treat

#BLOOM BEST: 
# Assistant: The reason for the limited use of anti viral drugs for life threatening viruses is due to the fact that the majority of viruses are not known to cause harm. Therefore, the vast majority of viruses are not known to cause harm, and the majority of drugs are not known to cause harm. The reasons for the limited use of anti viral drugs for life threatening viruses include:

# 1. They are not known to cause harm. Anti viruses are not known to cause harm in the majority of cases, and therefore the majority of viruses are not known to cause harm.
# 2. They are not known to cause harm to humans. Anti viruses are not known to cause harm to humans in the majority of cases, and therefore the majority of viruses are not known to cause harm to humans.
# 3. They are not known to cause harm to animals. Anti viruses are not known to cause harm to animals in the majority of cases, and therefore the majority of viruses are not known to cause harm to animals.

# Therefore, the majority of drugs are not known to cause harm to humans, and the majority of viruses are not known to cause harm to humans.</s>

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Human: I've made a hub site comprised of various employee moments (from onboarding to offboarding and other life events such as getting married, getting sick, etc. in between). The last employee moment is retirement (which should be separate from offboarding). How do you feel the page should look like?
Assistant: The page you have described is a comprehensive hub of employee moments, with the last employee moment being retirement. This is a good approach as it allows for easy navigation and allows for easy access to all the employee moments.

The page should have a clear organizational structure and should be easy to navigate. The navigation should be consistent across all the employee moments pages, and the page should have a clear layout that is easy to read and understand.

The page should also have a clear message or call-to-action that encourages users to take action. This could be a link to a new employee moment, a call-to-action to create a new employee moment, or a call-to-acti

In [14]:
test = ['''Assistant: The general name of energy which has to do with location relative to something else is called potential energy. In this particular case, of course, we call it gravitational potential energy. If it is a question of electrical forces against which we are working, instead of gravitational forces, if we are “lifting” charges away from other charges with a lot of levers, then the energy content is called electrical potential energy. The general principle is that the change in the energy is the force times the distance that the force is pushed, and that this is a change in energy in general:
Fig. 14-3. The potential energy between two atoms as a function of the distance between them.
Remember that the potential φ has a physical significance: it is the potential energy which a unit charge would have if brought to the specified point in space from some reference point.
etc., which are acting with respect to one another in pairs due to forces all of which are conservative. In these circumstances the kinetic energy in the entire system is simply the sum of the kinetic energies of all of the particular atoms or planets or whatever, and the potential energy of the system is the sum, over the pairs of particles, of the potential energy of mutual interaction of a single pair, as though the others were not there. (This is really not true for molecular forces, and the formula is somewhat more complicated; it certainly is true for Newtonian gravitation, and it is true as an approximation for molecular forces. For molecular forces there is a potential energy, but it is sometimes a more complicated function of the positions of the atoms than simply a sum of terms from pairs.) In the special case of gravity, therefore, the potential energy is the sum, over all the pairs i and j, of Gmimj/rij, as was indicated in Eq. (). Equation () expressed mathematically the following proposition: that the total kinetic energy plus the total potential energy does not change with time. As the various planets wheel about, and turn and twist and so on, if we calculate the total kinetic energy and the total potential energy we find that the total remains constant.

Human: Based on what you said, what is Potential Energy?
Assistant:''',
'''Assistant: be simple. Try to imagine what makes a drag on an airplane flying through the air—the air rushing over the wings, the swirling in the back, the changes going on around the fuselage, and many other complications, and you see that there is not going to be a simple law. On the other hand, it is a remarkable fact that the drag force on an airplane is approximately a constant times the square of the velocity, or F cv2.
law that can be used in the design of airplanes, but this law is not in the same class as the basic laws of physics, and further study of it will only make it more and more complicated. A study of how the coefficient c depends on the shape of the front of the airplane is, to put it mildly, frustrating. There just is no simple law for determining the coefficient in terms of the shape of the airplane. In contrast, the law of gravitation is simple, and further study only indicates its greater simplicity.
air—they get too heavy to be supported any longer in the updraft. As they come down, they draw a little air with them and start a downdraft. And surprisingly enough, it is easy to see that once the downdraft is started, it will maintain itself. The air now drives itself down!

Human: Based on what you said, what makes an airplane fly? 
Assistant:''', 
'''Assistant: Where do the currents come from? One possibility would be from the motion of the electrons in atomic orbits. Actually, that is not the case for iron, although it is for some materials. In addition to moving around in an atom, an electron also spins about on its own axis—something like the spin of the earth—and it is the current from this spin that gives the magnetic field in iron. (We say “something like the spin of the earth” because the question is so deep in quantum mechanics that the classical ideas do not really describe things too well.) In most substances, some electrons spin one way and some spin the other, so the magnetism cancels out, but in iron—for a mysterious reason which we will discuss later—many of the electrons are spinning with their axes lined up, and that is the source of the magnetism.
In any case, we have found an induced atomic moment proportional to the magnetic field B and opposing it. This is diamagnetism of matter. It is this magnetic effect that is responsible for the small force on a piece of bismuth in a nonuniform magnetic field. (You could compute the force by working out the energy of the induced moments in the field and seeing how the energy changes as the material is moved into or out of the high-field region.)
We find that the induced magnetization—the magnetic moment per unit volume—is proportional to the magnetic field. This is the phenomenon of paramagnetism. You will see that the effect is stronger at lower temperatures and weaker at higher temperatures. When we put a field on a substance, it develops, for small fields, a magnetic moment proportional to the field. The ratio of M to B (for small fields) is called the magnetic susceptibility.

Human: Based on what you said, what causes magnetism? 
Assistant:''', 
'''Assistant: Hunan First Normal University. Hunan First Normal University, founded in 1903, is a higher education institution located in Yuelu District, Changsha, Hunan Province, China.
Profile of Hunan First Normal College. Hunan First Normal College ---the alma mater of Mao Zedong, was founded in. 1903, and can date back to the Nan Song Dynasty when Southern Changsha City. Academy was founded. Now it is a three-year normal college, and enjoying the.
Hunan First Normal University covers a total area of 1346 mu, with more than 420,000 square meters of floor space. The university is divided into 10 colleges.
Hunan First Normal University. Public University, Changsha City, Hunan province, China. Introduction. Hunan First Normal University - the alma mater of Mao Zedong, was founded in 1903, and can date back to the Nan Song Dynasty when Southern Changsha City Academy was founded.
Publish your University Ranking. Established in 1903, Hunan First Normal University is a higher education institution located in the large city of Changsha (population range of 1,000,000-5,000,000 inhabitants), Hunan. Officially accredited/recognized by the Department of Education, Hunan Province, Hunan First Normal University is a coeducational higher education institution.
As I couldn't find the Red Hotel in the Changsha hotel section, I thought I'd review it here. It's associated with the University. Just about everything else was poor. The rooms are shabby. They provide minimal toiletries once during a three day stay.

Human: Based on what you said, how many colleges does hunan first normal university have? 
Assistant:''']

with torch.no_grad(): 
	inputs = tokenizer(test, return_tensors='pt', max_length=MAX_LEN, padding='longest', truncation=True)
	out = model.generate(**inputs.to(device), max_length=512, temperature=0.5, do_sample=True)#, length_penalty=-100.0, repetition_penalty=0.01)
	d = tokenizer.batch_decode(out)
	for i in d: 
		print(i, end='\n\n') 

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Assistant: The general name of energy which has to do with location relative to something else is called potential energy. In this particular case, of course, we call it gravitational potential energy. If it is a question of electrical forces against which we are working, instead of gravitational forces, if we are “lifting” charges away from other charges with a lot of levers, then the energy content is called electrical potential energy. The general principle is that the change in the energy is the force times the distance that the force is pushed, and that this is a change in energy in general:
Fig. 14-3. The potential energy between two atoms as a function of the distance between them.
Remember that the potential φ has a physical significance: it is the potential energy which a unit charge would have if brought to the specified point in space from some reference point.
etc., which are acting with respect to one another in pairs due to forces all of which are conservative. In these c