## Import Data

In [None]:
# Install pandas into the kernel's environment (%pip targets the running kernel).
%pip install pandas

In [2]:
import pandas as pd

# CSV file names for each split inside the Hugging Face dataset repo.
splits = {
    'train': 'final_headline_train_12000.csv',
    'validation': 'final_headline_valid_1200.csv',
}

# Only the training CSV is read here, straight from the hf:// path.
df = pd.read_csv(f"hf://datasets/valurank/News_headlines/{splits['train']}")

In [3]:
# Preview the loaded frame (pandas elides the middle rows of a long frame).
df

Unnamed: 0,article,headline
0,The logo of cryptocurrency exchange Binance di...,Binance pauses bitcoin withdrawals due to a 's...
1,"Police officers, some in riot gear, guard a gr...",White nationalist group members face riot-plan...
2,A woman walks past a row of cash machines outs...,"Lloyds to give staff 1,000 pounds to ease cost..."
3,The Amazon logo is seen outside its JFK8 distr...,"Amazon offers to share data, boost rivals to d..."
4,An unexploded shell from a multiple rocket lau...,Both sides using heavier weapons in war in Ukr...
...,...,...
12240,FILE PHOTO - Prime Minister of Japan Fumio Kis...,Japan PM Kishida to reshuffle cabinet as COVID...
12241,A TV screen shows that China's People's Libera...,Scrapping U.S.-China military talks deepens ri...
12242,U.S. Deputy Secretary of State Wendy Sherman s...,"Tonga, not China, must decide its future, says..."
12243,Register now for FREE unlimited access to Reut...,Taiwan tension underscores importance of Phili...


In [4]:
# prompt format
# Plain string (no f-prefix): the instruction text contains no placeholders.
instruction_string = """You are HeadlineGPT, an expert in crafting compelling news headlines. Based on the article provided, generate a concise, attention-grabbing headline that accurately reflects the key points and tone of the story, while also maintaining a clear and engaging writing style.

Please respond to the following article.
"""


def fine_tune_template(article, headline):
    """Build one training example from an article and its reference headline.

    Args:
        article: Article body text, placed inside the [INST] ... [/INST] span.
        headline: Target headline, appended after the instruction span.

    Returns:
        The full prompt + completion string wrapped in <s> ... </s>.

    Raises:
        TypeError: If ``headline`` is not a string (e.g. a missing value),
            because it is concatenated rather than formatted.
    """
    # NOTE(review): the <s>[INST] ... [/INST] markers are written by hand;
    # confirm they match the chat format expected by the model being tuned.
    return f'''<s>[INST] {instruction_string} \n{article} \n[/INST]\n''' + headline + "</s>"


# Build one {"text": ...} record per row. Iterating the columns directly
# avoids label-based df[col][i] lookups, which only work when the index is
# exactly 0..n-1.
fine_tune_example_list = [
    {"text": fine_tune_template(article, headline)}
    for article, headline in zip(df["article"], df["headline"])
]

In [8]:
%pip install auto-gptq optimum bitsandbytes
%pip uninstall torch -y
%pip install torch==2.1

Collecting auto-gptq
  Downloading auto_gptq-0.7.1.tar.gz (126 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[1 lines of output][0m
  [31m   [0m Building cuda extension requires PyTorch (>=1.13.0) being installed, please install PyTorch first: No module named 'torch'
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[?25h[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
Note: you m

In [None]:
import transformers
from datasets import Dataset
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the causal-LM weights:
#   revision          -> which version of the model repo to use
#   trust_remote_code -> False prevents running custom model files locally
#   device_map        -> "auto" decides how to spread the model over CPU + GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    revision="main",
    trust_remote_code=False,
    device_map="auto",
)

# Load Tokenizer

In [9]:
# Load the tokenizer that belongs to the same checkpoint as the model,
# requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

NameError: name 'AutoTokenizer' is not defined

# Prepare Model for training

In [None]:
# Switch the model to training mode before attaching adapters.
model.train() # model in training mode (dropout modules are activated)

# Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()

# Run PEFT's preparation step for k-bit (quantized) training.
# NOTE(review): the from_pretrained call in this notebook passes no
# quantization config -- confirm the model really is loaded quantized,
# otherwise this step may not do what the comment above suggests.
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA config
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# LoRA trainable version of model
model = get_peft_model(model, config)

# trainable parameter count
model.print_trainable_parameters()

In [None]:
## Tokenize the fine-tuning examples
def tokenize_function(examples):
    """Tokenize a batch of fine-tuning examples.

    Args:
        examples: A batch as passed by ``Dataset.map(batched=True)``; it must
            contain a "text" column holding the prompt + headline strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
    """
    # The example records are built as {"text": ...}, so read that column.
    text = examples["text"]

    # Truncate from the left so that, for over-long examples, the end of the
    # string (where the headline sits) is what survives.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs


# fine_tune_example_list is a plain Python list, which has no .map(); wrap it
# in a Dataset first. It is then split so that the "train" and "test" keys
# used when building the Trainer exist. The seed keeps the split reproducible.
example_dataset = Dataset.from_list(fine_tune_example_list)
split_data = example_dataset.train_test_split(test_size=0.1, seed=42)

# Tokenize both splits; the raw "text" column is dropped so that only the
# tokenizer outputs reach the data collator.
tokenized_data = split_data.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

In [None]:
# setting pad token
tokenizer.pad_token = tokenizer.eos_token
# data collator
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Fine Tune Model

In [None]:
# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 10

# define training arguments
# Logging, evaluation and checkpointing all happen once per epoch, which is
# what load_best_model_at_end needs (eval and save strategies must match).
training_args = transformers.TrainingArguments(
    output_dir="shawgpt-ft",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    # `evaluation_strategy` is the deprecated spelling of this argument.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",
)

In [None]:
# Assemble the trainer from the model, the tokenized splits, the training
# arguments and the collator.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

# Turn the cache off while training (this silences the warnings); it is
# switched back on right after training for inference.
model.config.use_cache = False
trainer.train()

# Re-enable the cache now that training has finished.
model.config.use_cache = True

# Push model to HF

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; this is needed
# before the push_to_hub calls in the next cell.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
hf_name = 'shawhin' # your hf username or org name
model_id = hf_name + "/" + "shawgpt-ft"

# Upload the fine-tuned model to the Hub under model_id.
model.push_to_hub(model_id)

# Trainer.push_to_hub takes a commit message as its first argument, not a
# repo id; the destination repo comes from the training arguments, so point
# those at model_id before pushing.
trainer.args.hub_model_id = model_id
trainer.push_to_hub(commit_message="Upload fine-tuned headline model")