In [1]:
# !pip install  scikit-learn

In [1]:
!nvidia-smi


Wed Jul  3 15:24:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               On  | 00000000:00:08.0 Off |                  Off |
| 50%   65C    P8              42W / 300W |      1MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:

import pandas as pd
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login

from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [3]:
# !pip install -Uqqq pip --progress-bar off
# !pip install -Uqqq peft --progress-bar off
# !pip install -Uqqq bitsandbytes --progress-bar off
# !pip install -Uqqq trl --progress-bar off
# !pip install git+https://github.com/huggingface/peft

In [4]:
from datetime import datetime
import os
import sys
# from datasets import load_dataset

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [5]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_excel("data-excel.xlsx")


In [7]:
required_columns = ['headline', 'headline sentiment analysis', 'text', 'byline location','editorial notes','news value [nv] assessment']


In [8]:
df = df[required_columns]
df = df.fillna("")
df = df.astype(str)


In [9]:
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


In [10]:
# dataset_dict['train']['text'][0]

In [11]:
DEFAULT_SYSTEM_PROMPT = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.\n
""".strip()


def generate_training_prompt(
    text: str, sentiment: str, notes: str, rank:str, location:str
) -> str:
    return f"""{DEFAULT_SYSTEM_PROMPT}

### Article:
{text}

### Sentiment:
{sentiment}

### editorial-notes:
{notes}

### Ranking:
{rank}
### Location:
{location}
""".strip()

In [12]:
def mapper_function(example):
    example['training_prompt'] = generate_training_prompt(
        text=example['text'],
        sentiment=example['headline sentiment analysis'],
        notes=example['editorial notes'],
        rank=example['news value [nv] assessment'],
        location=example['byline location']
    )
    return example

# Apply the mapper function to the dataset
dataset = dataset_dict.map(mapper_function, remove_columns=required_columns)

Map:   0%|          | 0/956 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [13]:
MODEL_NAME = "Deci/DeciLM-7B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
model_k= prepare_model_for_kbit_training(model)


In [15]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=1100,
        padding="max_length",
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result

# Apply the tokenize function to the training prompts
train_dataset = dataset['train'].map(lambda x: tokenize(x['training_prompt']))
test_dataset = dataset['test'].map(lambda x: tokenize(x['training_prompt']))

Map:   0%|          | 0/956 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [16]:
import re
def get_num_layers(model):
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []
    
    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and not "encoder" in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names

In [18]:

config = LoraConfig(
    r=8,  # Increased from 2 to 4
    lora_alpha=64,  # Increased from 32 to 64
    target_modules=get_last_layer_linears(model),  # Ensure this targets the right layers
    lora_dropout=0.03,  # Reduced from 0.05 to 0.03 to allow for more utilization
    bias="none",
    task_type="CAUSAL_LM"
)
model_p = get_peft_model(model_k, config)

In [19]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    learning_rate=1e-4,
    fp16=True,
    output_dir="finetune_DECILM",
    optim="paged_adamw_8bit",
    evaluation_strategy="epoch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    report_to="none"
)

trainer = transformers.Trainer(
    model=model_p,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,    
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model_p.config.use_cache = False
trainer.train()




Epoch,Training Loss,Validation Loss
0,No log,
1,No log,
2,No log,
3,No log,
4,No log,
5,No log,
6,No log,
8,No log,
9,No log,


TrainOutput(global_step=290, training_loss=1713992047.2275863, metrics={'train_runtime': 8349.8772, 'train_samples_per_second': 1.145, 'train_steps_per_second': 0.035, 'total_flos': 4.23534188740608e+17, 'train_loss': 1713992047.2275863, 'epoch': 9.707112970711297})

In [84]:
# model.config.use_cache = False

# old_state_dict = model.state_dict
# model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
#     model, type(model)
# )
# if torch.__version__ >= "2" and sys.platform != "win32":
#     print("compiling the model")
#     model = torch.compile(model)

In [85]:
# trainer.train()

In [20]:
model_p.save_pretrained("trained-model")

PEFT_MODEL = "trained-model"

config = PeftConfig.from_pretrained(PEFT_MODEL)
# model = AutoModelForCausalLM.from_pretrained(
#     config.base_model_name_or_path,
#     return_dict=True,
#     quantization_config=bnb_config,
#     device_map="auto",
#     trust_remote_code=True
# )

tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, PEFT_MODEL)

In [23]:
generation_config = model.generation_config
generation_config.max_new_tokens = 100
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id


In [26]:
def generate_query_prompt(
    text: str) -> str:
    return f"""{DEFAULT_SYSTEM_PROMPT}

### Article:
{text}


""".strip()

In [30]:
qry = generate_query_prompt("""Facebook Inc. knows, in acute detail, that its platforms “We’re going to defend our record.”""")

In [31]:
%%time

prompt = qry
device = "cuda"
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True, skip_prompt=True))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Article:
Facebook Inc. knows, in acute detail, that its platforms “We’re going to defend our record.”

### Response:
### Article:
Facebook Inc. knows, in acute detail, that its platforms are being used to spread misinformation and hate speech. The company has been criticized for not doing enough to combat the problem.

### Article:
Facebook Inc. knows, in acute detail, that its platforms are being used to spread misinformation and hate speech. The company has been criticized for not doing enough to combat the problem.

### Response
CPU times: user 6.29 s, sys: 104 ms, total: 6.39 s
Wall time: 6.39 s


In [1]:
1

1