In [1]:
# pip install -q -U torch==2.0.0

In [2]:
# pip install -q -U accelerate==0.23.0 peft==0.5.0 bitsandbytes==0.41.1 transformers==4.31 trl==0.7.2 torch==2.0.0

In [3]:
import os
# Restrict CUDA to device 0. This is set before torch is imported further down,
# so the restriction is in place when CUDA is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Turn off the tokenizers library's own parallelism
# (presumably to avoid its fork-related warnings — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and numerical warnings from the training stack.
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

2024-01-27 18:21:12.962164: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Record the PyTorch build in the notebook output so the run can be reproduced.
print(f"pytorch version {torch.__version__}")


pytorch version 2.0.0+cu117


# 1. Load Dataset

In [7]:
# Relative path to the labelled headlines file.
filename = "./data/all-data.csv"


# The file is read with explicit column names: first column is the sentiment
# label, second is the headline text.
# NOTE(review): encoding_errors="replace" silently substitutes any byte sequence
# that is not valid UTF-8; confirm the file's real encoding so non-ASCII
# characters in the headlines are not lost.
df = pd.read_csv(filename, 
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")
df.shape

(4846, 2)

In [8]:
# Rows per sentiment label; the output below shows the classes are imbalanced.
df.sentiment.value_counts()


sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64

In [9]:
# Raise the per-column display width to 200 characters so more of each headline
# is visible in the preview (this is a session-wide pandas option).
pd.set_option('display.max_colwidth', 200)
df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said ."
2,negative,The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers ...
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profi...
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net ..."


# 2. Split into training, test and eval data set

In [10]:
def create_train_test_eval_df(train_size: int=300, eval_size: int=50) -> tuple:
    """Build class-balanced train, test and eval frames from the module-level ``df``.

    For each of the three sentiment labels, ``train_size`` rows go to the
    training set and another ``train_size`` rows go to the test set. The eval
    set is drawn from the rows used by neither.

    Args:
        train_size: Rows per sentiment in the train set and, separately, in the
            test set. Presumably every class needs at least ``2 * train_size``
            rows for ``train_test_split`` to succeed — confirm.
        eval_size: Rows per sentiment in the eval set. They are sampled with
            replacement, so a class with fewer leftover rows than ``eval_size``
            still yields ``eval_size`` rows (with repeats).

    Returns:
        ``(X_train, X_test, X_eval)`` as a tuple of DataFrames. ``X_train`` is
        shuffled and re-indexed from 0; ``X_test`` and ``X_eval`` keep the row
        labels of ``df``.
    """
    train_parts = []
    test_parts = []

    # Stratified split: the same number of rows is taken from every class.
    for sentiment in ["positive", "neutral", "negative"]:
        train, test = train_test_split(df[df.sentiment == sentiment],
                                       train_size=train_size,
                                       test_size=train_size,
                                       random_state=42)
        train_parts.append(train)
        test_parts.append(test)

    # Shuffle the training rows in a replicable order (random_state=10).
    X_train = pd.concat(train_parts).sample(frac=1, random_state=10)
    X_test = pd.concat(test_parts)

    # Rows that are in neither train nor test form the eval pool. One vectorised
    # membership test replaces rebuilding the combined index list per row.
    used_idx = X_train.index.union(X_test.index)
    X_eval = df[~df.index.isin(used_idx)]

    # Balanced eval set: eval_size rows per sentiment, sampled with replacement
    # because the smallest class may have few rows left.
    X_eval = (X_eval
              .groupby('sentiment', group_keys=False)
              .apply(lambda x: x.sample(n=eval_size, random_state=10, replace=True)))

    # Only reset after the eval pool is computed: the original labels are
    # needed above to identify the rows already used.
    X_train = X_train.reset_index(drop=True)

    return (X_train, X_test, X_eval)
    
# Unpack the three splits directly. No placeholder assignments are needed first:
# binding the names to the pd.DataFrame class itself only created dead values.
X_train_df, X_test_df, X_eval_df = create_train_test_eval_df(train_size=300)


# Sanity check: split sizes and the class balance of the training set.
print (f'Shape of train, test and eval data are {X_train_df.shape}, {X_test_df.shape}, {X_eval_df.shape}')
print (f'\nBreak up by target column in train data is\n {X_train_df.sentiment.value_counts()}')

Shape of train, test and eval data are (900, 2), (900, 2), (150, 2)

Break up by target column in train data is
 sentiment
neutral     300
positive    300
negative    300
Name: count, dtype: int64


# 3. Prepare dataset

In [11]:
# Instruction text shared by the training and the inference prompt. The line
# breaks, the 12-space continuation indent and the trailing spaces are all part
# of the text the model sees, so they are spelled out explicitly instead of
# depending on how a triple-quoted literal happens to be indented in the source.
_PROMPT_INSTRUCTIONS = (
    "Analyze the sentiment of the news headline enclosed in square brackets, \n"
    "            determine if it is positive, neutral, or negative, and return the answer as \n"
    "            the corresponding sentiment label \"positive\" or \"neutral\" or \"negative\".\n"
    "\n"
    "            "
)


def generate_prompt(data_point: pd.Series) -> str:
    """Render a training prompt that includes the target label.

    Args:
        data_point: A row (or any mapping) exposing ``"text"`` and ``"sentiment"``.

    Returns:
        The instruction text followed by ``[<text>] = <sentiment>``, with
        surrounding whitespace stripped.
    """
    return f'{_PROMPT_INSTRUCTIONS}[{data_point["text"]}] = {data_point["sentiment"]}'.strip()


def generate_test_prompt(data_point: pd.Series) -> str:
    """Render an inference prompt that stops where the label would go.

    Args:
        data_point: A row (or any mapping) exposing ``"text"``.

    Returns:
        The instruction text followed by ``[<text>] =``. The final ``strip()``
        removes the space after ``=``, so the prompt ends on the equals sign.
    """
    return f'{_PROMPT_INSTRUCTIONS}[{data_point["text"]}] = '.strip()

# Capture the ground-truth labels first: X_test_df is replaced just below by a
# frame that only holds the prompt text.
y_true = X_test_df.sentiment

# Render every row to its prompt string and keep the result as a one-column
# frame named "text". Train and eval prompts include the label; test prompts do not.
X_train_df = X_train_df.apply(generate_prompt, axis=1).to_frame(name="text")
X_eval_df = X_eval_df.apply(generate_prompt, axis=1).to_frame(name="text")
X_test_df = X_test_df.apply(generate_test_prompt, axis=1).to_frame(name="text")

# Wrap the train and eval frames as Hugging Face datasets for the trainer.
train_data = Dataset.from_pandas(X_train_df)
eval_data = Dataset.from_pandas(X_eval_df)

# 4. Load base model and predict with base model

In [12]:
# Local directory holding the base model checkpoint and its tokenizer files.
model_name = "./Llama2-7b-hf/"

# float16 is the dtype used for computation on top of the 4-bit weights.
compute_dtype = torch.float16

# Quantisation settings handed to the model loader.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the model weights in 4-bit format
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat (NF4) quantisation type
    bnb_4bit_compute_dtype=compute_dtype,  # run computations in float16
    bnb_4bit_use_double_quant=False,       # do not additionally quantise the quantisation constants
)

# Load the base model with the quantisation config above; device placement is
# left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
# pretraining_tp = 1: per the original note, any other value switches the linear
# layers to a slower computation intended to match the original logits more closely.
model.config.pretraining_tp = 1

# Load the tokenizer that belongs to the base model.
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint; confirm it is actually needed for this local directory.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# No separate pad token is configured, so EOS is reused for padding.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence (the original note calls this
# crucial for Llama 2).
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def evaluate(y_true: pd.core.series.Series, y_pred: list):
    """Print accuracy, per-class accuracy, a classification report and a confusion matrix.

    String labels are mapped to integers first: negative -> 0, neutral -> 1,
    positive -> 2. The placeholder 'none' (no usable prediction) and any
    unrecognised value are counted as neutral (1).

    Args:
        y_true: Ground-truth sentiment labels as strings.
        y_pred: Predicted labels as strings, in the same order as ``y_true``.

    Returns:
        None. All results are printed.
    """
    # Note: sometimes there is no usable prediction; 'none' is scored as neutral.
    mapping = {'positive': 2, 'neutral': 1, 'none': 1, 'negative': 0}

    def to_ids(values):
        # Unknown labels fall back to 1 (neutral), same as 'none'.
        return np.array([mapping.get(value, 1) for value in values], dtype=int)

    y_true = to_ids(y_true)
    y_pred = to_ids(y_pred)

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-class accuracy: restrict both arrays to the rows whose true label is
    # `label`. np.unique yields the labels in ascending order.
    for label in np.unique(y_true):
        mask = y_true == label
        accuracy = accuracy_score(y_true[mask], y_pred[mask])
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix with a fixed label order so the layout is stable even
    # when a class is missing from the predictions.
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [14]:
def _label_from_answer(answer: str) -> str:
    """Return the first sentiment label found in ``answer``, or 'none'."""
    # Same precedence as the original if/elif chain: positive, negative, neutral.
    for label in ("positive", "negative", "neutral"):
        if label in answer:
            return label
    # Sometimes no usable token is generated; that case is reported as 'none'.
    return "none"


def predict(test:pd.DataFrame, model:AutoModelForCausalLM, tokenizer:AutoTokenizer)-> list:
    """Generate one token per prompt and turn it into a sentiment label.

    Args:
        test: Frame with a ``"text"`` column holding the inference prompts.
        model: Model passed to the text-generation pipeline.
        tokenizer: Tokenizer passed to the text-generation pipeline.

    Returns:
        One entry per row of ``test``: 'positive', 'negative', 'neutral', or
        'none' when the generated text contains none of the three.
    """
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1,  # a single generated token is read as the label
                    # Greedy decoding is requested explicitly. The previous
                    # temperature=0.0 only applies to sampling, where a
                    # non-positive temperature is not a valid setting.
                    do_sample=False,
                   )

    y_pred = []
    for prompt in tqdm(test["text"]):
        result = pipe(prompt)

        # The pipeline returns the prompt followed by the generated text, e.g.
        # '... [<headline>] = positive'. Everything after the last '=' is taken
        # as the model's answer.
        answer = result[0]['generated_text'].split("=")[-1]
        y_pred.append(_label_from_answer(answer))
    return y_pred

In [15]:
# Baseline: score the base model (loaded above, before any fine-tuning) on the test prompts.
y_pred = predict(X_test_df, model, tokenizer)
evaluate(y_true, y_pred)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
100%|██████████| 900/900 [00:50<00:00, 17.94it/s]

Accuracy: 0.376
Accuracy for label 0: 0.027
Accuracy for label 1: 0.943
Accuracy for label 2: 0.157

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.03      0.05       300
           1       0.34      0.94      0.50       300
           2       0.69      0.16      0.26       300

    accuracy                           0.38       900
   macro avg       0.64      0.38      0.27       900
weighted avg       0.64      0.38      0.27       900


Confusion Matrix:
[[  8 287   5]
 [  1 283  16]
 [  0 253  47]]





# 5. Fine-tune base model on custom dataset

In [16]:
# Parameter-Efficient Fine-Tuning (PEFT) method, which should save time as it operates on a reduced number of parameters compared to the 
# model's overall size. The PEFT method focuses on refining a limited set of (additional) model parameters, while keeping the majority of 
# the pre-trained LLM parameters fixed. This significantly reduces both computational and storage expenses. Additionally, this strategy 
# addresses the challenge of catastrophic forgetting, which often occurs during the complete fine-tuning of LLMs.
peft_config = LoraConfig(
    lora_alpha=16, # alpha parameter used to scale the LoRA update (this is not a learning rate)
    lora_dropout=0.1, # dropout probability for the LoRA update matrices 
    r=64,#rank of the LoRA update matrices, lower rank results in smaller update matrices with fewer trainable parameters
    bias="none", # which bias terms are trained; PEFT accepts "none", "all" or "lora_only"
    task_type="CAUSAL_LM", # task the adapter is configured for: causal language modelling
)

# Specify parameters for training the model
training_arguments = TrainingArguments(
    output_dir="logs", #The directory where the training logs and checkpoints will be saved.
    num_train_epochs=3, #The number of epochs to train the model for
    per_device_train_batch_size=1, #The number of samples in each batch on each device.
    gradient_accumulation_steps=8, # batches accumulated before each parameter update (1 x 8 = 8 samples per update per device)
    optim="paged_adamw_32bit", #optimizer to use for training the model
    save_steps=0, #number of steps after which to save a checkpoint (NOTE(review): 0 presumably disables step checkpoints — confirm)
    logging_steps=25, #number of steps after which to log the training metrics
    learning_rate=2e-4, #learning rate for the optimizer
    weight_decay=0.001, #weight decay parameter for the optimizer
    fp16=True, #Whether to use 16-bit floating-point precision.
    bf16=False, #Whether to use BFloat16 precision.
    max_grad_norm=0.3, #The maximum gradient norm
    max_steps=-1, #The maximum number of steps to train the model for (-1: presumably no cap, so num_train_epochs governs — confirm)
    warmup_ratio=0.03,#The proportion of the training steps to use for warming up the learning rate.
    group_by_length=True, #Whether to group the training samples by length.
    lr_scheduler_type="cosine", # type of learning rate scheduler to use
    report_to="tensorboard", # The tools to report the training metrics to
    evaluation_strategy="epoch" #strategy for evaluating the model during training
)

# SFTTrainer comes from the TRL library (see the `from trl import SFTTrainer` import).
# It receives the LoRA settings through peft_config and reads training examples
# from the "text" field of the datasets.
trainer = SFTTrainer(
    model=model, #model to be trained
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text", #name of the text field in the dataset.
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False, # Whether to pack the training samples
    max_seq_length=1024,
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [17]:
from datetime import datetime
import pytz

# Timestamps are taken in the America/New_York zone; they are only used for the
# log lines and the duration printed below.
tz = pytz.timezone("America/New_York")
start_time = datetime.now(tz)
print (f'\nTraining started at {start_time}')

# Train model
trainer.train()

end_time = datetime.now(tz)
duration = end_time - start_time
print (f'Training completed at {end_time}')
print(f'Training duration was {duration}')

# Save the trained model to the 'trained-model' directory; the reload cell
# further down reads it back from "./trained-model".
trainer.model.save_pretrained("trained-model")

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



Training started at 2024-01-27 18:22:30.256360-05:00


Epoch,Training Loss,Validation Loss
0,0.8145,0.713441
2,0.717,0.691586
2,0.6878,0.690086


Training completed at 2024-01-27 18:31:06.325358-05:00
Training duration was 0:08:36.068998


In [18]:
# %load_ext tensorboard
# %tensorboard --logdir logs/runs

In [19]:
# Re-score the same test prompts after training, using the in-memory `model` object.
# NOTE(review): this relies on the trainer having modified `model` in place — confirm.
y_pred = predict(X_test_df, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 900/900 [00:59<00:00, 15.04it/s]

Accuracy: 0.824
Accuracy for label 0: 0.913
Accuracy for label 1: 0.760
Accuracy for label 2: 0.800

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       300
           1       0.74      0.76      0.75       300
           2       0.80      0.80      0.80       300

    accuracy                           0.82       900
   macro avg       0.83      0.82      0.83       900
weighted avg       0.83      0.82      0.83       900


Confusion Matrix:
[[274  24   2]
 [ 13 228  59]
 [  5  55 240]]





In [20]:
# Put each test prompt next to its true and predicted label, then write the
# table to CSV without the index column.
evaluation = X_test_df[["text"]].assign(y_true=y_true, y_pred=y_pred)
evaluation.to_csv("test_predictions.csv", index=False)

# 6. Load fine-tuned model and predict

In [21]:
from peft import PeftModel

import gc
# NOTE(review): gc.collect() alone does not drop the `model` / `trainer` objects
# created earlier, since they are still referenced; their memory stays allocated
# while the second copy of the base model is loaded below.
gc.collect()

# Base checkpoint directory and the directory written by save_pretrained("trained-model").
model_name = "./Llama2-7b-hf/"
model_id = "./trained-model"
# Reload the base model, this time without the 4-bit quantisation config
# (it is left commented out below).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype='auto', #torch.float16,
    device_map="auto",
    # quantization_config=bnb_config,
)

# Reload the tokenizer with the same padding setup used for training.
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Wrap the base model with the saved adapter from model_id. No merge step is
# performed here, so ft_model is a PeftModel wrapper, not a plain merged model.
ft_model = PeftModel.from_pretrained(base_model, model_id, local_files_only=True)

# Score the reloaded model on the same test prompts as before.
y_pred = predict(X_test_df, ft_model, tokenizer)
evaluate(y_true, y_pred)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

Accuracy: 0.821
Accuracy for label 0: 0.897
Accuracy for label 1: 0.813
Accuracy for label 2: 0.753

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.92       300
           1       0.71      0.81      0.76       300
           2       0.82      0.75      0.79       300

    accuracy                           0.82       900
   macro avg       0.83      0.82      0.82       900
weighted avg       0.83      0.82      0.82       900


Confusion Matrix:
[[269  29   2]
 [ 10 244  46]
 [  5  69 226]]





In [22]:
# NOTE(review): this displays the entire prediction list; a slice such as
# y_pred[:10] would keep the saved notebook output short.
y_pred

['positive',
 'positive',
 'negative',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'negative',
 'positive',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'neutral',
 'neutral',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',

In [23]:
# NOTE(review): this displays the whole test frame; X_test_df.head() would be
# enough to inspect the prompt format.
X_test_df

Unnamed: 0,text
567,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
1752,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
995,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
601,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
568,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
...,...
4219,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
4814,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
4059,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
4720,"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding se..."
