In [16]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from peft import PeftModel
from functions import *
from tokens import *

In [17]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
load_in_4bit = True

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

In [18]:
bnb_config = create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype)

In [19]:

"""LOAD DATASET"""
dataset = load_from_disk('/home/t/tzelilai/Desktop/Thesis/Llama-3.2-1B/articles_dataset')

train_dataset = dataset["train"]
eval_dataset = dataset["eval"]
test_dataset = dataset["test"]
print(f"Train Dataset: {len(train_dataset)} examples")
print(f"Eval Dataset: {len(eval_dataset)} examples")
print(f"Test Dataset: {len(test_dataset)} examples")

Train Dataset: 24034 examples
Eval Dataset: 6009 examples
Test Dataset: 7511 examples


In [20]:
path = "/home/t/tzelilai/Desktop/Thesis/results/checkpoint-4000"

In [21]:
base_model_name = "meta-llama/Llama-3.2-1B"  # same as in your adapter_config.json
adapter_path = "/home/t/tzelilai/Desktop/Thesis/results/checkpoint-4000"

In [22]:
# 1. Load the *base* LLaMA model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name,
        use_auth_token=access_token,
        quantization_config = bnb_config,
        num_labels=3,
        device_map = "auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# 2. Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, adapter_path)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Create the pipeline with the specified model and tokenizer
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer
)

The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification'

In [23]:
print(eval_dataset[3])



In [25]:
print(len(eval_dataset))

6009


In [24]:
pipeline(eval_dataset[3]['content'])

[{'label': 'LABEL_2', 'score': 0.9999997615814209}]

In [31]:
correct_predictions = 0

for article in test_dataset: 
    llm_predict = pipeline(article['content'])
    llm_label = llm_predict[0]['label']

    if article['labels'] == 0:
        article_label = 'LABEL_0'
    elif article['labels'] == 1: 
        article_label = 'LABEL_1'
    else:
        article_label = 'LABEL_2'

    if article_label == llm_label: 
        correct_predictions += 1 

accuracy = correct_predictions / len(test_dataset)
print("Accuracy of LLM is: ", accuracy)


Accuracy of LLM is:  0.9526028491545733


In [27]:
accuracy = correct_predictions / len(eval_dataset)
print("Accuracy of LLM is: ", accuracy)

Accuracy of LLM is:  0.9582293226826427


In [None]:
accuracy = correct_predictions / len(eval_dataset)
print("Accuracy of LLM is: ", accuracy)

In [11]:
article = """Title: Trump Taps Elon Musk to Lead Department of Government Efficiency, Amid Concerns Over Conflicts of Interest

President-elect Donald Trump has appointed Elon Musk, the CEO of Tesla and SpaceX, to lead a newly created Department of Government Efficiency (DOGE), a non-governmental body tasked with slashing government spending and streamlining federal agencies. While Cathie Wood, CEO of ARK Invest, backs Musk's appointment, citing his ability to bring proprietary data to the table, others are raising concerns over potential conflicts of interest.

Musk's companies, which include SpaceX, Tesla, and Neuralink, have received billions of dollars in federal contracts over the past decade, and he has been vocal about his desire to reduce regulations and cut government spending. However, critics argue that his appointment could create conflicts of interest, as he may be in a position to benefit from cost-cutting measures that affect his own companies.

The concerns are not new, as Trump previously tapped billionaire Carl Icahn to streamline government regulation in 2017. Icahn resigned after seven months amid ethics questions over changes to an energy policy that could have benefited his own company. Musk's appointment has raised similar concerns, with experts citing his vast portfolio across multiple sectors and his potential to benefit from cost-cutting measures.

Wood, however, believes Musk's understanding of the current technological landscape gives him a unique perspective on the government's role in the private sector. "The way we're looking at this is, Elon understands we are at the threshold of a convergence among many technologies," she said. "AI being at the center of it, and proprietary data is winning. So he has more proprietary data from all of these companies than I think any other CEO."

The specifics of how the DOGE will operate are still unclear, but Trump has stated that it will "slash excess regulations, cut wasteful expenditures, and restructure federal agencies" and "provide advice and guidance from outside of government." Musk has said he aims to cut $2 trillion from the federal budget, and will work with the White House and Office of Management and Budget to achieve this goal.

While the appointment has raised concerns, Musk's supporters see him as a visionary leader who can bring about positive change. "We've faced this question about Elon for many years, as he started one company after another," Wood said. "We've seen him overcome incredible odds in his business career."

As the DOGE gets underway, concerns over conflicts of interest will continue to be a topic of discussion. However, for now, Musk and Ramaswamy are eager to get to work, seeking "super high-IQ small-government revolutionaries" to join their team and help drive large-scale structural reform.."""

In [12]:
llm_article = pipeline(article)

In [13]:
print(llm_article)

[{'label': 'LABEL_1', 'score': 0.8408420085906982}]


In [24]:
llm_dataset = load_from_disk('/home/t/tzelilai/Desktop/Thesis/llama3.1_notebook/example_dataset')

In [31]:
for i in range(5):
    prediction = pipeline(llm_dataset[i]['llm_article'])
    print(f"Label: {llm_dataset[i]['labels']} | Prediction: {prediction}")

ValueError: You need to specify either `text` or `text_target`.