In [36]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from peft import PeftModel
from functions import *
from tokens import *

In [37]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
load_in_4bit = True

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

In [39]:
bnb_config = create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype)

In [40]:
"""LOAD DATASET"""
dataset = load_from_disk('/home/t/tzelilai/Desktop/Thesis/Llama-3.2-1B/articles_dataset_les-than-7000-tokens-splitted/test')

In [41]:
base_model_name = "meta-llama/Llama-3.2-1B"  # same as in your adapter_config.json
adapter_path = "/home/t/tzelilai/Desktop/Thesis/results-modified_articles/checkpoint-4506"

In [42]:
# 1. Load the *base* LLaMA model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name,
        use_auth_token=access_token,
        quantization_config = bnb_config,
        num_labels=3,
        device_map = "auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# 2. Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, adapter_path)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Create the pipeline with the specified model and tokenizer
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer
)

The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification'

### Llama Outputs 

In [44]:
# Llama Simple Prompt Outputs
import json 
outputs_path = [None for _ in range(6)]
outputs = [None for _ in range(6)]
for i in range(6):
    outputs_path[i] = "/home/t/tzelilai/Desktop/Thesis/llama3.1_notebook/test_outputs_" +str(i)+"_new_prompt"+".json"
    with open(outputs_path[i], 'r') as file: 
        outputs[i] = json.load(file)

In [None]:
left_counter = 0
center_counter = 0
right_counter = 0 
llama_articles = [[] for i in range(6)]

for i,batch in enumerate(outputs):
    for article in batch: 
        llm_predict = pipeline(article, return_all_scores=True)
        # llm_label = llm_predict[0]['label']
        llama_articles[i].append(llm_predict[0])
        # if llm_label == "LABEL_0":
        #     left_counter += 1 
        # elif llm_label == "LABEL_1":
        #     center_counter += 1 
        # else: 
        #     right_counter += 1 



In [17]:
import json 
with open("llama3.1_outputs_2.json", "w", encoding="utf-8") as file: 
    json.dump(llama_articles, file, indent=4)

In [25]:
from collections import Counter
label_counts = [None for _ in range(6)]
total_label_count = Counter()
for i in range(6):
    label_counts[i] = Counter(dataset['0']['labels'])
    total_label_count += label_counts[i]

In [27]:
print("Total Left Content:",total_label_count[0])
print("Total Center Content:",total_label_count[1])
print("Total Right Counter:",total_label_count[2])
print("----------------------------")
print("Left Classified:",left_counter)
print("Center Classified ",center_counter)
print("Right Classified", right_counter)



Total Left Content: 1620
Total Center Content: 1746
Total Right Counter: 2250
----------------------------
Left Classified: 1349
Center Classified  3455
Right Classified 816


In [12]:
# Prompt with more instructions
from collections import Counter
label_counts = [None for _ in range(6)]
total_label_count = Counter()
for i in range(6):
    label_counts[i] = Counter(dataset['0']['labels'])
    total_label_count += label_counts[i]

print("Total Left Content:",total_label_count[0])
print("Total Center Content:",total_label_count[1])
print("Total Right Counter:",total_label_count[2])
print("----------------------------")
print("Left Classified:",left_counter)
print("Center Classified ",center_counter)
print("Right Classified", right_counter)

Total Left Content: 1620
Total Center Content: 1746
Total Right Counter: 2250
----------------------------
Left Classified: 1169
Center Classified  2380
Right Classified 1140


### Mistral Outputs 

In [18]:
# Mistral Outputs 
import json 
mistral_outputs_path = [None for _ in range(8)]
mistral_outputs = [None for _ in range(8)]
for i in range(0,8,1):
    mistral_outputs_path[i] = "/home/t/tzelilai/Desktop/Thesis/mistral_notebook/test_outputs_" +str(i) +".json"
    with open(mistral_outputs_path[i], 'r') as file: 
        mistral_outputs[i] = json.load(file)

In [19]:
left_counter = 0
center_counter = 0
right_counter = 0 
mistral_articles = [[] for i in range(8)]
for i,batch in enumerate(mistral_outputs):
    for article in batch: 
        llm_predict = pipeline(article, return_all_scores=True)
        # llm_label = llm_predict[0]['label']
        mistral_articles[i].append(llm_predict[0])

        # if llm_label == "LABEL_0":
        #     left_counter += 1 
        # elif llm_label == "LABEL_1":
        #     center_counter += 1 
        # else: 
        #     right_counter += 1 



In [54]:
import json 
with open("mistral_outputs.json", "w", encoding="utf-8") as file: 
    json.dump(mistral_articles, file, indent=4)

In [28]:
mistral_dataset = load_from_disk('/home/t/tzelilai/Desktop/Thesis/Llama-3.2-1B/articles_dataset_les-than-7000-tokens-splitted-mistral/test')

In [40]:
mistral_articles = [int(label.split('_')[-1]) for label in mistral_articles]

In [42]:
print(mistral_dataset['0']['labels'][232])

0


In [49]:
print(pipeline(mistral_outputs[0][0], return_all_scores=True))
print(pipeline(mistral_outputs[0][0]))
print(pipeline(mistral_outputs[0][0]))



[[{'label': 'LABEL_0', 'score': 0.0022257075179368258}, {'label': 'LABEL_1', 'score': 0.4595065116882324}, {'label': 'LABEL_2', 'score': 0.5382677316665649}]]
[{'label': 'LABEL_2', 'score': 0.5382677316665649}]
[{'label': 'LABEL_2', 'score': 0.5382677316665649}]


In [None]:
true_positive = {'label_0':0, 'label_1':0, 'label_2':0}
true_negative = {'label_0':0, 'label_1':0, 'label_2':0}
false_positive = {'label_0':0, 'label_1':0, 'label_2':0}
false_negative = {'label_0':0, 'label_1':0, 'label_2':0}

i = 0
j = 0 
k = 0 
while i < len(mistral_articles) and j < len(mistral_dataset): 

    if mistral_articles[i] == mistral_dataset[str(j)]['labels'][k]:
        if mistral_articles[i] == 0: 
            true_positive['label_0'] += 1 
        elif mistral_articles[i] == 1: 
            true_positive['label_1'] += 1 
        else: 
            true_positive['label_2'] += 1 
    

In [30]:
# Prompt with more instructions
from collections import Counter
label_counts = [None for _ in range(8)]
total_label_count = Counter()
for i in range(8):
    label_counts[i] = Counter(mistral_dataset['0']['labels'])
    total_label_count += label_counts[i]

print("Total Left Content:",total_label_count[0])
print("Total Center Content:",total_label_count[1])
print("Total Right Counter:",total_label_count[2])
print("----------------------------")
print("Left Classified:",left_counter)
print("Center Classified ",center_counter)
print("Right Classified", right_counter)

Total Left Content: 1640
Total Center Content: 1808
Total Right Counter: 2168
----------------------------
Left Classified: 1035
Center Classified  3254
Right Classified 1331


### Original Articles

In [21]:
"""ORIGINAL DATASET"""
dataset = load_from_disk('/home/t/tzelilai/Desktop/Thesis/Llama-3.2-1B/articles_dataset_les-than-7000-tokens-splitted/test')

In [25]:
print(len(dataset['0']))

936


In [26]:
correct_predictions = 0
original_articles = [[] for i in range(6)]

for i in range(len(dataset)):
    for article in dataset[str(i)]: 
        llm_predict = pipeline(article['content'], return_all_scores=True)
        # llm_label = llm_predict[0]['label']
        original_articles[i].append(llm_predict[0])
    #     if article['labels'] == 0:
    #         article_label = 'LABEL_0'
    #     elif article['labels'] == 1: 
    #         article_label = 'LABEL_1'
    #     else:
    #         article_label = 'LABEL_2'

    #     if article_label == llm_label: 
    #         correct_predictions += 1 

    # accuracy = correct_predictions / len(dataset)
    # print("Accuracy of LLM is: ", accuracy)




In [32]:
import json 
with open("original_article_outputs.json", "w", encoding="utf-8") as file: 
    json.dump(original_articles, file, indent=4)

In [35]:
with open("mistral_outputs.json", "r") as file: 
    data = json.load(file)

total_articles = 0
for batch in data: 
    total_articles += len(batch)
print(total_articles)

5620
