In [1]:
import os, json 
from datasets import DatasetDict, ClassLabel, Dataset, load_from_disk
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Directory holding one JSON file per article.
articles_path = '/home/t/tzelilai/Desktop/Thesis/Article-Bias-Prediction-main/data/jsons'
# Distinct topics seen among the accepted articles.
topics = set()
# Topic allow-list. It is NOT applied by the filter below; earlier experiments
# apparently restricted articles to topics such as "politics", "elections" or
# "abortions" -- TODO confirm whether that filter should be re-enabled.
accept_topics = {"politics"}
# Accepted articles, each reduced to {"content": <text>, "labels": <bias_text>}.
data = []
i = 0  # number of JSON files read
j = 0  # number of articles that passed the length filter

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# order of `data` (and everything derived from it) differ between runs/machines.
for file_name in sorted(os.listdir(articles_path)):
    if not file_name.endswith(".json"):
        continue
    # Explicit encoding so decoding does not depend on the machine's locale.
    with open(os.path.join(articles_path, file_name), "r", encoding="utf-8") as f:
        i += 1
        json_data = json.load(f)

    # NOTE(review): this limit is in CHARACTERS (len of the string), not words,
    # while the saved dataset directories are named "4500_words_..." -- confirm
    # which limit is intended.
    if len(json_data["content"]) <= 12000:
        topics.add(json_data["topic"])
        j += 1
        format_data = {"content": json_data["content"], "labels": json_data["bias_text"]}
        data.append(format_data)

In [12]:
# Summary of the loading pass: files read vs. articles kept by the length filter.
print(
    f"number of articles checked: {i}",
    f"number of articles accepted: {j}",
    sep="\n",
)

number of articles checked: 37554
number of articles accepted: 35504


In [10]:
# Tally how many loaded articles carry each of the three bias labels.
# Articles with any other label value are not counted.
left = sum(1 for entry in data if entry["labels"] == "left")
center = sum(1 for entry in data if entry["labels"] == "center")
right = sum(1 for entry in data if entry["labels"] == "right")

print(left, center, right)


12726 10731 13658


In [12]:
import random

# Balance the classes by undersampling: keep as many articles per label as the
# smallest class has, chosen at random.
#
# A dedicated, seeded generator makes the selection reproducible for a given
# order of `data` (the train/test split below already uses random_state=42) and
# leaves the global `random` state untouched.
rng = random.Random(42)

# `data` is a list of dicts with a "labels" key; bucket the articles by label.
# Articles whose label is not one of the three below are dropped.
left_articles = []
center_articles = []
right_articles = []

for article in data:
    if article["labels"] == "left":
        left_articles.append(article)
    elif article["labels"] == "center":
        center_articles.append(article)
    elif article["labels"] == "right":
        right_articles.append(article)

# Shuffle each bucket so that the prefix kept below is a random sample.
rng.shuffle(left_articles)
rng.shuffle(center_articles)
rng.shuffle(right_articles)

# Size of the smallest class.
min_size = min(len(left_articles), len(center_articles), len(right_articles))

# Truncate every bucket to that size.
left_balanced = left_articles[:min_size]
center_balanced = center_articles[:min_size]
right_balanced = right_articles[:min_size]

# Merge the buckets and shuffle again so the labels are interleaved.
balanced_data_undersampled = left_balanced + center_balanced + right_balanced

rng.shuffle(balanced_data_undersampled)

print(f"New balanced dataset size: {len(balanced_data_undersampled)}")

# Downstream cells read `data`, so it is rebound to the balanced articles.
# NOTE: after this line the unbalanced list is no longer reachable via `data`.
data = balanced_data_undersampled

New balanced dataset size: 12840


In [13]:
# Per-class sizes after undersampling.
for heading, subset in (
    ("Left:", left_balanced),
    ("Center:", center_balanced),
    ("Right:", right_balanced),
):
    print(heading, len(subset))

Left: 4280
Center: 4280
Right: 4280


In [14]:
# Hold out 15% of the articles in `data` as the test set, stratified by label.
labels = [record["labels"] for record in data]
train_data, test_data = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
    stratify=labels,
)

# Carve a stratified 15% evaluation set out of the remaining training articles.
train_labels = [record["labels"] for record in train_data]
train_data, eval_data = train_test_split(
    train_data,
    test_size=0.15,
    random_state=42,
    stratify=train_labels,
)

# Wrap each list of dicts in a Dataset object (train, eval, test -- in that order).
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_list(rows) for rows in (train_data, eval_data, test_data)
)

In [15]:
# Number of examples in each split.
for heading, subset in (
    ("Training:", train_dataset),
    ("Eval: ", eval_dataset),
    ("Test: ", test_dataset),
):
    print(heading, len(subset))

Training: 9276
Eval:  1638
Test:  1926


In [17]:
# Label vocabulary used to turn the string labels into integer ids.
class_label = ClassLabel(num_classes=3, names=["left", "center", "right"])


def encode_labels(example):
    """Replace the string bias label of one example with its integer id.

    Args:
        example: a single record with a string under the "labels" key.

    Returns:
        The same record, with "labels" overwritten by the id that
        `class_label.str2int` assigns to the original string.
    """
    label_id = class_label.str2int(example["labels"])
    example["labels"] = label_id
    return example


# Encode the labels of every split (train, eval, test -- in that order).
train_dataset, eval_dataset, test_dataset = (
    split.map(encode_labels)
    for split in (train_dataset, eval_dataset, test_dataset)
)


Map: 100%|██████████| 9276/9276 [00:01<00:00, 8158.83 examples/s] 
Map: 100%|██████████| 1638/1638 [00:00<00:00, 8643.83 examples/s] 
Map: 100%|██████████| 1926/1926 [00:00<00:00, 12084.69 examples/s]


In [18]:
# Bundle the three splits into one DatasetDict keyed by split name.
dataset = DatasetDict(
    {
        "train": train_dataset,
        "test": test_dataset,
        "eval": eval_dataset,
    }
)

In [19]:
# Persist the train/test/eval DatasetDict to disk.
# NOTE(review): absolute, machine-specific path -- consider a configurable base directory.
dataset.save_to_disk("/home/t/tzelilai/Desktop/Thesis/Datasets/4500_words_evenly_splitted")

Saving the dataset (1/1 shards): 100%|██████████| 9276/9276 [00:00<00:00, 108977.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1926/1926 [00:00<00:00, 65492.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1638/1638 [00:00<00:00, 66457.11 examples/s]


In [20]:
# NOTE(review): `load_from_disk` is already imported in the first cell; this
# import is redundant but harmless.
from datasets import load_from_disk 

# Reload the DatasetDict from the directory it was saved to.
# NOTE(review): the next cell partitions the in-memory `test_dataset`, not
# `dataset["test"]` -- confirm which one is intended.
dataset = load_from_disk("/home/t/tzelilai/Desktop/Thesis/Datasets/4500_words_evenly_splitted")

In [21]:
# Partition the test split into `size` contiguous parts of (nearly) equal length.
# Rows are taken in their existing order; the parts are not re-stratified.
n = len(test_dataset)
size = 6
part_size = n // size
splits = []

# The loop variable is deliberately not called `i`: at notebook level that name
# holds the "articles checked" counter from the loading cell, and reusing it
# here would silently overwrite that counter.
for part_idx in range(size):
    start = part_idx * part_size
    # The last part absorbs the remainder when n is not divisible by `size`.
    end = n if part_idx == size - 1 else start + part_size
    splits.append(test_dataset.select(range(start, end)))

In [22]:
# Inspect the partitions: prints the repr of every Dataset in `splits`.
print(splits)

[Dataset({
    features: ['content', 'labels'],
    num_rows: 321
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 321
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 321
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 321
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 321
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 321
})]


In [23]:
# Rebuild the DatasetDict with the test split replaced by a nested DatasetDict
# whose keys are the partition indices as strings ("0", "1", ...).
test_parts = DatasetDict({str(idx): part for idx, part in enumerate(splits)})
dataset = DatasetDict(
    {
        "train": train_dataset,
        "test": test_parts,
        "eval": eval_dataset,
    }
)

In [24]:
# Persist the DatasetDict that contains the partitioned test split.
# NOTE(review): this is a relative path (resolved against the current working
# directory), unlike the absolute path used for the first save -- confirm the
# intended location.
dataset.save_to_disk("4500_words_evenly_splitted_2")

Saving the dataset (0/1 shards):   0%|          | 0/9276 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 9276/9276 [00:00<00:00, 109253.17 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321/321 [00:00<00:00, 20969.56 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321/321 [00:00<00:00, 20045.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321/321 [00:00<00:00, 23974.28 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321/321 [00:00<00:00, 21752.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321/321 [00:00<00:00, 20267.83 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321/321 [00:00<00:00, 23601.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1638/1638 [00:00<00:00, 70506.25 examples/s]


In [6]:
# Load the "test" sub-directory of a saved, partitioned dataset.
# NOTE(review): this directory is not one of the paths written by the save
# cells in this notebook -- confirm which dataset is meant to be inspected.
test_sp_dataset = load_from_disk("/home/t/tzelilai/Desktop/Thesis/Llama-3.2-1B/articles_dataset_les-than-7000-tokens-splitted/test")


In [7]:
# Show the partition stored under the string key '0'.
print(test_sp_dataset['0'])

Dataset({
    features: ['content', 'labels'],
    num_rows: 936
})


In [2]:
# NOTE(review): loads the same path as `test_sp_dataset` under a second name;
# the execution counts suggest this cell was run in a different session --
# consider keeping a single load.
my_dataset = load_from_disk("/home/t/tzelilai/Desktop/Thesis/Llama-3.2-1B/articles_dataset_les-than-7000-tokens-splitted/test")

In [6]:
# Print one full record (content and label) from partition '0'.
# NOTE(review): 935 is a hard-coded index; it raises if the partition has
# fewer than 936 rows.
print(my_dataset['0'][935])

{'content': "Washington ( CNN ) Americans are increasingly unhappy with President Barack Obama 's handling of ISIS , and a growing share of the nation believes that fight is going badly , according to a new CNN/ORC survey released Monday .\nFifty-seven percent disapprove of his handling of foreign affairs more broadly , and 54 % disapprove of how the President is handling terrorism . Another 60 % rate Obama negatively on his handling of electronic national security .\nThe declining approval ratings for Obama on national security come as a weekend of international turmoil further underscores the growing threats abroad .\nAnd Egypt launched a second round of airstrikes against Islamic State strongholds in Libya on Monday , in retaliation for a video released Sunday that appeared to show ISIS militants beheading a group of 21 Egyptian Christians .\nObama issued a statement condemning the killing of the Christians on Sunday night , though Obama 's Republican opponents have consistently mad