In [1]:
import os, json 
from datasets import DatasetDict, ClassLabel, Dataset, load_from_disk
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding one JSON file per article; each file is read for its
# "content" and "bias_text" fields.
articles_path = '/home/t/tzelilai/Desktop/Thesis/Article-Bias-Prediction-main/data/jsons'

# Accumulates one {"content", "labels"} record per accepted article.
data = []
i = 0  # number of JSON files inspected
j = 0  # number of articles kept after the length filter
# Iterate in sorted order: os.listdir() returns entries in arbitrary order, and a
# seeded train/test split is only reproducible if `data` is built in a stable order.
for file_name in sorted(os.listdir(articles_path)):
    if not file_name.endswith(".json"):
        continue
    i += 1
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(os.path.join(articles_path, file_name), "r", encoding="utf-8") as f:
        json_data = json.load(f)
    # NOTE: len() counts characters here, not tokens.
    if len(json_data["content"]) <= 7000:
        j += 1
        data.append({"content": json_data["content"], "labels": json_data["bias_text"]})

In [3]:
# Report how many JSON files were read (i) and how many passed the length filter (j).
print(f"number of articles checked: {i}")
print(f"number of articles accepted: {j}")

number of articles checked: 37554
number of articles accepted: 28100


In [4]:
# Hold out 20% of the articles for testing, then carve 20% of the remainder off as
# the evaluation set; both splits use a fixed seed.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)
train_data, eval_data = train_test_split(train_data, random_state=42, test_size=0.2)

# Wrap each list of records in a Dataset object.
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_list(records) for records in (train_data, eval_data, test_data)
)

In [5]:
# Label vocabulary; integer ids follow the order of `names`.
class_label = ClassLabel(names=["left", "center", "right"], num_classes=3)


def encode_labels(example):
    """Replace the string in ``example["labels"]`` with its ClassLabel integer id."""
    label_name = example["labels"]
    example["labels"] = class_label.str2int(label_name)
    return example


# Apply the encoding to every split, in train / eval / test order.
train_dataset, eval_dataset, test_dataset = (
    split.map(encode_labels) for split in (train_dataset, eval_dataset, test_dataset)
)


Map: 100%|██████████| 17984/17984 [00:00<00:00, 26150.11 examples/s]
Map: 100%|██████████| 4496/4496 [00:00<00:00, 26907.97 examples/s]
Map: 100%|██████████| 5620/5620 [00:00<00:00, 26509.54 examples/s]


In [13]:
# Partition the test set into `size` contiguous, non-overlapping shards.
n = len(test_dataset)
size = 8
part_size = n // size  # rows per shard; the last shard also absorbs the remainder
splits = []

# Use a dedicated loop variable: `i` already holds the count of articles checked
# (printed in an earlier cell), and reusing it here would silently overwrite it.
for split_idx in range(size):
    start = split_idx * part_size
    # The final shard runs to the end so the n % size leftover rows are not dropped.
    end = n if split_idx == size - 1 else (split_idx + 1) * part_size
    splits.append(test_dataset.select(range(start, end)))

In [14]:
# Show each shard's summary (features and row count).
print(splits)

[Dataset({
    features: ['content', 'labels'],
    num_rows: 702
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 702
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 702
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 702
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 702
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 702
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 702
}), Dataset({
    features: ['content', 'labels'],
    num_rows: 706
})]


In [15]:
# Bundle the splits: "train" and "eval" are single datasets, while "test" is a
# nested DatasetDict keyed by shard index as a string ("0", "1", ...).
dataset = DatasetDict(
    {
        "train": train_dataset,
        "test": DatasetDict(
            {str(idx): part for idx, part in enumerate(splits)}
        ),
        "eval": eval_dataset,
    }
)

In [16]:
# Persist the whole DatasetDict to a directory given as a relative path
# (resolved against the current working directory).
dataset.save_to_disk("articles_dataset_les-than-7000-tokens-splitted-mistral")

Saving the dataset (1/1 shards): 100%|██████████| 17984/17984 [00:00<00:00, 155692.17 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 702/702 [00:00<00:00, 60538.30 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 702/702 [00:00<00:00, 63530.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 702/702 [00:00<00:00, 60513.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 702/702 [00:00<00:00, 60458.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 702/702 [00:00<00:00, 59804.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 702/702 [00:00<00:00, 59621.37 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 702/702 [00:00<00:00, 59824.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 706/706 [00:00<00:00, 56329.37 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4496/4496 [00:00<00:00, 101386.53 examples/s]


In [6]:
# Load the contents of a saved dataset's "test" subdirectory from disk.
# NOTE(review): this absolute path (Llama-3.2-1B/...-splitted) is not the directory
# written by the save_to_disk call in this notebook (...-splitted-mistral) — confirm
# this is the dataset that is meant to be inspected here.
test_sp_dataset = load_from_disk("/home/t/tzelilai/Desktop/Thesis/Llama-3.2-1B/articles_dataset_les-than-7000-tokens-splitted/test")


In [7]:
# Inspect the entry stored under key '0' of the loaded object.
print(test_sp_dataset['0'])

Dataset({
    features: ['content', 'labels'],
    num_rows: 936
})
