In [1]:
import os, json 
from datasets import DatasetDict, ClassLabel, Dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Directory containing the articles, one JSON file per article
articles_path = '/home/t/tzelilai/Desktop/Thesis/Article-Bias-Prediction-main/data/jsons'

# Accumulates one {"content", "labels"} record per article
data = []

# os.listdir yields entries in arbitrary, filesystem-dependent order. Sorting
# pins the record order so the seeded train/test split that consumes `data`
# produces the same partitions on every machine and every run.
for file_name in sorted(os.listdir(articles_path)):
    if not file_name.endswith(".json"):
        continue
    # Decode explicitly as UTF-8; the default text encoding of open() depends
    # on the locale and may mis-decode or reject non-ASCII article text.
    with open(os.path.join(articles_path, file_name), "r", encoding="utf-8") as f:
        json_data = json.load(f)
    # Keep only the article text and its bias annotation
    format_data = {"content": json_data["content"], "labels": json_data["bias_text"]}
    data.append(format_data)

In [5]:
# Hold out 20% of the records for testing, then carve 20% of what remains
# off as the evaluation split (same seed for both cuts).
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, eval_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Wrap each list of records in a Dataset object (train, eval, test)
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_list(records) for records in (train_data, eval_data, test_data)
)

In [6]:
# Label vocabulary; the position in `names` is the integer id
# ("left" -> 0, "center" -> 1, "right" -> 2)
class_label = ClassLabel(num_classes=3, names=["left", "center", "right"])


def encode_labels(example):
    """Replace the string stored under ``example["labels"]`` with its ClassLabel integer id."""
    label_id = class_label.str2int(example["labels"])
    example["labels"] = label_id
    return example


# Encode the labels of every split, in the order train, eval, test
train_dataset, eval_dataset, test_dataset = (
    split.map(encode_labels)
    for split in (train_dataset, eval_dataset, test_dataset)
)


Map: 100%|██████████| 24034/24034 [00:01<00:00, 14592.88 examples/s]
Map: 100%|██████████| 6009/6009 [00:00<00:00, 14450.62 examples/s]
Map: 100%|██████████| 7511/7511 [00:00<00:00, 15279.14 examples/s]


In [7]:
# Bundle the three splits (train / test / eval) into a single DatasetDict
dataset = DatasetDict(
    {
        "train": train_dataset,
        "test": test_dataset,
        "eval": eval_dataset,
    }
)

In [8]:
# Show the split names with their columns and row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['content', 'labels'],
        num_rows: 24034
    })
    test: Dataset({
        features: ['content', 'labels'],
        num_rows: 7511
    })
    eval: Dataset({
        features: ['content', 'labels'],
        num_rows: 6009
    })
})


In [9]:
# Write all splits of the DatasetDict to the directory "articles_dataset_modified"
dataset.save_to_disk("articles_dataset_modified")

Saving the dataset (1/1 shards): 100%|██████████| 24034/24034 [00:00<00:00, 89568.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7511/7511 [00:00<00:00, 87248.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6009/6009 [00:00<00:00, 78308.93 examples/s]


In [2]:
from datasets import load_from_disk

# Reload the dataset written by the save_to_disk cell above. The path has to
# match the one used there ("articles_dataset_modified"); the previous value
# "articles_dataset" named a directory this notebook never creates, so a fresh
# Restart & Run All would either fail or silently analyse stale data.
loaded_dataset = load_from_disk("articles_dataset_modified")

# len() of every training article's content, used for the size buckets below
lengths = [len(text) for text in loaded_dataset['train']['content']]

In [None]:
# Bucket the article lengths: below 5000 counts as small, above 10192 as
# large; anything in between lands in neither list.
small = [length for length in lengths if length < 5000]
large = [length for length in lengths if length > 10192]

print("Small articles are: ", len(small))
print("Large articles are: ", len(large))

In [3]:
# Show the columns and row count of the reloaded test split
print(loaded_dataset['test'])

Dataset({
    features: ['content', 'labels'],
    num_rows: 7511
})
