<a href="https://colab.research.google.com/github/swan-07/authorship-verification/blob/main/Authorship_Verification_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import itertools
import random
import spacy
from tqdm import tqdm
from sklearn.model_selection import train_test_split
#same author = 1, diff = 0
import json

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# Module-level spaCy pipeline; replace_named_entities reads this `nlp` global.
nlp = spacy.load('en_core_web_sm')

In [None]:
  def replace_named_entities(texts):
      """Replace every named entity in each text with its spaCy entity label.

      Parameters:
      texts: A sized collection of strings (``len(texts)`` is used for the
          progress bar total).

      Returns:
      list: One rewritten string per input text, in input order.
      """
      anonymized = []

      # Stream the texts through the module-level `nlp` pipeline in batches of 50.
      for parsed in tqdm(nlp.pipe(texts, batch_size=50), total=len(texts)):
          raw = parsed.text
          pieces = []
          cursor = 0
          for entity in parsed.ents:
              # Keep the text between entities, then emit the label in place of the entity.
              pieces.extend((raw[cursor:entity.start_char], entity.label_))
              cursor = entity.end_char
          # Text after the last entity (or the whole text if there were none).
          pieces.append(raw[cursor:])
          anonymized.append("".join(pieces))

      return anonymized

In [None]:
def lengths_df(df):
    """Print length statistics for a DataFrame of text pairs.

    Parameters:
    df (pd.DataFrame): Frame with 'text1' and 'text2' columns of strings.

    Prints the mean character length of each column, the mean of those two
    means, and the number of rows. Returns None.
    """
    mean_len_first = df['text1'].map(len).mean()
    mean_len_second = df['text2'].map(len).mean()

    # Unweighted mean of the two column means.
    mean_len_overall = (mean_len_first + mean_len_second) / 2
    n_rows = len(df)

    report = (
        f'Average length of text1: {mean_len_first:.2f} characters',
        f'Average length of text2: {mean_len_second:.2f} characters',
        f'Overall average length of texts: {mean_len_overall:.2f} characters',
        f'Total number of rows: {n_rows}',
    )
    for line in report:
        print(line)

In [None]:
from itertools import combinations

def read_texts(directory: str, encoding=None):
    """Read every ``.txt`` file in *directory* and group the contents by author.

    The author ID is the part of the file name before the first ``.`` with its
    last character removed (e.g. ``bob1.txt`` -> ``bob``).

    Parameters:
    directory (str): Folder containing the ``.txt`` files.
    encoding (str | None): Encoding passed to ``open``. ``None`` (the default)
        keeps the platform default, matching the previous behaviour.

    Returns:
    dict: Maps author ID to the list of that author's texts, in sorted
        file-name order.
    """
    texts_by_author = {}

    # os.listdir returns entries in arbitrary order; sort so the result (and any
    # seeded shuffling done on it later) is reproducible across machines.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.txt'):
            continue
        # Drop the extension, then the trailing character of the stem.
        author_id = file_name.split('.')[0][:-1]
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding=encoding) as handle:
            texts_by_author.setdefault(author_id, []).append(handle.read())

    return texts_by_author


In [None]:
def create_pairs(data, reserve_ratio=0.5):
    """Build a balanced, shuffled list of same-author and different-author text pairs.

    Each text is used in at most one pair. For every label, a ``reserve_ratio``
    share of its texts is set aside for different-author pairs and the rest are
    paired with each other.

    Parameters:
    data (dict): Maps a hashable label (author) to a list of texts. The lists
        are not modified.
    reserve_ratio (float): Fraction of each label's texts reserved for
        different-author pairs.

    Returns:
    list: Tuples ``(text1, text2, same)`` with ``same`` equal to 1 or 0, holding
        equally many pairs of each kind, in random order.
    """
    same_author_pairs = []
    reserved_texts = []

    for label, texts in data.items():
        # Shuffle a copy so the caller's lists keep their order.
        shuffled = list(texts)
        random.shuffle(shuffled)
        num_reserve = int(len(shuffled) * reserve_ratio)

        reserved_texts.extend((label, text) for text in shuffled[:num_reserve])

        # Pair off the non-reserved texts (same = 1); an odd leftover is dropped.
        available_texts = shuffled[num_reserve:]
        while len(available_texts) > 1:
            text1 = available_texts.pop()
            text2 = available_texts.pop()
            same_author_pairs.append((text1, text2, 1))

    # Number of reserved texts still unpaired, per label. Once only one label is
    # left no different-author pair can be formed, so the loop below must stop
    # (sampling until "fewer than two texts remain" would spin forever).
    remaining = {}
    for label, _ in reserved_texts:
        remaining[label] = remaining.get(label, 0) + 1

    # Create pairs of texts with different labels (same = 0)
    different_author_pairs = []
    while len(remaining) > 1:
        first, second = random.sample(range(len(reserved_texts)), 2)
        label1, text1 = reserved_texts[first]
        label2, text2 = reserved_texts[second]
        if label1 == label2:
            continue
        different_author_pairs.append((text1, text2, 0))
        # Swap-with-last removal, higher index first, keeps each removal O(1).
        for index in sorted((first, second), reverse=True):
            reserved_texts[index] = reserved_texts[-1]
            reserved_texts.pop()
        for label in (label1, label2):
            remaining[label] -= 1
            if not remaining[label]:
                del remaining[label]

    print(f'Same author pairs: {len(same_author_pairs)}')
    print(f'Different author pairs: {len(different_author_pairs)}')

    # Balance the number of pairs
    min_size = min(len(same_author_pairs), len(different_author_pairs))
    balanced_pairs = (
        random.sample(same_author_pairs, min_size)
        + random.sample(different_author_pairs, min_size)
    )

    # Mix the two kinds of pairs
    random.shuffle(balanced_pairs)

    return balanced_pairs

In [None]:

def train_test_val_split(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=None):
    """Split *df* into train, validation and test parts.

    Parameters:
    df: Data accepted by ``sklearn.model_selection.train_test_split``.
    train_size, val_size, test_size (float): Fractions that must sum to 1.0.
    random_state: Passed to both underlying ``train_test_split`` calls.

    Returns:
    tuple: ``(train_df, val_df, test_df)``.

    Raises:
    ValueError: If the three fractions do not sum to 1.0 (within 1e-9).
    """
    # Compare with a tolerance: exact float equality rejects valid inputs such
    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999. An explicit
    # raise also keeps the check active under ``python -O``.
    total = train_size + val_size + test_size
    if abs(total - 1.0) > 1e-9:
        raise ValueError(f"Train, validation, and test sizes must add up to 1.0 (got {total})")

    # Split the DataFrame into train and temp (val + test)
    train_df, temp_df = train_test_split(df, train_size=train_size, random_state=random_state)

    # Split the temp DataFrame; val's share is expressed relative to val + test.
    relative_val_size = val_size / (val_size + test_size)
    val_df, test_df = train_test_split(temp_df, train_size=relative_val_size, random_state=random_state)

    return train_df, val_df, test_df




In [None]:
import pandas as pd

def parse_training_text(file_path):
    """Parse a file of ``<text file=...>`` entries into a DataFrame.

    Each entry must contain an ``<author id="..."/>`` tag and a
    ``<body>...</body>`` section; entries missing either are reported and
    skipped.

    Parameters:
    file_path: Path of the UTF-8 encoded file to parse.

    Returns:
    pd.DataFrame: Columns 'text' (body with whitespace runs collapsed to single
        spaces) and 'id' (author ID string).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    texts = []
    author_ids = []

    # The chunk before the first '<text file=' marker is not an entry.
    for chunk in raw.split('<text file=')[1:]:
        try:
            author = chunk.split('<author id="')[1].split('"/>')[0]
            body = chunk.split('<body>')[1].split('</body>')[0]
        except IndexError as e:
            # A missing opening tag leaves the split with a single element.
            print(f"Skipping malformed entry due to IndexError: {e}")
            continue

        # split() with no argument also trims the ends, so this normalizes all whitespace.
        texts.append(' '.join(body.split()))
        author_ids.append(author)

    return pd.DataFrame({'text': texts, 'id': author_ids})


In [None]:
def create_unique_text_pairs(data, reserve_ratio=0.5):
    """Build balanced same-author / different-author pairs in which no text repeats.

    Parameters:
    data (dict): Maps a hashable label (author) to a list of texts. The lists
        are left untouched.
    reserve_ratio (float): Fraction of each label's texts held back for
        different-author pairs; the remainder forms same-author pairs.

    Returns:
    list: Shuffled tuples ``(text1, text2, same)`` (``same`` is 1 or 0) with the
        same number of pairs of each kind.

    Prints the number of same-author pairs and then the number of
    different-author pairs found before balancing.
    """
    same_author_pairs = []
    pool = []  # (label, text) entries reserved for different-author pairs

    for label, texts in data.items():
        # Work on a shuffled copy; shuffling in place would reorder the caller's data.
        candidates = random.sample(list(texts), len(texts))
        num_reserve = int(len(candidates) * reserve_ratio)
        pool.extend((label, text) for text in candidates[:num_reserve])

        # Create pairs of texts with the same label (same = 1)
        available_texts = candidates[num_reserve:]
        while len(available_texts) > 1:
            text1 = available_texts.pop()
            text2 = available_texts.pop()
            same_author_pairs.append((text1, text2, 1))

    # Track how many pooled texts each label still has. When a single label is
    # left, every further draw would be a same-label pair, so sampling must end
    # there instead of looping indefinitely.
    per_label = {}
    for label, _ in pool:
        per_label[label] = per_label.get(label, 0) + 1

    # Create pairs of texts with different labels (same = 0)
    different_author_pairs = []
    while len(per_label) > 1:
        i, j = random.sample(range(len(pool)), 2)
        (label1, text1), (label2, text2) = pool[i], pool[j]
        if label1 == label2:
            continue
        different_author_pairs.append((text1, text2, 0))
        # Remove both entries in O(1) each: overwrite with the tail, then pop,
        # handling the larger index first so the smaller one stays valid.
        hi, lo = max(i, j), min(i, j)
        pool[hi] = pool[-1]
        pool.pop()
        pool[lo] = pool[-1]
        pool.pop()
        for label in (label1, label2):
            per_label[label] -= 1
            if per_label[label] == 0:
                del per_label[label]

    print(len(same_author_pairs))
    print(len(different_author_pairs))

    # Balance the number of pairs
    min_size = min(len(same_author_pairs), len(different_author_pairs))
    balanced_same_author_pairs = random.sample(same_author_pairs, min_size)
    balanced_different_author_pairs = random.sample(different_author_pairs, min_size)

    # Combine and shuffle the pairs
    balanced_pairs = balanced_same_author_pairs + balanced_different_author_pairs
    random.shuffle(balanced_pairs)

    return balanced_pairs

In [None]:

def process_darkreddit(directory: str):
    """Load the darkreddit train/test/val JSON pairs and replace named entities.

    Expects ``directory`` to contain ``train``, ``test`` and ``val`` sub-folders
    of ``.json`` files, each holding a ``'pair'`` entry (two texts) and a
    ``'same'`` flag.

    Entity handling goes through ``replace_named_entities`` (defined earlier in
    this file), one batch per split. The previous code called
    ``remove_named_entities``, which is not defined anywhere in this file.
    NOTE(review): confirm that replacing entities with their labels is the
    intended behaviour for this dataset.

    Parameters:
    directory (str): Root folder of the dataset.

    Returns:
    tuple: ``(train_df, test_df, val_df)``, each with columns 'text1', 'text2'
        and 'same' (1 or 0).
    """
    splits = ['train', 'test', 'val']
    frames = {}

    for split in splits:
        split_dir = os.path.join(directory, split)
        # Sorted so the row order does not depend on the arbitrary os.listdir order.
        files = sorted(f for f in os.listdir(split_dir) if f.endswith('.json'))
        print(f'Number of JSON files in {split} split: {len(files)}')

        first_texts = []
        second_texts = []
        labels = []
        for file in tqdm(files, desc=f'Processing {split} files'):
            file_path = os.path.join(split_dir, file)
            with open(file_path, 'r') as f:
                content = json.load(f)
            first_texts.append(content['pair'][0])
            second_texts.append(content['pair'][1])
            labels.append(1 if content['same'] else 0)

        # One batched pass over both columns; split the result back at len(labels).
        cleaned = replace_named_entities(first_texts + second_texts)
        frames[split] = pd.DataFrame({
            'text1': cleaned[:len(labels)],
            'text2': cleaned[len(labels):],
            'same': labels,
        })

    return frames['train'], frames['test'], frames['val']

In [None]:
def create_balanced_pairs(df, reserve_ratio=0.5):
    """
    Create a balanced dataset with text pairs and a label indicating if they are from the same author.

    Each text is used in at most one pair. Per author, a ``reserve_ratio`` share
    of the texts is set aside for different-author pairs; the rest are paired
    with each other.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing 'id' and 'text' columns.
    reserve_ratio (float): Fraction of each author's texts reserved for
        different-author pairs.

    Returns:
    pd.DataFrame: A DataFrame with columns 'text1', 'text2', and 'same'.
    """
    # Group texts by author ID; groupby builds fresh lists, so shuffling them
    # below does not touch `df`.
    id_to_texts = df.groupby('id')['text'].apply(list).to_dict()

    same_author_pairs = []
    reserved_texts = []

    for author_id, texts in tqdm(id_to_texts.items(), desc="Creating same author pairs and reserving texts"):
        random.shuffle(texts)
        num_reserve = int(len(texts) * reserve_ratio)
        reserved_texts.extend((author_id, text) for text in texts[:num_reserve])

        # Pair consecutive non-reserved texts; zip drops an odd leftover.
        available_texts = texts[num_reserve:]
        same_author_pairs.extend(
            (first, second, 1)
            for first, second in zip(available_texts[::2], available_texts[1::2])
        )

    print(f"Number of same author pairs: {len(same_author_pairs)}")

    # Reserved texts still unpaired, per author. When only one author is left no
    # different-author pair exists, so sampling must stop there (looping until
    # fewer than two texts remain would never finish).
    remaining = {}
    for author_id, _ in reserved_texts:
        remaining[author_id] = remaining.get(author_id, 0) + 1

    # Create different author pairs using reserved texts
    different_author_pairs = []
    num_iterations = len(reserved_texts) // 2  # Upper bound on the number of pairs
    with tqdm(total=num_iterations, desc="Creating different author pairs") as pbar:
        while len(remaining) > 1:
            i, j = random.sample(range(len(reserved_texts)), 2)
            (id1, text1), (id2, text2) = reserved_texts[i], reserved_texts[j]
            if id1 == id2:
                continue
            different_author_pairs.append((text1, text2, 0))
            # O(1) removal: overwrite with the tail and pop, larger index first.
            for index in sorted((i, j), reverse=True):
                reserved_texts[index] = reserved_texts[-1]
                reserved_texts.pop()
            for author_id in (id1, id2):
                remaining[author_id] -= 1
                if not remaining[author_id]:
                    del remaining[author_id]
            pbar.update(1)

    print(f"Number of different author pairs: {len(different_author_pairs)}")

    # Balance the number of pairs
    min_size = min(len(same_author_pairs), len(different_author_pairs))
    balanced_same_author_pairs = random.sample(same_author_pairs, min_size)
    balanced_different_author_pairs = random.sample(different_author_pairs, min_size)

    # Combine and shuffle the pairs
    balanced_pairs = balanced_same_author_pairs + balanced_different_author_pairs
    random.shuffle(balanced_pairs)

    # Create a DataFrame from the balanced pairs
    balanced_df = pd.DataFrame(balanced_pairs, columns=['text1', 'text2', 'same'])

    return balanced_df

In [None]:
def download(df, df_name, output_dir='Desktop'):
    """Split *df* 70/15/15 (random_state=42) and write the three parts as CSV files.

    Files are written as ``<output_dir>/<df_name>_train.csv``,
    ``..._val.csv`` and ``..._test.csv`` without the index column.

    Parameters:
    df (pd.DataFrame): Frame to split and export.
    df_name (str): Prefix for the output file names.
    output_dir (str): Target folder; created if it does not exist. Defaults to
        'Desktop', the location that used to be hard-coded.
    """
    # Split the combined dataframe
    train, val, test = train_test_val_split(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42)

    # Display the sizes of the resulting DataFrames
    print(f'Train set size: {len(train)}')
    print(f'Validation set size: {len(val)}')
    print(f'Test set size: {len(test)}')

    # to_csv does not create missing folders.
    os.makedirs(output_dir, exist_ok=True)
    for split_name, split_df in (('train', train), ('val', val), ('test', test)):
        split_df.to_csv(os.path.join(output_dir, f'{df_name}_{split_name}.csv'), index=False)

In [None]:
def process_folder(directory):
    """Load known/unknown text pairs from a folder-per-problem corpus.

    ``directory`` must contain ``truth.txt`` (lines of ``<folder> <Y|N>``) and
    one sub-folder per problem holding ``known01.txt`` and ``unknown.txt``.

    Parameters:
    directory: Root folder of the corpus split.

    Returns:
    pd.DataFrame: Columns 'text1' (known text), 'text2' (unknown text) and
        'same' (bool), one row per sub-folder in sorted folder-name order.
    """
    # Reading the truth file
    truth_file_path = os.path.join(directory, 'truth.txt')
    labels = {}
    with open(truth_file_path, 'r') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 2:  # Lines that are not exactly "<folder> <flag>" are ignored
                folder_name, same_author = parts
                labels[folder_name] = same_author == 'Y'

    data = []

    # os.listdir order is arbitrary; sorting keeps the row order, and therefore
    # any later seeded split of the result, reproducible.
    for folder in sorted(os.listdir(directory)):
        folder_path = os.path.join(directory, folder)
        if not os.path.isdir(folder_path):
            continue

        known_file_path = os.path.join(folder_path, 'known01.txt')
        unknown_file_path = os.path.join(folder_path, 'unknown.txt')

        # errors='ignore' drops bytes that cannot be decoded instead of raising.
        with open(known_file_path, 'r', errors='ignore') as file:
            text1 = file.read()

        with open(unknown_file_path, 'r', errors='ignore') as file:
            text2 = file.read()

        same = labels.get(folder, False)  # Default to False if folder not found in truth.txt
        data.append((text1, text2, same))

    df = pd.DataFrame(data, columns=['text1', 'text2', 'same'])
    return df

In [None]:
def lengths_df(df):
    """Print mean character lengths of 'text1' and 'text2' plus the row count.

    This re-declares the helper of the same name defined earlier in this file.

    Parameters:
    df (pd.DataFrame): Frame with string columns 'text1' and 'text2'.
    """
    # Mean length per column, computed in column order.
    means = {column: df[column].apply(len).mean() for column in ('text1', 'text2')}

    # Average of the two column means (not weighted by anything else).
    overall = (means['text1'] + means['text2']) / 2

    print(f"Average length of text1: {means['text1']:.2f} characters")
    print(f"Average length of text2: {means['text2']:.2f} characters")
    print(f"Overall average length of texts: {overall:.2f} characters")
    print(f"Total number of rows: {len(df)}")

In [None]:
def entityremove(df):
    """Replace named entities in both text columns of *df*, in place.

    Both columns go through ``replace_named_entities`` as one batch, the length
    statistics are printed with ``lengths_df``, and the same (modified) frame
    is returned.

    Parameters:
    df (pd.DataFrame): Frame with 'text1' and 'text2' columns.

    Returns:
    pd.DataFrame: The input frame with both columns overwritten.
    """
    row_count = len(df)
    combined = df['text1'].tolist() + df['text2'].tolist()
    cleaned = replace_named_entities(combined)

    # First half belongs to text1, second half to text2.
    df['text1'], df['text2'] = cleaned[:row_count], cleaned[row_count:]

    lengths_df(df)
    return df

In [None]:
#imdb
# Load the imdb62 parquet file from the Hugging Face hub path below.
imdb = pd.read_parquet("hf://datasets/tasksource/imdb62/data/train-00000-of-00001-62894f3b39974716.parquet")
# Keep the 'content' and 'userId' columns and rename them to the 'text'/'id'
# names that create_balanced_pairs groups on.
columns_to_keep = ["content", "userId"]
imdb = imdb[columns_to_keep]
imdb_df = imdb.rename(columns={'userId': 'id', 'content':'text'})
print(len(imdb_df))
# Drop rows whose text is empty or whitespace-only.
imdb_df = imdb_df[imdb_df['text'].str.strip().astype(bool)]
print(len(imdb_df))
# Recorded output (row count before, then after, the filter):
#61987
#61973

In [None]:
# Build balanced pairs with the default reserve_ratio (0.5).
balanced_imdb_df = create_balanced_pairs(imdb_df)
# Recorded output:
# Number of same author pairs: 15494

# Number of different author pairs: 15491


In [None]:
# Send both columns through replace_named_entities as one batch:
# text1 values first, then text2 values.
texts = balanced_imdb_df['text1'].tolist() + balanced_imdb_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
# Split the processed texts back into text1 and text2
balanced_imdb_df['text1'] = processed_texts[:len(balanced_imdb_df)]
balanced_imdb_df['text2'] = processed_texts[len(balanced_imdb_df):]
# `imdb` now refers to the processed pair frame (it previously held the raw columns).
imdb = balanced_imdb_df

In [None]:
# Preview the processed pairs and print their length statistics.
print(imdb.head())
lengths_df(imdb)
'''
Average length of text1: 1669.69 characters
Average length of text2: 1667.14 characters
Overall average length of texts: 1668.41 characters
Total number of rows: 30982
'''

In [None]:

# Split the imdb pair frame 70/15/15 with a fixed seed.
imdb_train, imdb_val, imdb_test = train_test_val_split(imdb, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42)

# Display the sizes of the resulting DataFrames
print(f'Train set size: {len(imdb_train)}')
print(f'Validation set size: {len(imdb_val)}')
print(f'Test set size: {len(imdb_test)}')
'''
Train set size: 21687
Validation set size: 4647
Test set size: 4648
'''

In [None]:
#arxiv
# Read arxiv.csv from the working directory, decoding it as latin1.
arxiv = pd.read_csv('arxiv.csv', encoding='latin1')
# Keep the 'abstract' and 'author' columns, renamed to 'text' and 'id'.
columns_to_keep = ["abstract", "author"]
arxiv = arxiv[columns_to_keep]
arxiv_df = arxiv.rename(columns={'author': 'id', 'abstract':'text'})

In [None]:
# Build balanced pairs with the default reserve_ratio (0.5).
balanced_arxiv_df = create_balanced_pairs(arxiv_df)
# Recorded output:
# Number of same author pairs: 352

# Number of different author pairs: 357


In [None]:
# One batched named-entity pass over text1 followed by text2.
texts = balanced_arxiv_df['text1'].tolist() + balanced_arxiv_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
# Split the processed texts back into text1 and text2
balanced_arxiv_df['text1'] = processed_texts[:len(balanced_arxiv_df)]
balanced_arxiv_df['text2'] = processed_texts[len(balanced_arxiv_df):]
# `arxiv` is rebound from the raw columns to the processed pair frame.
arxiv = balanced_arxiv_df

In [None]:
# Preview the processed pairs and print their length statistics.
print(arxiv.head())
lengths_df(arxiv)
'''
Average length of text1: 812.71 characters
Average length of text2: 793.66 characters
Overall average length of texts: 803.18 characters
Total number of rows: 704
'''

In [None]:

# Split the arxiv pair frame 70/15/15 with a fixed seed.
arxiv_train, arxiv_val, arxiv_test = train_test_val_split(arxiv, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42)

# Display the sizes of the resulting DataFrames
print(f'Train set size: {len(arxiv_train)}')
print(f'Validation set size: {len(arxiv_val)}')
print(f'Test set size: {len(arxiv_test)}')
'''
Train set size: 492
Validation set size: 106
Test set size: 106
'''

In [None]:
# Write the arxiv and imdb splits to the working directory. Unlike download(),
# these paths have no 'Desktop/' prefix.
arxiv_train.to_csv('arxiv_train.csv', index=False)
arxiv_val.to_csv('arxiv_val.csv', index=False)
arxiv_test.to_csv('arxiv_test.csv', index=False)

imdb_train.to_csv('imdb_train.csv', index=False)
imdb_val.to_csv('imdb_val.csv', index=False)
imdb_test.to_csv('imdb_test.csv', index=False)

In [None]:
#reuters
# NOTE(review): `reuters` is only assigned in a later cell of this file (after
# the named-entity replacement), so this cell has to be run after that one.
download(reuters, 'reuters')
'''
Train set size: 841
Validation set size: 180
Test set size: 181
'''

In [None]:
base_folder = 'Desktop/datasets/reuters50'
# NOTE(review): load_texts_from_folders is not defined in the visible part of
# this file. It presumably returns {author: [texts]}, the shape that
# create_unique_text_pairs iterates over. Confirm where it comes from.
data = load_texts_from_folders(base_folder)
balanced_pairs = create_unique_text_pairs(data)
# Recorded output (same-author count, then different-author count):
# 601
# 625
balanced_df = pd.DataFrame(balanced_pairs, columns=['text1', 'text2', 'same'])

In [None]:
# One batched named-entity pass over text1 followed by text2.
texts = balanced_df['text1'].tolist() + balanced_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
# Split the processed texts back into text1 and text2
balanced_df['text1'] = processed_texts[:len(balanced_df)]
balanced_df['text2'] = processed_texts[len(balanced_df):]
# `reuters` is the frame that the download(reuters, 'reuters') cell exports.
reuters = balanced_df

In [None]:
# Print length statistics for the processed reuters pairs.
lengths_df(reuters)
'''
Average length of text1: 2786.62 characters
Average length of text2: 2753.46 characters
Overall average length of texts: 2770.04 characters
Total number of rows: 1202
'''

In [None]:
#blogs

# Read the blogs CSV and keep only the 'id' and 'text' columns, which already
# carry the names create_balanced_pairs expects.
blogs = pd.read_csv('Desktop/datasets/blogs.csv')

columns_to_keep = ["id", "text"]

blogs_section = blogs[columns_to_keep]

In [None]:
# Drop rows with empty/whitespace-only text, then exact duplicate (id, text) rows.
blogs_section = blogs_section[blogs_section['text'].str.strip().astype(bool)]
blogs_section = blogs_section.drop_duplicates()
print(len(blogs_section))
# Recorded output:
#672735
# Only 10% of each author's texts are reserved for different-author pairs here
# (the other datasets use the default 0.5).
balanced_blog_df = create_balanced_pairs(blogs_section, reserve_ratio=0.1)
#Number of same author pairs: 301938
#Number of different author pairs: 29465

In [None]:
# One batched named-entity pass over text1 followed by text2.
texts = balanced_blog_df['text1'].tolist() + balanced_blog_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
# Split the processed texts back into text1 and text2
balanced_blog_df['text1'] = processed_texts[:len(balanced_blog_df)]
balanced_blog_df['text2'] = processed_texts[len(balanced_blog_df):]
# `blogs` is rebound from the raw CSV frame to the processed pair frame.
blogs = balanced_blog_df

In [None]:
# Preview the processed pairs and print their length statistics.
print(blogs.head())
lengths_df(blogs)
'''Average length of text1: 1107.91 characters
Average length of text2: 1063.41 characters
Overall average length of texts: 1085.66 characters
Total number of rows: 58930'''

In [None]:
# Split 70/15/15 and write Desktop/blogs_{train,val,test}.csv.
download(blogs, 'blogs')
# Train set size: 41251
# Validation set size: 8839
# Test set size: 8840

In [None]:
#victorian
# Read the CSV as latin1 and rename 'author' to the 'id' column used for grouping.
victorian_dset= pd.read_csv('Desktop/datasets/victorian.csv', encoding='latin1')
victorian_dset = victorian_dset.rename(columns={'author': 'id'})
# Drop rows whose text is empty or whitespace-only.
victorian_dset = victorian_dset[victorian_dset['text'].str.strip().astype(bool)]
balanced_victorian_df = create_balanced_pairs(victorian_dset)
'''
Creating same author pairs and reserving texts: 100%|█| 45/45 [00:00<00:00, 256.
Number of same author pairs: 21470
Creating different author pairs: 100%|████| 5359/5359 [00:01<00:00, 4050.85it/s]
Number of different author pairs: 5359'''

In [None]:
# One batched named-entity pass over text1 followed by text2, then split the
# result back into the two columns at len(df).
texts = balanced_victorian_df['text1'].tolist() + balanced_victorian_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
balanced_victorian_df['text1'] = processed_texts[:len(balanced_victorian_df)]
balanced_victorian_df['text2'] = processed_texts[len(balanced_victorian_df):]
victorian = balanced_victorian_df
#100%|███████████████████████████████████| 21436/21436 [1:25:52<00:00,  4.16it/s]


In [None]:
# Preview the processed pairs and print their length statistics.
print(victorian.head())
lengths_df(victorian)
'''Average length of text1: 4925.94 characters
Average length of text2: 4920.34 characters
Overall average length of texts: 4923.14 characters
Total number of rows: 10718'''

In [None]:
# Split 70/15/15 and write Desktop/victorian_{train,val,test}.csv.
download(victorian, 'victorian')
#Train set size: 7502
# Validation set size: 1608
# Test set size: 1608

In [None]:
#darkreddit
directory = 'Desktop/datasets/darkreddit'

# process_darkreddit returns its frames in (train, test, val) order.
darkreddit_train_df, darkreddit_test_df, darkreddit_val_df = process_darkreddit(directory)

# Preview the first rows of each split.
print("Train DataFrame:")
print(darkreddit_train_df.head())
print("Test DataFrame:")
print(darkreddit_test_df.head())
print("Val DataFrame:")
print(darkreddit_val_df.head())
'''
Number of JSON files in train split: 204
Processing train files: 100%|█████████████████| 204/204 [01:19<00:00,  2.55it/s]
Number of JSON files in test split: 412
Processing test files: 100%|██████████████████| 412/412 [02:06<00:00,  3.25it/s]
Number of JSON files in val split: 412
Processing val files: 100%|███████████████████| 412/412 [02:13<00:00,  3.08it/s]
'''

df_name = 'darkreddit'
# Write each split to the file carrying its own name. Previously the test frame
# went to *_val.csv and the val frame to *_test.csv.
darkreddit_train_df.to_csv(f'Desktop/{df_name}_train.csv', index=False)
darkreddit_test_df.to_csv(f'Desktop/{df_name}_test.csv', index=False)
darkreddit_val_df.to_csv(f'Desktop/{df_name}_val.csv', index=False)

In [None]:
#british
directory = 'Desktop/datasets/british'

# Group the .txt files by author ID derived from the file name.
texts_by_author = read_texts(directory)

pairs = create_pairs(texts_by_author)
# create_pairs already returns its pairs shuffled; this shuffles them once more.
random.shuffle(pairs)

british_df = pd.DataFrame(pairs, columns=['text1', 'text2', 'same'])
# Recorded output:
# Same author pairs: 575
# Different author pairs: 611

In [None]:
# One batched named-entity pass over text1 followed by text2.
texts = british_df['text1'].tolist() + british_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
# Split the processed texts back into text1 and text2
british_df['text1'] = processed_texts[:len(british_df)]
british_df['text2'] = processed_texts[len(british_df):]
british = british_df
#Processing texts: 100%|█████████████████████| 2300/2300 [28:04<00:00,  1.37it/s]

In [None]:
# Print length statistics for the processed british pairs.
lengths_df(british_df)
'''Average length of text1: 14850.00 characters
Average length of text2: 14554.32 characters
Overall average length of texts: 14702.16 characters
Total number of rows: 1150'''

In [None]:
# Same binding as in the entity-replacement cell; repeated here before exporting.
british = british_df
# Split 70/15/15 and write Desktop/british_{train,val,test}.csv.
download(british, "british")
'''805, 172, 173'''

In [None]:
#pan11
text_file_path = 'Desktop/datasets/pan11/LargeTrain.txt'

# Parse the <text file=...> entries into a frame with 'text' and 'id' columns,
# then build balanced pairs with the default reserve_ratio (0.5).
df = parse_training_text(text_file_path)
balanced_pan11_df = create_balanced_pairs(df)
'''Creating same author pairs and reserving texts: 100%|█| 72/72 [00:00<00:00, 1428
Number of same author pairs: 2326
Creating different author pairs: 100%|████| 2325/2325 [00:00<00:00, 3904.26it/s]
Number of different author pairs: 2325'''

In [None]:
# One batched named-entity pass over text1 followed by text2, then split the
# result back into the two columns at len(df).
texts = balanced_pan11_df['text1'].tolist() + balanced_pan11_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
balanced_pan11_df['text1'] = processed_texts[:len(balanced_pan11_df)]
balanced_pan11_df['text2'] = processed_texts[len(balanced_pan11_df):]
pan11_train = balanced_pan11_df
'''100%|███████████████████████████████████████| 9300/9300 [08:40<00:00, 17.88it/s]
'''
# Preview the processed pairs and print their length statistics.
print(pan11_train.head())
lengths_df(pan11_train)
'''Average length of text1: 290.13 characters
Average length of text2: 311.80 characters
Overall average length of texts: 300.96 characters
Total number of rows: 4650'''

In [None]:
# Despite its name, pan11_train holds all pairs; download() performs the
# 70/15/15 split and writes Desktop/pan11_{train,val,test}.csv.
pan11 = pan11_train
download(pan11, "pan11")
'''Train set size: 3255
Validation set size: 697
Test set size: 698'''

In [None]:
#pan13
train_directory = 'Desktop/datasets/pan13/train'
test_directory = 'Desktop/datasets/pan13/test'

# NOTE(review): process_folder is defined twice in this file (again in the
# pan14 cell). When cells run top to bottom this call uses the first definition.
pan13_train_df = process_folder(train_directory)
pan13_test_df = process_folder(test_directory)

In [None]:
# Named-entity replacement for the pan13 test frame: one batch over text1 then
# text2, split back at len(df), followed by the length statistics.
texts = pan13_test_df['text1'].tolist() + pan13_test_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
pan13_test_df['text1'] = processed_texts[:len(pan13_test_df)]
pan13_test_df['text2'] = processed_texts[len(pan13_test_df):]
lengths_df(pan13_test_df)
'''Average length of text1: 7391.20 characters
Average length of text2: 7575.74 characters
Overall average length of texts: 7483.47 characters
Total number of rows: 35'''

In [None]:
# Named-entity replacement for the pan13 train frame, batched over both columns.
texts = pan13_train_df['text1'].tolist() + pan13_train_df['text2'].tolist()
processed_texts = replace_named_entities(texts)
# Split the processed texts back into text1 and text2
pan13_train_df['text1'] = processed_texts[:len(pan13_train_df)]
pan13_train_df['text2'] = processed_texts[len(pan13_train_df):]
lengths_df(pan13_train_df)
# 100%|█████████████████████████████████████████| 170/170 [00:53<00:00,  3.16it/s]
# Average length of text1: 6879.89 characters
# Average length of text2: 7125.55 characters
# Overall average length of texts: 7002.72 characters
# Total number of rows: 85

In [None]:
# The train frame is exported as-is; the test frame is halved (seed 42) into
# validation and test files.
pan13_train_df.to_csv('Desktop/pan13_train.csv', index=False)
pan13_val, pan13_test = train_test_split(pan13_test_df, test_size=0.5, random_state=42)

pan13_test.to_csv('Desktop/pan13_test.csv', index=False)
pan13_val.to_csv('Desktop/pan13_val.csv', index=False)

In [None]:
#pan14
def process_folder(directory):
    """Load the known/unknown text pair for every problem listed in ``truth.txt``.

    This redefines the ``process_folder`` declared earlier in this file. Unlike
    that version, it iterates over the entries of ``truth.txt`` (not the folder
    listing) and opens the text files without ``errors='ignore'``.

    Parameters:
    directory: Folder holding ``truth.txt`` (non-empty lines of
        ``<folder> <Y|N>``) and one sub-folder per listed problem containing
        ``known01.txt`` and ``unknown.txt``.

    Returns:
    pd.DataFrame: One row per truth entry with 'text1' (known text), 'text2'
        (unknown text) and 'same' (bool, True when the flag is 'Y').
    """
    labels = {}
    with open(os.path.join(directory, 'truth.txt'), 'r') as truth:
        for raw_line in truth:
            fields = raw_line.split()
            if not fields:  # Ignore empty lines
                continue
            # Exactly two fields are expected; anything else raises ValueError.
            problem, verdict = fields
            labels[problem] = verdict == 'Y'

    def _read(path):
        with open(path, 'r') as handle:
            return handle.read()

    rows = []
    for problem, is_same in labels.items():
        problem_dir = os.path.join(directory, problem)
        known = _read(os.path.join(problem_dir, 'known01.txt'))
        unknown = _read(os.path.join(problem_dir, 'unknown.txt'))
        rows.append({'text1': known, 'text2': unknown, 'same': is_same})

    return pd.DataFrame(rows)



# PAN14 has two genres (novels, essays), each with train/test/verify folders.
novels_train_dir = 'Desktop/datasets/pan14/novels'
novels_test_dir = 'Desktop/datasets/pan14/novelstest'
novels_verify_dir = 'Desktop/datasets/pan14/novelsverify'

essays_train_dir = 'Desktop/datasets/pan14/essays'
essays_test_dir = 'Desktop/datasets/pan14/essaystest'
essays_verify_dir = 'Desktop/datasets/pan14/essaysverify'

# These calls use the process_folder defined directly above in this cell.
novels_train_df = process_folder(novels_train_dir)
novels_test_df = process_folder(novels_test_dir)
novels_verify_df = process_folder(novels_verify_dir)

essays_train_df = process_folder(essays_train_dir)
essays_test_df = process_folder(essays_test_dir)
essays_verify_df = process_folder(essays_verify_dir)

# Preview the first rows of each frame.
print("Novels Train DataFrame")
print(novels_train_df.head())
print("Novels Test DataFrame")
print(novels_test_df.head())
print("Novels Verify DataFrame")
print(novels_verify_df.head())

print("Essays Train DataFrame")
print(essays_train_df.head())
print("Essays Test DataFrame")
print(essays_test_df.head())
print("Essays Verify DataFrame")
print(essays_verify_df.head())

In [None]:
# Stack novels and essays per split (novels rows first) with a fresh 0..n-1 index.
pan14_train = pd.concat([novels_train_df, essays_train_df], ignore_index=True)
pan14_test = pd.concat([novels_test_df, essays_test_df], ignore_index=True)
pan14_verify = pd.concat([novels_verify_df, essays_verify_df], ignore_index=True)

In [None]:
# Length statistics for the train, test and verify frames, in that order.
lengths_df(pan14_train)
lengths_df(pan14_test)
lengths_df(pan14_verify)
'''Average length of text1: 12483.33 characters
Average length of text2: 16039.48 characters
Overall average length of texts: 14261.41 characters
Total number of rows: 300
Average length of text1: 16717.15 characters
Average length of text2: 22049.04 characters
Overall average length of texts: 19383.10 characters
Total number of rows: 400
Average length of text1: 15021.18 characters
Average length of text2: 7249.65 characters
Overall average length of texts: 11135.42 characters
Total number of rows: 200'''

In [None]:
# Export the three splits; the "verify" frame is written as the validation file.
pan14_train.to_csv('Desktop/pan14_train.csv', index=False)
pan14_test.to_csv('Desktop/pan14_test.csv', index=False)
pan14_verify.to_csv('Desktop/pan14_val.csv', index=False)

In [None]:
#pan15

def process_pan15_folder(directory):
    """Load known/unknown text pairs from every sub-corpus under *directory*.

    Each sub-folder of ``directory`` is one sub-corpus. It must contain
    ``truth.txt`` (non-empty lines of ``<folder> <Y|N>``) and one folder per
    problem holding ``known01.txt`` and ``unknown.txt``.

    Sub-corpora and problems are visited in sorted name order so the row order
    is reproducible. Labels are looked up only in the sub-corpus's own
    ``truth.txt``, so a label can no longer be picked up from a different
    sub-corpus read earlier.

    Parameters:
    directory: Root folder containing the sub-corpus folders.

    Returns:
    pd.DataFrame: Columns 'text1' (known text), 'text2' (unknown text), both
        stripped of surrounding whitespace, and 'same' (bool).

    Raises:
    KeyError: If a problem folder is not listed in its sub-corpus's ``truth.txt``.
    """
    data = []

    for subdir in sorted(os.listdir(directory)):
        subdir_path = os.path.join(directory, subdir)
        if not os.path.isdir(subdir_path):
            continue

        # Fresh label table per sub-corpus.
        labels = {}
        truth_file_path = os.path.join(subdir_path, 'truth.txt')
        with open(truth_file_path, 'r') as file:
            for line in file:
                if line.strip():
                    folder_name, same_author = line.strip().split()
                    labels[folder_name] = same_author == 'Y'

        for folder in sorted(os.listdir(subdir_path)):
            folder_path = os.path.join(subdir_path, folder)
            if not os.path.isdir(folder_path):
                continue

            known_file_path = os.path.join(folder_path, 'known01.txt')
            unknown_file_path = os.path.join(folder_path, 'unknown.txt')
            # errors='ignore' drops bytes that cannot be decoded instead of raising.
            with open(known_file_path, 'r', errors='ignore') as known_file, \
                    open(unknown_file_path, 'r', errors='ignore') as unknown_file:
                known_text = known_file.read().strip()
                unknown_text = unknown_file.read().strip()

            data.append({
                'text1': known_text,
                'text2': unknown_text,
                'same': labels[folder]
            })

    return pd.DataFrame(data)

pan15_train_directory = 'Desktop/datasets/pan15/train'
pan15_test_directory = 'Desktop/datasets/pan15/test'

# Load every sub-corpus found under each split directory.
pan15_train_df = process_pan15_folder(pan15_train_directory)
pan15_test_df = process_pan15_folder(pan15_test_directory)

print(pan15_train_df.head())
print(pan15_test_df.head())


In [None]:
# The train frame is exported as-is.
pan15_train_df.to_csv('Desktop/pan15_train.csv', index=False)

# The test frame is halved (seed 42) into validation and test parts.
pan15_val, pan15_test = train_test_split(pan15_test_df, test_size=0.5, random_state=42)

pan15_test.to_csv('Desktop/pan15_test.csv', index=False)
pan15_val.to_csv('Desktop/pan15_val.csv', index=False)

# Length statistics for train, val and test, in that order.
lengths_df(pan15_train_df)
lengths_df(pan15_val)
lengths_df(pan15_test)
'''Average length of text1: 3373.92 characters
Average length of text2: 2787.31 characters
Overall average length of texts: 3080.61 characters
Total number of rows: 865
Average length of text1: 3571.18 characters
Average length of text2: 3264.16 characters
Overall average length of texts: 3417.67 characters
Total number of rows: 200
Average length of text1: 3421.29 characters
Average length of text2: 3161.57 characters
Overall average length of texts: 3291.43 characters
Total number of rows: 200'''

In [None]:
#pan20
def load_jsonl(file_path):
    """Load a JSON Lines file into a DataFrame, one row per JSON document.

    Blank or whitespace-only lines are skipped instead of raising
    ``json.JSONDecodeError``. The file is read as UTF-8 rather than with the
    platform-default encoding.

    Parameters:
    file_path: Path to the ``.jsonl`` file.

    Returns:
    pd.DataFrame: Built from the list of parsed JSON documents.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                records.append(json.loads(line))
    return pd.DataFrame(records)

pan20_train_file = 'Desktop/datasets/pan20/train.jsonl'
pan20_test_file = 'Desktop/datasets/pan20/test.jsonl'
pan20_val_file = 'Desktop/datasets/pan20/val.jsonl'

# Only the validation file is loaded here; train and test are loaded in a later cell.
pan20_val_df = load_jsonl(pan20_val_file)

In [None]:
columns_to_keep = ["same", "pair"]

# Take an explicit copy of the column subset so the assignments below write to
# an independent frame (avoids pandas' SettingWithCopyWarning).
pan20_val_df = pan20_val_df[columns_to_keep].copy()
# Store the label as an integer (same author = 1, different = 0).
pan20_val_df['same'] = pan20_val_df['same'].astype(int)

# Split 'pair' column into 'text1' and 'text2'
pan20_val_df['text1'] = pan20_val_df['pair'].apply(lambda x: x[0])
pan20_val_df['text2'] = pan20_val_df['pair'].apply(lambda x: x[1])

# Drop the 'pair' column
pan20_val_df = pan20_val_df.drop(columns=['pair'])

pan20_val_df.head()

In [None]:
pan20_train_df = load_jsonl(pan20_train_file)
pan20_test_df = load_jsonl(pan20_test_file)

# `columns_to_keep` comes from the validation cell above. Copy the column subset
# so the assignments below write to an independent frame (avoids pandas'
# SettingWithCopyWarning).
pan20_train_df = pan20_train_df[columns_to_keep].copy()
# Store the label as an integer (same author = 1, different = 0).
pan20_train_df['same'] = pan20_train_df['same'].astype(int)

# Split 'pair' column into 'text1' and 'text2'
pan20_train_df['text1'] = pan20_train_df['pair'].apply(lambda x: x[0])
pan20_train_df['text2'] = pan20_train_df['pair'].apply(lambda x: x[1])

# Drop the 'pair' column
pan20_train_df = pan20_train_df.drop(columns=['pair'])


In [None]:
# `columns_to_keep` comes from the validation cell above. Copy the column subset
# so the assignments below write to an independent frame (avoids pandas'
# SettingWithCopyWarning).
pan20_test_df = pan20_test_df[columns_to_keep].copy()
# Store the label as an integer (same author = 1, different = 0).
pan20_test_df['same'] = pan20_test_df['same'].astype(int)

# Split 'pair' column into 'text1' and 'text2'
pan20_test_df['text1'] = pan20_test_df['pair'].apply(lambda x: x[0])
pan20_test_df['text2'] = pan20_test_df['pair'].apply(lambda x: x[1])

# Drop the 'pair' column
pan20_test_df = pan20_test_df.drop(columns=['pair'])

pan20_test_df.head()

In [None]:
# texts = pan20_val_df['text1'].tolist() + pan20_val_df['text2'].tolist()
# processed_texts = replace_named_entities(texts)
# # Split the processed texts back into text1 and text2
# pan20_val_df['text1'] = processed_texts[:len(pan20_val_df)]
# pan20_val_df['text2'] = processed_texts[len(pan20_val_df):]
# pan20_val_df = pan20_val_df

In [None]:
# Length statistics for train, test and val, in that order. The named-entity
# replacement for pan20 is commented out in the cell above.
lengths_df(pan20_train_df)
lengths_df(pan20_test_df)
lengths_df(pan20_val_df)
'''
Average length of text1: 21470.89 characters
Average length of text2: 21476.11 characters
Overall average length of texts: 21473.50 characters
Total number of rows: 248001
Average length of text1: 21418.08 characters
Average length of text2: 21400.72 characters
Overall average length of texts: 21409.40 characters
Total number of rows: 13704
Average length of text1: 21490.78 characters
Average length of text2: 21551.04 characters
Overall average length of texts: 21520.91 characters
Total number of rows: 13703
'''

In [None]:
# Export the three pan20 splits as provided (no re-splitting).
pan20_train_df.to_csv('Desktop/pan20_train.csv', index=False)
pan20_test_df.to_csv('Desktop/pan20_test.csv', index=False)
pan20_val_df.to_csv('Desktop/pan20_val.csv', index=False)