In [1]:
import os
import shutil
import numpy as np

In [2]:
# Fractions of the dataset for the train / validation / test partitions
# (nominally 70% / 15% / 15%).
train_split, val_split, test_split = 0.7, 0.15, 0.15

In [3]:
# Input directory holding the signature images, relative to the working directory
source_folder = 'ChiSig'
# Output directories, one per partition
train_folder, val_folder, test_folder = 'train', 'validation', 'test'

In [4]:
# Make sure every partition directory exists and contains one sub-directory
# per label; exist_ok=True makes this safe to re-run.
for folder in [train_folder, val_folder, test_folder]:
    os.makedirs(folder, exist_ok=True)
    for label_dir in ('positive', 'negative'):
        os.makedirs(os.path.join(folder, label_dir), exist_ok=True)

In [5]:
# List every entry name in the source folder. The names (not full paths) are
# what the later cells parse, shuffle, and copy.
# NOTE(review): os.listdir() returns entries in arbitrary order and does not
# distinguish files from sub-directories.
all_files = os.listdir(source_folder)

In [6]:
# Debug aid: prints the complete directory listing.
# NOTE(review): for a large folder this floods the notebook output; consider
# showing only len(all_files) and a short slice such as all_files[:10].
print(all_files)

['叶良工-23-5.jpg', '叶高扬-91-1.jpg', '范阳曦-12-2.jpg', '罗语晨-62-2.jpg', '孔瑰玮-152-1.jpg', '王芳媛-6-1.jpg', '陆秋芳-62-3.jpg', '武厦洁-38-2.jpg', '尹秉文-35-3.jpg', '李成弘-108-4.jpg', '韩乐圣-40-1.jpg', '毛新洁-20-1.jpg', '于宋-147-1.jpg', '余冰凡-46-2.jpg', '郝冷亦-147-2.jpg', '邓承载-52-1.jpg', '江宏义-21-1.jpg', '韩乐圣-39-1.jpg', '史煊-18-1.jpg', '江夏萱-57-1.jpg', '龙康时-18-3.jpg', '毛昊怡-23-2.jpg', '沈凯乐-16-1.jpg', '吕啸-64-2.jpg', '张昊焱-54-1.jpg', '蔡洲-13-2.jpg', '江伟-143-4.jpg', '苏睿慈-4-2.jpg', '宋俊达-69-1.jpg', '姜子轩-108-2.jpg', '苏孤兰-47-4.jpg', '周乐欣-147-3.jpg', '萧痴旋-115-5.jpg', '许旭尧-62-2.jpg', '何弘和-45-5.jpg', '江秋华-91-1.jpg', '赵雅宁-6-4.jpg', '郭鸿运-92-1.jpg', '郑思淼-91-3.jpg', '邱意远-58-4.jpg', '萧综-65-1.jpg', '文文姝-92-2.jpg', '徐海桃-67-4.jpg', '薛高邈-49-1.jpg', '郭谊-41-4.jpg', '陆秋芳-62-5.jpg', '孟镰-32-2.jpg', '崔竹筱-197-4.jpg', '李菲-38-3.jpg', '蒋慕诗-83-1.jpg', '白光启-60-3.jpg', '许蔡琳-67-5.jpg', '宋文曜-19-5.jpg', '沈问-52-2.jpg', '刘莓-98-5.jpg', '苏渝-112-4.jpg', '熊谷玉-65-3.jpg', '余娴-51-1.jpg', '宋英才-12-1.jpg', '熊文虹-14-4.jpg', '江伟-60-3.jpg', '张星火-4-2.jpg', '康思源-92-1.jpg',

In [7]:
# Empty sets of signer IDs, filled in by the scan over the file names:
# IDs treated as genuine signers, and IDs treated as professional forgers.
originals, professional_forgeries = set(), set()

In [8]:
# Scan the file names, which are expected to have at least three
# '-'-separated fields with an integer ID in the second field
# (e.g. '<name>-<id>-<index>.jpg').
# An ID above 100 is recorded as a professional forgery, and (ID - 100) is
# recorded as the corresponding original.
for file in all_files:
    parts = file.split('-')
    if len(parts) < 3:
        continue  # not a '<name>-<id>-<index>' style name

    try:
        number_a = int(parts[1])
    except ValueError:
        # Second field is not an integer (e.g. 'my-notes-v2.txt'); skip it
        # like any other malformed name instead of aborting the whole scan.
        continue

    if number_a > 100:
        professional_forgeries.add(number_a)
        originals.add(number_a - 100)

In [9]:
# Function to determine label
def get_label(filename):
    """Return the class label for a signature file name.

    The name is split on '-' and its second field is read as an integer ID,
    which is then looked up in the module-level sets ``originals`` and
    ``professional_forgeries``.

    Returns:
        'negative' if the ID is a professional forgery,
        'positive' if the ID is an original,
        'negative' for any other ID (treated as a random forgery),
        None if the name has fewer than three '-'-separated fields or its
        second field is not an integer, so callers can skip such files.
    """
    parts = filename.split('-')
    if len(parts) < 3:
        return None
    try:
        number_a = int(parts[1])
    except ValueError:
        return None

    # Forgeries are checked first: an ID above 100 can also land in
    # ``originals`` when some other forged ID equals this one plus 100, and
    # such an ID must still be labelled as a forgery.
    if number_a in professional_forgeries:
        return 'negative'
    # Original signature
    if number_a in originals:
        return 'positive'
    # Random forgery
    return 'negative'

In [10]:
# Shuffle files reproducibly.
# os.listdir() returns names in arbitrary order, so a fixed seed alone does
# not pin the resulting permutation. Sorting first gives the shuffle a
# canonical starting order, and also makes this cell idempotent: re-running
# it yields the same permutation instead of shuffling an already shuffled list.
seed = 42
all_files.sort()
np.random.seed(seed)
np.random.shuffle(all_files)

In [11]:
# Number of files going to the train and validation partitions; int()
# truncates, so any rounding remainder is left for the final partition.
total_files = len(all_files)
train_count, val_count = (
    int(total_files * fraction) for fraction in (train_split, val_split)
)

In [12]:
# Cut the shuffled list into three consecutive, non-overlapping slices;
# everything after the validation slice becomes the test set.
val_end = train_count + val_count
train_files, val_files, test_files = (
    all_files[:train_count],
    all_files[train_count:val_end],
    all_files[val_end:],
)

In [13]:
def copy_files(files, target_folder):
    """Copy each labelled file from ``source_folder`` into ``target_folder/<label>``.

    Args:
        files: iterable of file names located in ``source_folder``.
        target_folder: partition directory that contains the label
            sub-directories.

    Files for which ``get_label`` returns a falsy value are skipped.
    """
    for name in files:
        label = get_label(name)
        if not label:
            continue
        src_path = os.path.join(source_folder, name)
        dst_dir = os.path.join(target_folder, label)
        shutil.copy(src_path, dst_dir)

In [14]:
# Populate each partition directory from its list of file names,
# in train -> validation -> test order.
for split_files, split_folder in (
    (train_files, train_folder),
    (val_files, val_folder),
    (test_files, test_folder),
):
    copy_files(split_files, split_folder)

print("Data splitting completed.")

Data splitting completed.
