# Get Data

In [1]:
# Install Kaggle Library
!pip install kaggle

# Before next step, user needs to download the free API KEY from Kaggle settings
# Upload the kaggle.json file to Google Colab Files

# Make directory for Kaggle & Refer to API KEY
! mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

# Download Dataset
! kaggle datasets download shreelakshmigp/cedardataset

mkdir: cannot create directory ‘/root/.kaggle’: File exists
cp: cannot stat 'kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/shreelakshmigp/cedardataset
License(s): unknown
Downloading cedardataset.zip to /content
 99% 241M/242M [00:03<00:00, 82.9MB/s]
100% 242M/242M [00:03<00:00, 78.9MB/s]


In [2]:
# ! mkdir sfddata
! unzip cedardataset.zip -d sfddata

Archive:  cedardataset.zip
  inflating: sfddata/signatures/Readme.txt  
  inflating: sfddata/signatures/full_forg/Thumbs.db  
  inflating: sfddata/signatures/full_forg/forgeries_10_1.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_10.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_11.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_12.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_13.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_14.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_15.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_16.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_17.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_18.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_19.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_2.png  
  inflating: sfddata/signatures/full_forg/forgeries_10_20.png  
  inflating: sfddata/signatures/full_forg/fo

# Combine Data

In [3]:
import os
import shutil
import numpy as np

# Paths to directories
real_sig_dir = '/content/sfddata/signatures/full_org'
fake_sig_dir = '/content/sfddata/signatures/full_forg'

# Define the destination folder for the combined dataset
destination_folder = 'signatures_combined'
label_location = "/content/signatures_combined/og_labels.npy"

# Ensure the destination folder exists
os.makedirs(destination_folder, exist_ok=True)


def _png_files(folder):
    """Return the names of the .png files in `folder`, sorted by name."""
    return sorted(name for name in os.listdir(folder) if name.endswith(".png"))


# List the image files in the source subfolders. Non-image entries (the archive
# ships a Thumbs.db next to the signatures) are skipped so that every label row
# corresponds to exactly one image.
files1 = _png_files(fake_sig_dir)
files2 = _png_files(real_sig_dir)

# One [label] row per copied image: 0 = forgery, 1 = genuine signature
labels = []

# Copy forgeries first, then genuine signatures, recording a label for each file
for class_label, source_dir, file_names in (
    (0, fake_sig_dir, files1),
    (1, real_sig_dir, files2),
):
    for file in file_names:
        source_file = os.path.join(source_dir, file)
        destination_file = os.path.join(destination_folder, file)
        shutil.copy(source_file, destination_file)
        labels.append([class_label])

# Save the label array next to the combined images
sorted_labels = np.array(labels)
np.save(label_location, sorted_labels, allow_pickle=False)

# Convert Image to Grayscale


In [5]:
"""Code is used for processing images"""

from PIL import Image, ImageOps
from tqdm import tqdm
import shutil
import os

# NOTE(review): THRESHOLD is not referenced anywhere else in the visible notebook;
# presumably a leftover from the commented-out convert_bitmap step — confirm
# before removing.
THRESHOLD = 128

def image_to_grayscale(image_dir: str) -> Image.Image:
    """Open one image file and return a grayscale version of it.

    Used for testing purposes to convert a single image to grayscale.

    Args:
        image_dir: Path passed to ``Image.open`` (a file path, despite the name).

    Returns:
        The image produced by ``ImageOps.grayscale``.
    """
    # The context manager releases the source file once the grayscale image
    # has been produced.
    with Image.open(image_dir) as image:
        return ImageOps.grayscale(image)

def convert_grayscale(directory: str) -> list:
    """Load every .png in `directory`, in sorted name order, as a grayscale image.

    Returns a two-element list: [grayscale images, matching file names].
    """
    png_names = [name for name in sorted(os.listdir(directory)) if name.endswith(".png")]
    gray_versions = [
        ImageOps.grayscale(Image.open(directory + "/" + name))
        for name in png_names
    ]
    return [gray_versions, png_names]

def resize(images: list, dimensions: tuple) -> None:
    """Replace each image in `images`, in place, with its `.resize(dimensions)` result."""
    for position, picture in enumerate(images):
        images[position] = picture.resize(dimensions)

# actual code to run
dimensions = (250, 250) # dimensions for the images, can be changed
directory = 'signatures_combined'  # where the image files are located
image_path = 'converted_images'  # where the images will be stored
save_images = True

# Convert once and unpack both results; calling convert_grayscale separately for
# the images and for the file names would load and convert every image twice.
gray_images, filenames = convert_grayscale(directory)
# bitmap_images = convert_bitmap(gray_images)
resize(gray_images, dimensions)

# Start from an empty output directory so files from an earlier run cannot linger.
if os.path.exists(image_path):
    shutil.rmtree(image_path)
os.makedirs(image_path)

if save_images:
    # Save each resized grayscale image under its original file name
    for image, filename in tqdm(zip(gray_images, filenames), total=len(gray_images)):
        image.save(os.path.join(image_path, filename))


100%|██████████| 2640/2640 [00:32<00:00, 82.17it/s]


# Reduce Noise In Images & Save Data

In [6]:
import cv2
from tqdm import tqdm
import numpy as np
from numpy import asarray
from matplotlib import pyplot as plt
from concurrent.futures import ThreadPoolExecutor
import pathlib
import os

# The 'seaborn' style was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the old
# name was later removed; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Directory holding the grayscale images written by the previous step
converted_dir = pathlib.Path('/content/converted_images')

# Grayscale image paths, sorted by name
pictures = sorted(converted_dir.glob('*.png'))

# The same paths as plain strings, which is what gets passed to cv2.imread
images = [str(pic) for pic in pictures]

# where to store the images w reduced noise
reduced_dir = "/content/reduced"
os.makedirs(reduced_dir, exist_ok=True)  # safe to re-run, unlike a bare mkdir

less_noise_pics = []

# Denoise each image, keep the result in memory and write it to reduced_dir
# under the same file name
for image in tqdm(images):
    filename = os.path.basename(image)
    noise_pic = cv2.imread(image)
    if noise_pic is None:
        # cv2.imread signals an unreadable file by returning None, not by raising
        raise IOError(f"cv2.imread could not read {image}")
    less_noise_pic = cv2.fastNlMeansDenoising(noise_pic, None, 15, 7, 21)

    less_noise_pics.append(less_noise_pic)
    cv2.imwrite(os.path.join(reduced_dir, filename), less_noise_pic)
100%|██████████| 2640/2640 [09:40<00:00,  4.55it/s]


# Split Data into Train & Test Directories

In [7]:
import random
import numpy

# Source of the split (the denoised images) and root folder for its output
data_path_train = "/content/reduced"
data_path_test = "/content/split"

# path to destination folder; makedirs also creates /content/split when missing
# and does not fail when the folders already exist
train_folder = os.path.join(data_path_test, 'training')
os.makedirs(train_folder, exist_ok=True)

# Accepted image extensions. This must be a tuple rather than a bare string:
# `ext in '.png'` is a substring test, which also accepts '' (files with no
# extension), '.p', 'png', and so on.
image_extensions = ('.png',)

# Create a sorted list of image filenames in 'data_path_train'
imgs_list = [
    filename
    for filename in sorted(os.listdir(data_path_train))
    if os.path.splitext(filename)[-1] in image_extensions
]

# Sets the random seed so the shuffle below is reproducible
random.seed(1107)

# Shuffle the positions of the image filenames
indices = list(range(len(imgs_list)))
random.shuffle(indices)

# determine the number of images for each set: the first 85% of the shuffled
# list is used for training, everything after it for testing
train_size = int(len(imgs_list) * 0.85)
test_size = len(imgs_list) - train_size

# Apply the shuffled order to the filenames
new_imgs_list = [imgs_list[j] for j in indices]

# Store test imgs
test_imgs = []
test_labels = []

# Copy training files to the training folder; load test files into memory
for i, f in enumerate(new_imgs_list):
    source_path = os.path.join(data_path_train, f)
    if i < train_size:
        shutil.copy(source_path, os.path.join(train_folder, f))
    else:
        # add to test array; the label is derived from the file name
        test_imgs.append(cv2.imread(source_path))
        test_labels.append([1.] if "original" in f else [0.])


# Save test labels
np.save("/content/test_labels.npy", test_labels, allow_pickle=False)

# Save test imgs
test_imgs_array = np.array(test_imgs)
np.save("/content/test_imgs.npy", test_imgs_array)

# Split Train Images into TensorFlow Datasets

In [8]:
# Sort the training images into one folder per class, decided by file name:
# names containing "forgeries" go to forg/, everything else goes to org/
train_folder = "/content/split/training"

forg_dataset_folder = "/content/dataset/forg"
org_dataset_folder = "/content/dataset/org"

# Create whichever class folders are missing
for class_folder in (forg_dataset_folder, org_dataset_folder):
    if not os.path.exists(class_folder):
        os.makedirs(class_folder)

for filename in os.listdir(train_folder):
    dest_folder = forg_dataset_folder if "forgeries" in filename else org_dataset_folder
    source_path = os.path.join(train_folder, filename)
    shutil.copy(source_path, os.path.join(dest_folder, filename))

In [9]:
import tensorflow as tf
from keras.utils import image_dataset_from_directory
import tensorflow_datasets as tfds
import pathlib

BATCH_SIZE = 2232
IMG_HEIGHT = 250
IMG_WIDTH = 250

# Set directory to pull images from
DATA_DIR = pathlib.Path('/content/dataset')
# Make training & validation tensorflow datasets; with subset="both" the call
# returns both of them, training first
train_ds = tf.keras.utils.image_dataset_from_directory(
    DATA_DIR,
    labels="inferred",
    label_mode='binary',
    validation_split=0.18,
    subset="both",
    shuffle = True,
    seed=1107,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
)

dataset_array = tfds.as_numpy(train_ds)

train_dataset = dataset_array[0]
val_dataset = dataset_array[1]


def _save_split(batches, imgs_path, labels_path):
    """Save all (images, labels) batches of one split as two .npy files.

    Every batch is collected before saving, so the files hold the complete split
    even when it does not fit into a single batch of BATCH_SIZE. Saving inside
    the batch loop would overwrite the files and keep only the last batch.
    """
    img_batches, label_batches = [], []
    for batch_imgs, batch_labels in batches:
        img_batches.append(batch_imgs)
        label_batches.append(batch_labels)
    if not img_batches:
        # Nothing to save for an empty split
        return
    if len(img_batches) == 1:
        # Single batch: save it directly and avoid a needless copy
        all_imgs, all_labels = img_batches[0], label_batches[0]
    else:
        all_imgs = np.concatenate(img_batches)
        all_labels = np.concatenate(label_batches)
    np.save(labels_path, all_labels)
    np.save(imgs_path, all_imgs)


_save_split(train_dataset, "/content/train_imgs.npy", "/content/train_labels.npy")
_save_split(val_dataset, "/content/val_imgs.npy", "/content/val_labels.npy")
# Testing folders has 15% of data but does not go through splitting
# using tensorflow

Found 2244 files belonging to 2 classes.
Using 1841 files for training.
Using 403 files for validation.


# Example to Load Data

In [None]:
import numpy

def _load_and_report(path):
    """Load the .npy file at `path`, print its shape, and return the array."""
    loaded = numpy.load(path)
    print(loaded.shape)
    return loaded


# Training split
train_labels = _load_and_report("/content/train_labels.npy")
train_imgs = _load_and_report("/content/train_imgs.npy")

# Validation split, plus its first label as a sample
val_labels = _load_and_report("/content/val_labels.npy")
val_imgs = _load_and_report("/content/val_imgs.npy")
print(val_labels[0])

# Test split, plus its first label as a sample
test_labels = _load_and_report("/content/test_labels.npy")
test_imgs = _load_and_report("/content/test_imgs.npy")
print(test_labels[0])

(1841, 1)
(1841, 250, 250, 3)
(403, 1)
(403, 250, 250, 3)
[1.]
(396, 1)
(396, 250, 250, 3)
[1.]


# Export Data

In [None]:
from google.colab import files

# Download every saved array: the three label arrays first, then the image arrays
for export_path in (
    "/content/train_labels.npy",
    "/content/test_labels.npy",
    "/content/val_labels.npy",
    "/content/test_imgs.npy",
    "/content/train_imgs.npy",
    "/content/val_imgs.npy",
):
    files.download(export_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>