#### Split the MHIST data into training and testing datasets


In [1]:
import pandas as pd 
import shutil 
import os 

# Locations of the MHIST source images and of the two output folders that
# will receive the train / test split.
img_folder = 'mhist_dataset/images'
train_folder = 'mhist_dataset/train/'
test_folder = 'mhist_dataset/test/'

# Annotation table for the dataset, loaded from the CSV shipped alongside the images.
annotations = pd.read_csv('mhist_dataset/annotations.csv')

In [2]:
# Create both split folders up front; exist_ok=True keeps the cell re-runnable.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Iterate over the two required columns directly rather than using
# DataFrame.iterrows(), which constructs a full Series for every row just to
# read two fields from it.
for img_name, partition in zip(annotations['Image Name'], annotations['Partition']):
    src_path = os.path.join(img_folder, img_name)

    # Rows labelled 'train' go to the training folder; every other partition
    # value ends up in the test folder.
    dst_folder = train_folder if partition == 'train' else test_folder
    dst_path = os.path.join(dst_folder, img_name)

    # shutil.copy duplicates the file, so the original stays in img_folder.
    shutil.copy(src_path, dst_path)

# NOTE: the images are copied rather than moved; the message text is kept
# unchanged so it still matches previously recorded output.
print("Images have been successfully moved")

Images have been successfully moved


In [3]:
# Sanity-check the split. Comparing counts alone would also pass if a folder
# held the right *number* of wrong (e.g. stale) files, so the file names are
# compared against the annotations as well.
train_files = os.listdir(train_folder)
test_files = os.listdir(test_folder)
actual_train_count = len(train_files)
actual_test_count = len(test_files)

# Image names the annotations assign to each partition.
expected_train_names = annotations.loc[annotations['Partition'] == 'train', 'Image Name']
expected_test_names = annotations.loc[annotations['Partition'] == 'test', 'Image Name']
expected_train_count = len(expected_train_names)
expected_test_count = len(expected_test_names)

counts_match = (
    actual_train_count == expected_train_count
    and actual_test_count == expected_test_count
)
names_match = (
    set(train_files) == set(expected_train_names)
    and set(test_files) == set(expected_test_names)
)

if counts_match and names_match:
    print("All files have been moved correctly and counts match!")
else:
    print("Warning: There is a mismatch between the files moved and the expected count!")

All files have been moved correctly and counts match!


#### Unzip the MNIST dataset

In [4]:
import os 
import gzip 
import shutil

# Folder holding the gzip-compressed MNIST files, and the folder that will
# receive their decompressed contents.
dataset_dir = 'mnist_dataset/MNIST/raw'
dataset_dest = 'mnist_dataset/MNIST/images'

os.makedirs(dataset_dest, exist_ok=True)

# Only entries whose name ends in ".gz" are decompressed; anything else in
# dataset_dir is ignored.
files_to_unzip = list(filter(lambda name: name.endswith('.gz'), os.listdir(dataset_dir)))

for archive_name in files_to_unzip:
    archive_path = os.path.join(dataset_dir, archive_name)
    # Output file name is the archive name with its trailing ".gz" removed.
    extracted_path = os.path.join(dataset_dest, archive_name[:-len('.gz')])

    # Stream the decompressed bytes from the archive into the output file.
    with gzip.open(archive_path, 'rb') as compressed, open(extracted_path, 'wb') as extracted:
        shutil.copyfileobj(compressed, extracted)