## **Importing Required Libraries**

In [None]:
import numpy as np
import pandas as pd
import os
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns

## **Unzip files**

In [None]:
import zipfile

%time
def unzip(zip_file_path, output_dir_path):
    with zipfile.ZipFile(zip_file_path, "r") as zip:
        zip.extractall(output_dir_path)
    print("Data uncompressed successfully...")
    print()

zip_file_path = r"C:\Users\Vishn\Downloads\mnist.zip"
output_dir_path = r"D:\Extracted_alphabets_image_files"

unzip(zip_file_path,output_dir_path)

CPU times: total: 0 ns
Wall time: 0 ns
Data uncompressed successfully...



## **Preliminary Analysis of the Given Data**

### **What are the image file extensions?**

In [None]:
# Root of the extracted dataset. It has to be assigned before the walk below,
# otherwise the cell raises NameError on a fresh kernel.
path = r"D:\Extracted_alphabets_image_files"

# Extension (leading dot included) of every file found under `path`,
# one entry per file, so its length is also the total file count.
file_extensions = []
for root, dirs, files in os.walk(path):
    print(f"Reading images from {root}, Found {len(files)} files.")
    file_extensions.extend(os.path.splitext(file_name)[-1] for file_name in files)

print()
print("Total Number of Files:", len(file_extensions))
print("File extensions:", set(file_extensions))

Reading images from D:\Extracted_alphabets_image_files, Found 0 files.
Reading images from D:\Extracted_alphabets_image_files\New folder, Found 0 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\A, Found 13870 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\B, Found 8668 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\C, Found 23409 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\D, Found 10134 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\E, Found 11440 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\F, Found 1163 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\G, Found 5762 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\H, Found 7218 files.
Reading images from D:\Extracted_alphabets_image_files\New folder\I, Found 1120 files.
Reading images from D:\Extracted_alphabets_image_files\New f

### **What are the image sizes, extensions and modes?**


In [None]:
def read_image_files(path):
    """Collect basic metadata for every image file found under ``path``.

    The directory tree is walked recursively. Each file whose extension
    appears in the notebook-level ``file_extensions`` collection is opened
    with PIL and its name, extension, size, mode and directory are recorded.

    Parameters
    ----------
    path : str
        Root directory to scan.

    Returns
    -------
    list of dict
        One dict per matching file with the keys ``"file_name"``,
        ``"extension"``, ``"size"``, ``"mode"`` and ``"dir"``.
    """
    # Build the lookup once: membership tests on a set do not depend on how
    # many entries (one per scanned file) the global list holds.
    known_extensions = set(file_extensions)

    images_data = []
    for root, _dirs, files in os.walk(path):
        for file_name in files:
            extension = os.path.splitext(file_name)[-1]
            if extension not in known_extensions:
                continue
            file_path = os.path.join(root, file_name)
            # The context manager closes the underlying file right away;
            # without it one handle per image stays open until garbage
            # collection, which matters when scanning a large dataset.
            with Image.open(file_path) as img:
                images_data.append({"file_name": file_name,
                                    "extension": extension,
                                    "size": img.size,
                                    "mode": img.mode,
                                    "dir": root})
    return images_data

def load_into_df(images_data):
    """Build a DataFrame from a list of per-image records.

    Parameters
    ----------
    images_data : list of dict
        One dict per image; the dict keys become the column names.

    Returns
    -------
    pandas.DataFrame
        One row per record.
    """
    return pd.DataFrame(images_data)

In [None]:
%%time

# Scan the whole dataset. Every matching file is opened to read its metadata,
# so this is slow (about 30 minutes wall time in the recorded run).
images_data = read_image_files(path)

# One row per image file.
df = load_into_df(images_data)

# Preview the first rows.
df.head()

CPU times: total: 48.7 s
Wall time: 30min 41s


Unnamed: 0,file_name,extension,size,mode,dir
0,A-0.png,.png,"(28, 28)",L,D:\Extracted_alphabets_image_files\New folder\A
1,A-1.png,.png,"(28, 28)",L,D:\Extracted_alphabets_image_files\New folder\A
2,A-10.png,.png,"(28, 28)",L,D:\Extracted_alphabets_image_files\New folder\A
3,A-100.png,.png,"(28, 28)",L,D:\Extracted_alphabets_image_files\New folder\A
4,A-1000.png,.png,"(28, 28)",L,D:\Extracted_alphabets_image_files\New folder\A


In [None]:
# (rows, columns) of the metadata frame; there is one row per image file.
df.shape

(372451, 5)

In [None]:
# Number of files per file extension.
df['extension'].value_counts()

extension
.png    372451
Name: count, dtype: int64

In [None]:
# Number of files per image size, as reported by PIL's `img.size`.
df['size'].value_counts()

size
(28, 28)    372451
Name: count, dtype: int64

In [None]:
# Number of files per PIL image mode.
df['mode'].value_counts()

mode
L    372451
Name: count, dtype: int64

## **Observations**

1. There are 372,451 images in total.

2. All of the images are in **.png** format.

3. All of the images have the same size, **(28, 28)**, and the same mode, **L** (grayscale).

## **Writing image pixel data to CSV**

In [None]:
def read_image_files(path, size=(8, 8)):
    """Load every image below ``path`` as a flat row of grayscale pixels.

    The top-level directory itself is skipped; only its sub-directories are
    read. Each image is converted to mode ``'L'``, resized to ``size`` and
    flattened into ``pixel0 .. pixelN-1`` columns. The label of a row is the
    last character of the name of the directory that contains the image.

    Note: this re-uses the name of the metadata-collecting function defined
    earlier in the notebook, so running this cell replaces that definition.

    Parameters
    ----------
    path : str
        Root directory whose sub-directories hold the images.
    size : tuple of int, optional
        Target size passed to ``Image.resize``. Defaults to ``(8, 8)``,
        which yields 64 pixel columns.

    Returns
    -------
    list of dict
        One dict per image with ``size[0] * size[1]`` ``pixel{i}`` entries
        plus a ``'label'`` entry.
    """
    # tqdm is only used for progress display; import it here so the function
    # does not rely on a name left over from another cell, and fall back to
    # plain iteration when the package is not installed.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_ignored):
            return iterable

    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
    n_pixels = size[0] * size[1]
    images_data = []

    walker = os.walk(path)
    next(walker, None)  # os.walk yields the top-level directory first; skip it

    for root, _dirs, files in walker:
        dir_name = os.path.basename(root)
        label = dir_name[-1]

        image_files = [f for f in files
                       if os.path.splitext(f)[-1].lower() in image_extensions]
        for file_name in progress(image_files, desc=f'Processing {dir_name}', leave=False):
            file_path = os.path.join(root, file_name)
            # The context manager closes the file as soon as the pixels have
            # been read instead of leaving one open handle per image.
            with Image.open(file_path) as img:
                pixels = list(img.convert('L').resize(size).getdata())

            # One row: pixel values in the order returned by getdata(), then the label.
            row = {f'pixel{i}': pixels[i] for i in range(n_pixels)}
            row['label'] = label
            images_data.append(row)
    return images_data

def load_into_df(images_data):
    """Turn a list of per-image dicts into a DataFrame.

    Parameters
    ----------
    images_data : list of dict
        One dict per image; the keys become the columns.

    Returns
    -------
    pandas.DataFrame
        One row per dict in ``images_data``.
    """
    return pd.DataFrame(images_data)

In [None]:
# Root directory of the extracted dataset (machine-specific absolute path).
path = r"D:\Extracted_alphabets_image_files"
# One dict of pixel values plus a label per image.
images_data = read_image_files(path)

image_df = load_into_df(images_data)

# Write the frame to a CSV file in the current working directory, without the index column
image_df.to_csv('image_alphabets.csv', index = False)

# Preview the first rows.
image_df.head()

                                                                    

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel55,pixel56,pixel57,pixel58,pixel59,pixel60,pixel61,pixel62,pixel63,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,A
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,A
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,A
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A
