# Description 

A company has a lot of client data in Word documents, PDFs, images, etc., containing different types of graphs. They want to have this data stored in their database. 

They approached you with a dataset of extracted plots of different types, and want you to: 

__Classify images of plots into different classes: (2pts)__

You are provided with a dataset.zip, containing images of different types of plots.  
Tasks: 

a) Clean the dataset (remove any file which is corrupted, and make sure all images are in 
the same format (.jpg or .png)) 

b) Build the classifier training code (PyTorch, Keras, or a classical ML model; anything is 
fine) 

c) Do the accuracy metrics analysis 

d) Save model 

e) Inference code testing your model on any image from the dataset 


__Deliverables:__
1. Complete code or notebook 
2. Saved model file 
3. Documentation containing instructions to run your code and a requirements.txt (You can also make use of Markdown cells if you are using a notebook)

In [1]:
# Add all the needed imports
import os
from torchvision import datasets, transforms
from PIL import Image
import shutil
import random

## A) Clean the dataset (remove any file which is corrupted, and make sure all images are in the same format (.jpg or .png)) 

In [2]:
# Lower-case file extensions (leading dot included) that count as images
valid_extensions = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.tif', '.tiff', '.webp')


def is_valid_image(file: str) -> bool:
    """Return True when *file* ends with one of ``valid_extensions``.

    The name is lower-cased first, so the comparison ignores case.
    """
    lowered = file.lower()
    return any(lowered.endswith(extension) for extension in valid_extensions)

# Load every file under 'dataset_part1' that is_valid_image accepts
all_images_dataset = datasets.ImageFolder(
    root='dataset_part1',
    transform=transforms.ToTensor(),
    is_valid_file=is_valid_image,
)

# Report how many images and how many classes were found
image_total = len(all_images_dataset)
class_total = len(all_images_dataset.classes)
print(f"Number of images in the dataset: {image_total}")
print(f"Number of classes in the dataset: {class_total}")

Number of images in the dataset: 3190
Number of classes in the dataset: 5


### !!! This number is incorrect because the folder also contains other file types, such as GIF. The total number of images is: "3202" !!!

### If we change the images to their correct type we will lose only 12 images! This is perfectly fine!

### Make a dataset with only ".png" images

In [3]:
# Path to the original (uncleaned) dataset
source_dir = 'dataset_part1'

# Path where the cleaned, PNG-only copy is written
output_dir = 'dataset_part1_fixed'
os.makedirs(output_dir, exist_ok=True)

# Load dataset to get image paths and labels
transform = transforms.ToTensor()
fixed_dataset = datasets.ImageFolder(source_dir, transform=transform)

# Source files that could not be decoded and were therefore left out
skipped_files = []

# Re-encode every image as an RGB PNG in the new location
for i, (img_path, label) in enumerate(fixed_dataset.imgs):
    class_name = fixed_dataset.classes[label]
    class_dir = os.path.join(output_dir, class_name)
    os.makedirs(class_dir, exist_ok=True)

    # Save image with consistent name format
    new_filename = f'image_{i}.png'
    new_path = os.path.join(class_dir, new_filename)

    try:
        # The context manager closes the source file once it has been decoded
        with Image.open(img_path) as img:
            # Convert to RGB and save as .png
            img.convert('RGB').save(new_path, 'PNG')
    except Exception as error:
        # Pillow raises several unrelated exception types for broken files.
        # One corrupt source must not abort the whole clean-up: record it,
        # drop any partially written output and carry on with the next file.
        skipped_files.append(img_path)
        if os.path.exists(new_path):
            os.remove(new_path)
        print(f"Skipped corrupt file: {img_path} ({error})")

print(f"✅ Fixed dataset saved to: {output_dir}")

✅ Fixed dataset saved to: dataset_part1_fixed


In [4]:
# Re-open the converted copy, keeping only files that is_valid_image accepts
fixed_dataset = datasets.ImageFolder(
    root=output_dir,
    transform=transforms.ToTensor(),
    is_valid_file=is_valid_image,
)

# Report how many images and classes the converted copy contains
fixed_image_total = len(fixed_dataset)
fixed_class_total = len(fixed_dataset.classes)
print(f"Number of images in the fixed dataset: {fixed_image_total}")
print(f"Number of classes in the fixed dataset: {fixed_class_total}")
    

Number of images in the fixed dataset: 3190
Number of classes in the fixed dataset: 5


In [5]:
# Print every stored path next to its file extension to confirm the conversion
for stored_path, _class_index in fixed_dataset.imgs:
    extension = os.path.splitext(stored_path)[1]
    print(f"Image path: {stored_path}, Extension: {extension}")

Image path: dataset_part1_fixed\bargraph\image_0.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_1.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_10.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_100.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_101.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_102.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_103.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_104.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_105.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_106.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_107.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_108.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_109.png, Extension: .png
Image path: dataset_part1_fixed\bargraph\image_11.png, Extension: .png

### The conversion went well: every loaded image was kept and saved as ".png". Now let's check that they all open correctly and whether there are any corrupt files.

In [6]:
# Scan the converted dataset and collect every file Pillow cannot read
folder_path = 'dataset_part1_fixed'
corrupt_files = []

for root, _, files in os.walk(folder_path):
    for file in files:
        file_path = os.path.join(root, file)
        try:
            # Pass 1: structural check. verify() does not decode the pixel data.
            with Image.open(file_path) as img:
                img.verify()
            # Pass 2: an image cannot be used after verify(), so reopen it and
            # decode it fully; this is what catches truncated image data.
            with Image.open(file_path) as img:
                img.load()
        except Exception:
            # Pillow raises several unrelated exception types for broken files,
            # so any failure here marks the file as corrupt.
            corrupt_files.append(file_path)

if corrupt_files:
    print(f"Found {len(corrupt_files)} corrupt files:")
    for corrupt_file in corrupt_files:
        print(corrupt_file)
else:
    print("No corrupt files found in the dataset.")


No corrupt files found in the dataset.


### Both checks went perfectly, so now it is on to the next step!

### Resize the images to ideal size

In [7]:
# Preprocessing pipeline: resize to 256x256, convert to a tensor,
# then reduce to a single grayscale channel
preprocessing_steps = [
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Grayscale(num_output_channels=1),
]
resize_transform = transforms.Compose(preprocessing_steps)

# Wrap the cleaned folder so every sample goes through the pipeline
resized_dataset = datasets.ImageFolder(folder_path, transform=resize_transform)

# Sanity check: inspect the shape of the first transformed image
sample_image, _ = resized_dataset[0]
print(f"Resized image shape: {sample_image.shape}")

Resized image shape: torch.Size([1, 256, 256])


### Save the resized images to a new directory

In [8]:
# Destination for the resized, grayscale copy of the dataset
output_dir = 'dataset_part1_resized'
os.makedirs(output_dir, exist_ok=True)

# One sub-folder per class
class_names = resized_dataset.classes
for class_name in class_names:
    os.makedirs(os.path.join(output_dir, class_name), exist_ok=True)

# Write every transformed sample back to disk as a PNG
to_pil = transforms.ToPILImage()
for i, (image, label) in enumerate(resized_dataset):
    target_path = os.path.join(output_dir, class_names[label], f'image_{i}.png')
    to_pil(image).save(target_path)

# Reload from the new directory and report its size and class count
resized_dataset = datasets.ImageFolder(output_dir, transform=transforms.ToTensor())
print(f"Number of images in the resized dataset: {len(resized_dataset)}")
print(f"Number of classes in the resized dataset: {len(resized_dataset.classes)}")

Number of images in the resized dataset: 3190
Number of classes in the resized dataset: 5


### Split the data

In [9]:
# Tensor-conversion transform. Nothing in this cell references it.
# NOTE(review): confirm whether later code relies on this name before removing it.
transform = transforms.ToTensor()

def split_dataset(source_dir, dest_dir, split_ratio=0.8, seed=None):
    """Copy a class-per-folder dataset into ``train`` and ``val`` sub-folders.

    Every sub-directory of *source_dir* is treated as one class. Its files are
    shuffled, the first ``split_ratio`` share is copied to
    ``dest_dir/train/<class>`` and the remainder to ``dest_dir/val/<class>``.
    Entries of *source_dir* that are not directories are ignored.

    Args:
        source_dir: Directory containing one sub-directory per class.
        dest_dir: Directory that receives the ``train`` and ``val`` trees.
        split_ratio: Fraction of each class that goes to ``train`` (0 to 1).
        seed: Optional seed for a repeatable split. When ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If *split_ratio* is outside the range 0 to 1.
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    # A dedicated generator makes the split repeatable for a given seed
    rng = random if seed is None else random.Random(seed)

    for class_name in sorted(os.listdir(source_dir)):
        class_dir = os.path.join(source_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        # os.listdir order is arbitrary, so sort before shuffling; keep files
        # only, because copy2 cannot copy a directory.
        images = sorted(
            name for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )
        rng.shuffle(images)
        split_point = int(len(images) * split_ratio)

        phases = {'train': images[:split_point], 'val': images[split_point:]}
        for phase, phase_images in phases.items():
            phase_class_dir = os.path.join(dest_dir, phase, class_name)
            # Clear leftovers from an earlier run; otherwise a re-run with a
            # different shuffle leaves the same image in both train and val.
            if os.path.isdir(phase_class_dir):
                shutil.rmtree(phase_class_dir)
            os.makedirs(phase_class_dir, exist_ok=True)
            for img in phase_images:
                shutil.copy2(os.path.join(class_dir, img), os.path.join(phase_class_dir, img))

# Split the resized dataset into train/val folders using the default ratio
split_dataset(source_dir='dataset_part1_resized', dest_dir='dataset_part1_split')


In [10]:
# Locations of the train and validation splits
train_dir = 'dataset_part1_split/train'
val_dir = 'dataset_part1_split/val'


def _count_files(top):
    """Return the number of files found anywhere below *top*."""
    return sum(len(filenames) for _dirpath, _dirnames, filenames in os.walk(top))


# Count the images in each split and report them
train_count = _count_files(train_dir)
val_count = _count_files(val_dir)
print(f"Number of training images: {train_count}")
print(f"Number of validation images: {val_count}")

Number of training images: 2551
Number of validation images: 639


### Now the data is ready for training. Might add Data Augmentation later.