In [None]:
# | default_exp data.preprocess

In [None]:
# | hide
# | eval: false

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# | hide

import os

In [None]:
# | hide
# NOTE: this cell was originally marked "DONT RUN THIS CELL IN JUPYTER",
# presumably because an unconditional chdir moves the kernel one level higher
# every time the cell is executed. The guard below only steps up when the
# project-root markers are missing, so re-running the cell is harmless.

if not all(os.path.isdir(name) for name in ("rocks_classifier", "data")):
    os.chdir("../")

# Fail early, and say where we ended up, if we are still not in the project root.
assert all(
    os.path.isdir(name) for name in ("rocks_classifier", "data")
), f"Expected 'rocks_classifier' and 'data' directories in {os.getcwd()}"

In [None]:
# | hide

from nbdev.showdoc import *

In [None]:
# | export
# | hide

import os
import requests
import subprocess
import logging
import hydra, omegaconf

from rocks_classifier.data.utils import *

In [None]:
# | hide

# Load the configuration directly with OmegaConf (path is relative to the
# current working directory) so the cells below can pass `cfg` to the
# individual pipeline steps.
path = "configs/config.yaml"
cfg = omegaconf.OmegaConf.load(path)

<a href="https://colab.research.google.com/github/udaylunawat/Whats-this-rock/blob/main/notebooks/02_a_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
<!--- @wandbcode{intro-colab} -->

# Preprocess Data
> Move images from both datasets into their respective class folders, then remove duplicate, bad, and corrupted images.

In [None]:
# | export

# Human-readable summary of the preprocessing pipeline; this cell is exported,
# so the string becomes a module-level variable of the generated module.
# NOTE(review): `_doc_` (single underscores) is a plain variable, not the
# module docstring `__doc__` — confirm which one was intended.
_doc_ = """Preprocess the extracted dataset, to be ready to trained.

Performs the following:-
1. Rename and move files to data/2_processed.
2. List files other than jpg and png, to remove unsupported files.
3. List file by types before cleaning.
4. Remove
    - Bad Images
    - Duplicate Images
    - Misclassified Images
    - Unsupported Images
    - Corrupted Images
5. List file by types after cleaning.
6. Get count of files by class types.
7. Handle Imbalance using Undersampling, Oversampling.
"""

# Steps

> This section describes the steps performed by the `process_data` function, which combines the following functions:

1. Rename and move files to data/2_processed.
2. List files other than jpg and png, to remove unsupported files.
3. List file by types before cleaning.
4. Remove
    - Bad Images
    - Duplicate Images
    - Misclassified Images
    - Unsupported Images
    - Corrupted Images
5. List file by types after cleaning.
6. Get count of files by class types.
7. Handle Imbalance using Undersampling, Oversampling.

In [None]:
# | hide

# Debugging aid: list every name currently in the notebook namespace, which
# shows what the star imports above brought in.
%whos

Variable                     Type        Data/Info
--------------------------------------------------
BasicHtmlRenderer            type        <class 'nbdev.showdoc.BasicHtmlRenderer'>
BasicMarkdownRenderer        type        <class 'nbdev.showdoc.BasicMarkdownRenderer'>
DocmentTbl                   type        <class 'nbdev.showdoc.DocmentTbl'>
ShowDocRenderer              type        <class 'nbdev.showdoc.ShowDocRenderer'>
clean_data_dir               function    <function clean_data_dir>
clean_images                 function    <function clean_images>
colab_link                   function    <function colab_link>
copy_configs_tocwd           function    <function copy_configs_tocwd>
doc                          function    <function doc>
download_and_move_datasets   function    <function download_and_mo<...>_datasets>
find_filepaths               function    <function find_filepaths>
get_df                       function    <function get_df>
get_new_name                 function    

## 1. Rename and move files to data/2_processed.

In [None]:
show_doc(move_to_processed)

---

[source](https://github.com/udaylunawat/Whats-this-rock/blob/main/rocks_classifier/data/utils.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### move_to_processed

>      move_to_processed ()

Combines files with same subclass and moves them to the subclass under data/2_processed.

Uses `get_new_name` to create new names of files and then rename them and copy to data/2_processed.

In [None]:
# | echo: false
# | eval: false

# Run step 1. The log output below shows the files of both datasets being
# gathered into one folder per class under data/2_processed.
move_to_processed()

Moving files from dataset1/Basalt and dataset2/Basalt to data/2_processed/Basalt ...
Moving files from dataset1/Coal and dataset2/Coal to data/2_processed/Coal ...
Moving files from dataset1/Granite and dataset2/Granite to data/2_processed/Granite ...
Moving files from dataset1/Limestone and dataset2/Limestone to data/2_processed/Limestone ...
Moving files from dataset1/Marble and dataset2/Marble to data/2_processed/Marble ...
Moving files from dataset1/Quartzite and dataset2/Quartzite to data/2_processed/Quartzite ...
Moving files from dataset1/Sandstone and dataset2/Sandstone to data/2_processed/Sandstone ...


## 2. List files other than jpg and png, to remove unsupported files.

In [None]:
# | eval: false
# | echo: true
# | code-fold: true
# | output-location: slide

# Report every path whose name does not end in "jpg" or "png". The suffix
# match is case-sensitive and has no leading dot, so ".jpeg" files are listed.
print("\nFiles other than jpg and png.\n")
files, _ = find_filepaths("data/2_processed/")
print("\n".join(name for name in files if not name.endswith(("jpg", "png"))))


Files other than jpg and png.

data/2_processed/Coal/dataset1_Coal_025_12.jpeg
data/2_processed/Coal/dataset1_Coal_070_162.jpeg
data/2_processed/Coal/dataset1_Coal_071_163.jpeg
data/2_processed/Coal/dataset1_Coal_072_164.jpeg
data/2_processed/Coal/dataset1_Coal_073_165.jpeg
data/2_processed/Coal/dataset1_Coal_074_166.jpeg
data/2_processed/Coal/dataset1_Coal_075_167.jpeg
data/2_processed/Coal/dataset1_Coal_076_168.jpeg
data/2_processed/Coal/dataset1_Coal_077_169.jpeg
data/2_processed/Coal/dataset1_Coal_079_170.jpeg
data/2_processed/Coal/dataset1_Coal_080_171.jpeg
data/2_processed/Coal/dataset1_Coal_081_172.jpeg
data/2_processed/Coal/dataset1_Coal_082_173.jpeg
data/2_processed/Coal/dataset1_Coal_083_174.jpeg
data/2_processed/Coal/dataset1_Coal_084_175.jpeg
data/2_processed/Coal/dataset1_Coal_085_176.jpeg
data/2_processed/Coal/dataset1_Coal_086_177.jpeg
data/2_processed/Coal/dataset1_Coal_087_178.jpeg
data/2_processed/Coal/dataset1_Coal_088_179.jpeg
data/2_processed/Coal/dataset1_Coal_09

## 3. List file by types before cleaning.


In [None]:
# | eval: false
# | echo: false

# Tally the files under data/2_processed by extension (the `file_type` series
# shown in the output below) to get a baseline before cleaning.
print("\nFile types before cleaning:")
get_value_counts("data/2_processed")


File types before cleaning:
.jpg     2550
.jpeg      28
.JPEG      20
.png       17
.jfif       7
.webp       7
Name: file_type, dtype: int64


## 4. Remove

- Bad Images
- Duplicate Images
- Misclassified Images
- Unsupported Images
- Corrupted Images


In [None]:
show_doc(clean_images)

---

[source](https://github.com/udaylunawat/Whats-this-rock/blob/main/rocks_classifier/data/utils.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### clean_images

>      clean_images (cfg)

Removes bad, misclassified, duplicate, corrupted and unsupported images.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| cfg | cfg (omegaconf.DictConfig) | Hydra Configuration |

In [None]:
# | eval: false
# | echo: false

# Run the cleaning step with the configuration loaded from configs/config.yaml.
clean_images(cfg)

## 5. List file by types after cleaning.


In [None]:
# | eval: false
# | echo: false

# Repeat the per-extension tally so it can be compared with the counts taken
# before cleaning.
# NOTE(review): the saved output below is identical to the "before cleaning"
# output — confirm whether it was captured before `clean_images` actually ran.
print("\nFile types after cleaning:")
get_value_counts("data/2_processed")


File types after cleaning:
.jpg     2550
.jpeg      28
.JPEG      20
.png       17
.jfif       7
.webp       7
Name: file_type, dtype: int64


## 6. Get count of files by class types.


In [None]:
# | eval: false
# | echo: false

# Tally the files per rock class (the `class` series shown below); these are
# the class sizes that the sampling step in the next section works from.
print("\nCounts of classes:\n")
get_value_counts("data/2_processed", column="class")


Counts of classes:

Quartzite    517
Coal         469
Limestone    452
Marble       427
Sandstone    370
Granite      214
Basalt       180
Name: class, dtype: int64


## 7. Handle Imbalance

Using Undersampling, Oversampling and No Sampling.

In [None]:
show_doc(sampling)

---

[source](https://github.com/udaylunawat/Whats-this-rock/blob/main/rocks_classifier/data/utils.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### sampling

>      sampling (cfg)

Oversamples/Undersample/No Sampling data into train, val, test.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| cfg | cfg (omegaconf.DictConfig) | Hydra Configuration |

In [None]:
# | eval: false
# | echo: false

# Run the sampling/splitting step with the configuration loaded from
# configs/config.yaml.
sampling(cfg)

## Putting it all together

`process_data` wraps all the above functions.

In [None]:
# | export


def _files_without_jpg_png_suffix(files):
    """Return the entries of `files` whose path does not end in "jpg" or "png".

    The check is a plain, case-sensitive suffix match without a leading dot,
    which is why e.g. ".jpeg" paths are reported.
    """
    return [path for path in files if not path.endswith(("jpg", "png"))]


@hydra.main(config_path=".", config_name="config", version_base="1.2")
def process_data(cfg):
    """Move the raw images into data/2_processed, clean them, and split them into train, val and test.

    Steps -> `move_to_processed` -> `find_filepaths` -> `get_value_counts` -> `clean_images` -> `get_value_counts` -> `sampling`

    This function does not call `download_and_move_datasets`; the datasets are
    presumably expected to be downloaded before it runs.

    Parameters
    ----------
    cfg : omegaconf.DictConfig
        Hydra Configuration
    """
    # Gather both source datasets into per-class folders under data/2_processed.
    move_to_processed()

    # Report files that are neither "jpg" nor "png" before anything is removed.
    print("\n\nFiles other than jpg and png.\n")
    files, _ = find_filepaths("data/2_processed/")
    print("\n".join(_files_without_jpg_png_suffix(files)))

    # NOTE(review): the return values of `get_value_counts` are discarded here;
    # this assumes the helper prints its result itself — confirm, otherwise the
    # three headings below are printed without any counts.
    print("\nFile types before cleaning:")
    get_value_counts("data/2_processed")

    clean_images(cfg)

    print("\nFile types after cleaning:")
    get_value_counts("data/2_processed")

    print("\nCounts of classes:\n")
    get_value_counts("data/2_processed", column="class")

    sampling(cfg)

In [None]:
# | hide
# | eval: true

# Run the whole pipeline with the config loaded above. This cell is marked
# `eval: true`, whereas the individual step cells are `eval: false`.
# NOTE(review): `process_data` is wrapped by `hydra.main`; calling it with an
# already-built config relies on the wrapper passing that config straight
# through — confirm against the installed Hydra version.
process_data(cfg)

In [None]:
show_doc(process_data)

---

[source](https://github.com/udaylunawat/Whats-this-rock/blob/main/rocks_classifier/data/preprocess.py#L18){target="_blank" style="float:right; font-size:smaller"}

### process_data

>      process_data (cfg)

Download dataset, removes unsupported and corrupted images, and splits data into train, val and test.
Steps -> `download_and_move_datasets` -> `move_to_processed` -> 'find_filepaths' -> `clean_images` -> `sampling`

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| cfg | cfg (omegaconf.DictConfig): | Hydra Configuration |

In [None]:
# | hide
from nbdev import nbdev_export

# Write the cells marked `# | export` out to the Python package; this notebook
# declares `default_exp data.preprocess` as its target module.
nbdev_export()