In [None]:
#| default_exp data.preprocess

In [None]:
#| hide
#| eval: false

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#| hide

import os

In [None]:
#| hide

os.chdir('../')
# Sanity check: after moving up one level, both the `data` and the
# `rocks_classifier` directories must be present in the working directory.
assert {'data', 'rocks_classifier'} <= set(os.listdir())

# Preprocess Data
> Move images from both datasets into folders for their respective class labels, and remove duplicate, bad, and corrupted images.

# Steps

> This section describes the steps of the `process_data` function, which combines the following functions:

1. Clear data directory, Download and move datasets.
2. Rename and move files to data/2_processed.
3. List files other than jpg and png, to remove unsupported files.
4. List files by type before cleaning.
5. Remove
    - Bad Images
    - Duplicate Images
    - Misclassified Images
    - Unsupported Images
    - Corrupted Images
6. List files by type after cleaning.
7. Get count of files by class types.
8. Handle Imbalance using Undersampling, Oversampling.

In [None]:
#| hide

from nbdev.showdoc import *

In [None]:
#| export
#| hide

import os
import requests
import subprocess
import logging
import hydra, omegaconf

from rocks_classifier.data.download import download_and_move_datasets
from rocks_classifier.data.utils import *

In [None]:
#| hide

%whos

In [None]:
#| hide
#| eval: false

download_configs()

In [None]:
#| hide

# path = 'configs/config.yaml'
# cfg = omegaconf.OmegaConf.load(path)

## 1. Download and move datasets.
> Also clears the data directory if files already exist.

In [None]:
show_doc(download_and_move_datasets)

In [None]:
#|echo: false
#| eval: false

download_and_move_datasets()

## 2. Rename and move files to data/2_processed.

In [None]:
show_doc(move_to_processed)

In [None]:
#|echo: false
#| eval: false

move_to_processed()

## 3. List files other than jpg and png, to remove unsupported files.

In [None]:
#| eval: false
#|echo: true
#|code-fold: true

# Show every path under data/2_processed whose name does not end in jpg or png.
print("\nFiles other than jpg and png.\n")
files, _ = find_filepaths('data/2_processed/')
unsupported = [path for path in files if not path.endswith(('jpg', 'png'))]
print('\n'.join(unsupported))

## 4. List files by type before cleaning.


In [None]:
#| eval: false
#|echo: false

print("\nFile types before cleaning:")
get_value_counts("data/2_processed")

## 5. Remove
    - Bad Images
    - Duplicate Images
    - Misclassified Images
    - Unsupported Images
    - Corrupted Images


In [None]:
show_doc(clean_images)

In [None]:
#| eval: false
#|echo: false

clean_images(cfg)

## 6. List files by type after cleaning.


In [None]:
#| eval: false
#|echo: false

print("\nFile types after cleaning:")
get_value_counts("data/2_processed")

## 7. Get count of files by class types.


In [None]:
#| eval: false
#|echo: false

print("\nCounts of classes:\n")
get_value_counts("data/2_processed", column="class")

## 8. Handle Imbalance

Using Undersampling, Oversampling and No Sampling.

In [None]:
show_doc(sampling)

In [None]:
#| eval: false
#| echo: false

sampling(cfg)

## Putting it all together

`process_data` wraps all the above functions.

In [None]:
#| export

@hydra.main(config_path="../../configs", config_name="config", version_base="1.2")
def process_data(cfg):
    """Run the whole preprocessing pipeline, one step after another.

    Steps, in order:

    1. Download the configs, then download and move the datasets.
    2. Move the files to ``data/2_processed``.
    3. Print every file path that does not end in ``jpg`` or ``png``.
    4. Report file types, clean the images, and report file types again.
    5. Report the per-class counts and apply sampling to handle imbalance.

    Parameters
    ----------
    cfg : omegaconf.DictConfig
        Hydra configuration; passed through to `clean_images` and `sampling`.
    """
    processed_dir = "data/2_processed"

    # Fetch configs and raw data, then stage the files for processing.
    download_configs()
    download_and_move_datasets()
    move_to_processed()

    # Surface anything that is not a jpg/png before the cleaning step runs.
    print("\n\nFiles other than jpg and png.\n")
    files, _ = find_filepaths(f"{processed_dir}/")
    non_images = [path for path in files if not path.endswith(('jpg', 'png'))]
    print('\n'.join(non_images))

    # Same report before and after cleaning so the effect is visible.
    print("\nFile types before cleaning:")
    get_value_counts(processed_dir)

    clean_images(cfg)

    print("\nFile types after cleaning:")
    get_value_counts(processed_dir)

    print("\nCounts of classes:\n")
    get_value_counts(processed_dir, column="class")

    sampling(cfg)


In [None]:
show_doc(process_data)

In [None]:
#| hide

%%bash
rocks_process_data

In [None]:
#| hide
from nbdev import nbdev_export
nbdev_export()