In [None]:
#| default_exp data.preprocess

In [None]:
#| hide

import os

In [None]:
#| hide

# Step up one directory so that relative paths used later in the notebook
# (e.g. 'data/...') resolve — presumably the notebook is launched from a
# subfolder of the repository root; confirm against the project layout.
# NOTE: this is relative to the current directory, so re-running this cell
# without restarting the kernel moves up one more level each time.
os.chdir('../')

# 3) Preprocess Data
> Move images from both datasets into folders named after their class labels, then remove duplicate, bad, and corrupted images.

## Steps

> This section describes the steps performed by the `process_data` function, which combines the following functions.

1. Rename and move files to data/2_processed.
2. List files other than jpg and png, to remove unsupported files.
3. List files by type before cleaning.
4. Remove
    - Bad Images
    - Duplicate Images
    - Misclassified Images
    - Unsupported Images
    - Corrupted Images
5. List files by type after cleaning.
6. Get count of files by class types.
7. Handle Imbalance using Undersampling, Oversampling.

In [None]:
#| hide

from nbdev.showdoc import *

In [None]:
#| hide

import os
import subprocess
import logging
import hydra

from rocks_classifier.data.utils import move_to_processed, find_filepaths

In [None]:
#| hide

# Sanity check: the current working directory must contain both the
# 'data' folder and the 'rocks_classifier' package folder.
assert {'data', 'rocks_classifier'} <= set(os.listdir())

### 1. Rename and move files to data/2_processed.

In [None]:
# Render the documentation of `move_to_processed` inline; the rendered
# signature and docstring appear as this cell's output.
show_doc(move_to_processed)

---

[source](https://github.com/udaylunawat/Whats-this-rock/blob/nbdev/rocks_classifier/data/utils.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### move_to_processed

>      move_to_processed ()

Combines files with same subclass and moves them to the subclass under data/2_processed.

Uses `get_new_name` to create new names of files and then rename them and copy to data/2_processed.

### 2. List files other than jpg and png, to remove unsupported files.

In [None]:
# List every path under data/2_processed/ whose name does not end in
# 'jpg' or 'png' (the suffix test is case-sensitive).
print("\n\nFiles other than jpg and png.\n")
files, _ = find_filepaths('data/2_processed/')
print('\n'.join(path for path in files if not path.endswith(('jpg', 'png'))))



Files other than jpg and png.

data/2_processed/.DS_Store
data/2_processed/Coal/dataset1_Coal_025_12.jpeg
data/2_processed/Coal/dataset1_Coal_070_162.jpeg
data/2_processed/Coal/dataset1_Coal_071_163.jpeg
data/2_processed/Coal/dataset1_Coal_072_164.jpeg
data/2_processed/Coal/dataset1_Coal_073_165.jpeg
data/2_processed/Coal/dataset1_Coal_074_166.jpeg
data/2_processed/Coal/dataset1_Coal_075_167.jpeg
data/2_processed/Coal/dataset1_Coal_076_168.jpeg
data/2_processed/Coal/dataset1_Coal_077_169.jpeg
data/2_processed/Coal/dataset1_Coal_079_170.jpeg
data/2_processed/Coal/dataset1_Coal_080_171.jpeg
data/2_processed/Coal/dataset1_Coal_081_172.jpeg
data/2_processed/Coal/dataset1_Coal_082_173.jpeg
data/2_processed/Coal/dataset1_Coal_083_174.jpeg
data/2_processed/Coal/dataset1_Coal_084_175.jpeg
data/2_processed/Coal/dataset1_Coal_085_176.jpeg
data/2_processed/Coal/dataset1_Coal_086_177.jpeg
data/2_processed/Coal/dataset1_Coal_087_178.jpeg
data/2_processed/Coal/dataset1_Coal_088_179.jpeg
data/2_pro

## Functions

In [None]:
#| export
#| hide

import os
import subprocess
import logging
import hydra

from rocks_classifier.data.download import download_datasets
from rocks_classifier.data.utils import (
    clean_download_files,
    find_filepaths,
    get_df,
    get_value_counts,
    move_to_processed,
    sampling,
    clean_images,
)

In [None]:
#| export

# @hydra.main(config_path="../../configs", config_name="config", version_base="1.2")
def process_data(config_path: str = 'configs/config.yaml'):
    """Download the datasets, clean the images in data/2_processed and resample the classes.

    The steps run in this order:

    1. Load the configuration from `config_path`.
    2. Call `clean_download_files()`, `download_datasets()` and
       `move_to_processed()` to fetch the data and gather it under
       data/2_processed.
    3. Print every file whose name does not end in 'jpg' or 'png'.
    4. Print the file-type counts, call `clean_images(cfg)`, then print the
       file-type counts again followed by the per-class counts.
    5. Call `sampling(cfg)` (per the notebook's step list, this handles class
       imbalance through under-/oversampling).

    Parameters
    ----------
    config_path : str, optional
        Path of the YAML configuration file loaded with
        `omegaconf.OmegaConf.load`. A relative path is resolved against the
        current working directory. Defaults to 'configs/config.yaml'.

    Returns
    -------
    None
        The function works through side effects on the filesystem and
        through printed output.
    """
    # Kept function-local, as before: omegaconf is not among the imports of
    # the exported import cell.
    import omegaconf

    # Load the configuration first so a missing or invalid config file fails
    # before any download or file move has happened.
    cfg = omegaconf.OmegaConf.load(config_path)

    clean_download_files()
    download_datasets()
    move_to_processed()

    print("\n\nFiles other than jpg and png.\n")
    files, _ = find_filepaths('data/2_processed/')
    # Case-sensitive suffix test; str.endswith accepts a tuple of suffixes.
    other_files = [path for path in files if not path.endswith(('jpg', 'png'))]
    print('\n'.join(other_files))

    print("\nFile types before cleaning:")
    get_value_counts("data/2_processed")

    clean_images(cfg)

    print("\nFile types after cleaning:")
    get_value_counts("data/2_processed")

    print("\nCounts of classes:\n")
    get_value_counts("data/2_processed", column="class")

    sampling(cfg)


In [None]:
#| hide
# Run nbdev's export step — presumably this writes the cells marked
# `#| export` to the module named by the `#| default_exp` directive at the
# top of this notebook; confirm against the nbdev documentation.
from nbdev import nbdev_export
nbdev_export()