# Downloading Data

> Downloading 4 datasets.

## Downloading datasets
### Dataset 1

```bash
# Download the dataset archive from Kaggle into data/0_raw/.
kaggle datasets download salmaneunus/rock-classification --path data/0_raw/
# Extract quietly (-q) without overwriting files that already exist (-n).
unzip -qn data/0_raw/rock-classification.zip -d data/1_extracted/
# Rename the extracted folder to dataset1 (-v: report the move, -n: do not overwrite).
mv -vn data/1_extracted/Dataset data/1_extracted/dataset1
```

### Dataset 2

```bash
# Download the dataset archive from Kaggle into data/0_raw/.
kaggle datasets download mahmoudalforawi/igneous-metamorphic-sedimentary-rocks-and-minerals --path data/0_raw/
# Extract quietly (-q) without overwriting files that already exist (-n).
unzip -qn data/0_raw/igneous-metamorphic-sedimentary-rocks-and-minerals.zip -d data/1_extracted/
# Rename the extracted folder to dataset2.
mv data/1_extracted/Rock_Dataset data/1_extracted/dataset2

# Drop the minerals folder (presumably because only rock classes are wanted -- confirm).
rm -rf data/1_extracted/dataset2/minerals
```

In [None]:
#| export data.utils

import os
from time import time

def timer_func(func):
    """Decorate ``func`` so that every call prints its execution time.

    Parameters
    ----------
    func : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time in seconds (measured with
        ``time.time``) and returns whatever ``func`` returned.  If ``func``
        raises, the exception propagates and nothing is printed.
    """
    # Imported locally so this block does not rely on a new module-level import.
    from functools import wraps

    # Not every callable has a __name__ (e.g. functools.partial objects).
    name = getattr(func, "__name__", repr(func))

    @wraps(func)  # keep __name__, __doc__, ... of the decorated function
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f"Function {name!r} executed in {(t2-t1):.4f}s")
        return result

    return wrap_func


def find_filepaths(root_folder: str):
    """Collect the path of every file found beneath ``root_folder``.

    Parameters
    ----------
    root_folder : str
        Directory at which the recursive walk starts.

    Returns
    -------
    tuple
        ``(filepaths, count)`` where ``filepaths`` is a list holding one
        path per file (the directory reported by ``os.walk`` joined with the
        file name) and ``count`` is the length of that list.
    """
    collected = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_folder)
        for name in names
    ]
    total = len(collected)
    return collected, total


def get_new_name(dir_list: list) -> dict:
    '''Map every file under the given directories to its new processed path.

    Each file path is expected to end in ``<dataset>/<class>/<file>``.  The
    new path is
    ``data/2_processed/<class>/<dataset>_<class>_<NNN>_<name><ext>`` where
    ``NNN`` is a running counter (starting at 1, zero-padded to three digits)
    over all files collected from ``dir_list`` in this call, e.g.

    {'data/1_extracted/dataset1/Basalt/14.jpg': 'data/2_processed/Basalt/dataset1_Basalt_001_14.jpg'}

    Parameters
    ----------
    dir_list : list
        list of directories that needs to be combined

    Returns
    -------
    dict
        {old_path: new_path}

    Raises
    ------
    IndexError
        If a collected file path has fewer than three components.
    '''
    file_list = []
    for directory in dir_list:
        paths, _ = find_filepaths(directory)
        file_list.extend(paths)

    file_dict = {}
    for count, file_path in enumerate(file_list, start=1):
        # os.walk joins with the platform separator; normalise to '/' so the
        # dataset and class folders are found on every platform.
        parts = file_path.replace(os.sep, '/').split('/')
        dataset, class_name, basename = parts[-3], parts[-2], parts[-1]
        file_name, extension = os.path.splitext(basename)
        new_file_name = os.path.join(
            'data',
            '2_processed',
            class_name,
            f'{dataset}_{class_name}_{str(count).zfill(3)}_{file_name}{extension}',
        )
        file_dict[file_path] = new_file_name

    return file_dict

In [None]:
#| export data.download

import os
import logging
from src.data.utils import timer_func, find_filepaths


@timer_func
def download_datasets():
    """Download every known dataset that has not been extracted yet.

    For each dataset in the internal table, the download script is run with
    ``sh`` when ``data/1_extracted/dataset<id>`` does not exist.  When the
    directory already exists, its files are counted and compared with the
    expected file count instead.

    Raises
    ------
    AssertionError
        If an existing dataset directory does not contain the expected
        number of files.
    """
    data_dict = {
        1: {"script": "src/scripts/dataset1.sh", "filecount": 2083},
        2: {"script": "src/scripts/dataset2.sh", "filecount": 4553},
    }
    for dataset_id, info in data_dict.items():
        dataset_dir = os.path.join("data", "1_extracted", f"dataset{dataset_id}")

        if not os.path.exists(dataset_dir):
            print(f"Downloading dataset {dataset_id}...")
            status = os.system(f"sh {info['script']}")
            if status != 0:
                # Surface a failed script instead of silently ignoring it.
                print(
                    f"Download script for dataset {dataset_id} "
                    f"returned non-zero status {status}."
                )
            continue

        _, count = find_filepaths(dataset_dir)
        # Explicit raise (rather than `assert`) so the check survives `python -O`;
        # AssertionError is kept so existing callers see the same exception type.
        if count != info["filecount"]:
            raise AssertionError(
                f"dataset{dataset_id}: expected {info['filecount']} files, "
                f"found {count}."
            )
        print(f"dataset{dataset_id} already exists.")
        print(f"Total Files in dataset{dataset_id}:- {count}.\n")


# Run the download only when this module is executed directly, not on import.
if __name__ == "__main__":
    download_datasets()


In [None]:
#| export data.utils

import shutil
from src.data.utils import find_filepaths, get_new_name

def move_to_processed():
    """Copy the files of both extracted datasets into ``data/2_processed``.

    The entries of ``data/1_extracted/dataset1`` and
    ``data/1_extracted/dataset2`` must have identical names.  For every such
    entry the files of both datasets are renamed via ``get_new_name`` and
    copied (the originals are left in place) to their new location; missing
    destination directories are created.

    Raises
    ------
    AssertionError
        If the two dataset directories do not contain the same entries.
    """
    dir1 = 'data/1_extracted/dataset1'
    dir2 = 'data/1_extracted/dataset2'

    # os.listdir returns entries in arbitrary order, so sort before comparing;
    # comparing the full lists also catches a differing number of entries,
    # which zip() would silently truncate.
    classes1 = sorted(os.listdir(dir1))
    classes2 = sorted(os.listdir(dir2))
    if classes1 != classes2:
        # AssertionError is kept so callers see the same exception type as before.
        raise AssertionError(
            f"Entries differ between {dir1} and {dir2}: "
            f"{sorted(set(classes1) ^ set(classes2))}"
        )

    for class_name in classes1:
        path_dict = get_new_name(
            [os.path.join(dir1, class_name), os.path.join(dir2, class_name)]
        )

        for old_path, new_path in path_dict.items():
            # shutil.copy does not create missing parent directories.
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            shutil.copy(old_path, new_path)

In [None]:
#| hide
# Run nbdev's export step (writes the `#| export` cells to their target modules).
import nbdev; nbdev.nbdev_export()