In [None]:
#| default_exp data.download

# 1) Download dataset

> We'll create the project directory structure and download the datasets.

- We'll create and use some bash scripts to create a directory structure for our project
- We'll be following the [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/) project template by DrivenData

In [None]:
#| hide

import os

In [None]:
#| hide

# Move from the notebook folder up to the repository root so that the relative
# paths used below ('rocks_classifier/...', 'data/...') resolve.
# The guard keeps the cell idempotent: re-running it must not climb another level.
# NOTE(review): assumes the notebook folder itself has no 'rocks_classifier' directory - confirm.
if not os.path.isdir('rocks_classifier'):
    os.chdir('../')

In [None]:
#| hide

from nbdev.config import *
from nbdev.showdoc import *

In [None]:
#| hide

# Package name as declared in the nbdev settings; the generated shell scripts
# are stored in its `scripts` sub-folder.
repo_name = get_config().lib_name
# Pure-Python equivalent of `mkdir -p`: creates missing parents and is a no-op
# when the folder already exists.
os.makedirs(os.path.join(repo_name, 'scripts'), exist_ok=True)

In [None]:
#| export
#| hide

import os
from rocks_classifier.data.utils import timer_func, find_filepaths

## Creating project directory structure

Let's create a script `scripts/clean_dir.sh` 

> It creates the directory structure, and clears existing data files.

```bash
sh scripts/clean_dir.sh
```

In [None]:
#| hide

# Contents of the shell script that resets the data folders.
clean_dir_script = '''\
#!/bin/bash

# setting up data dir
rm -rf data/1_extracted/* data/2_processed/* data/3_tfds_dataset/*
rm -rf data/corrupted_images/* data/duplicate_images/* data/bad_images/* data/misclassified_images/*
mkdir -p data/0_raw data/1_extracted data/2_processed data/3_tfds_dataset
mkdir -p data/corrupted_images data/duplicate_images data/bad_images data/misclassified_images/ checkpoints

mkdir -p data/2_processed/Coal/
mkdir -p data/2_processed/Basalt/
mkdir -p data/2_processed/Granite/
mkdir -p data/2_processed/Marble/
mkdir -p data/2_processed/Quartzite/
mkdir -p data/2_processed/Limestone/
mkdir -p data/2_processed/Sandstone/
'''

# Persist the script inside the package so it can be run later.
with open('rocks_classifier/scripts/clean_dir.sh', 'w') as script_file:
    script_file.write(clean_dir_script)

In [None]:
%%bash
#|eval: false
#|code-fold: true

#!/bin/bash

# Resets the project's data folders: empties the derived-data directories and
# then (re)creates the full directory layout. data/0_raw is never emptied here.

# setting up data dir
# Remove everything matched by the globs inside the derived-data folders.
rm -rf data/1_extracted/* data/2_processed/* data/3_tfds_dataset/*
rm -rf data/corrupted_images/* data/duplicate_images/* data/bad_images/* data/misclassified_images/*
# -p: create missing parents and do not fail when a directory already exists.
mkdir -p data/0_raw data/1_extracted data/2_processed data/3_tfds_dataset
mkdir -p data/corrupted_images data/duplicate_images data/bad_images data/misclassified_images/ checkpoints

# One folder per rock class under data/2_processed.
mkdir -p data/2_processed/Coal/
mkdir -p data/2_processed/Basalt/
mkdir -p data/2_processed/Granite/
mkdir -p data/2_processed/Marble/
mkdir -p data/2_processed/Quartzite/
mkdir -p data/2_processed/Limestone/
mkdir -p data/2_processed/Sandstone/

## Creating scripts to download and setup datasets

### Dataset 1

Downloads [Dataset1](https://www.kaggle.com/datasets/salmaneunus/rock-classification) and moves the extracted files to `data/1_extracted/dataset1`.

```bash
sh scripts/dataset1.sh
```

In [None]:
#| hide

# Contents of the shell script that downloads and flattens dataset 1.
dataset1_script = '''\
#!/bin/bash

# dataset 1 processing
wget --quiet -O data/0_raw/rock-classification.zip -nc https://huggingface.co/datasets/udayl/rocks/resolve/main/rock-classification.zip
unzip -qn data/0_raw/rock-classification.zip -d data/1_extracted/
mv -vn data/1_extracted/Dataset data/1_extracted/dataset1

mv data/1_extracted/dataset1/Igneous/* data/1_extracted/dataset1/
mv data/1_extracted/dataset1/Metamorphic/* data/1_extracted/dataset1/
mv data/1_extracted/dataset1/Sedimentary/* data/1_extracted/dataset1/

rm -rf data/1_extracted/dataset1/Igneous/
rm -rf data/1_extracted/dataset1/Metamorphic/
rm -rf data/1_extracted/dataset1/Sedimentary/
'''

# Persist the script inside the package so it can be run later.
with open('rocks_classifier/scripts/dataset1.sh', 'w') as script_file:
    script_file.write(dataset1_script)

In [None]:
%%bash

#|eval: false
#|code-fold: true

#!/bin/bash

# Fetches the dataset 1 archive into data/0_raw, extracts it into
# data/1_extracted/dataset1 and flattens the rock-type grouping folders.

# dataset 1 processing
# -nc: do not download again when the archive is already present.
wget --quiet -O data/0_raw/rock-classification.zip -nc https://huggingface.co/datasets/udayl/rocks/resolve/main/rock-classification.zip
# -q quiet, -n never overwrite files that were already extracted.
unzip -qn data/0_raw/rock-classification.zip -d data/1_extracted/
# -v prints the rename, -n refuses to overwrite an existing destination.
mv -vn data/1_extracted/Dataset data/1_extracted/dataset1

# Lift the contents of each grouping folder one level up...
mv data/1_extracted/dataset1/Igneous/* data/1_extracted/dataset1/
mv data/1_extracted/dataset1/Metamorphic/* data/1_extracted/dataset1/
mv data/1_extracted/dataset1/Sedimentary/* data/1_extracted/dataset1/

# ...then drop the grouping folders themselves.
rm -rf data/1_extracted/dataset1/Igneous/
rm -rf data/1_extracted/dataset1/Metamorphic/
rm -rf data/1_extracted/dataset1/Sedimentary/

data/1_extracted/Dataset -> data/1_extracted/dataset1


### Dataset 2

Downloads [Dataset2](https://www.kaggle.com/datasets/mahmoudalforawi/igneous-metamorphic-sedimentary-rocks-and-minerals) and moves the extracted files to `data/1_extracted/dataset2`.

```bash
sh scripts/dataset2.sh
```

In [None]:
#| hide

# Contents of the shell script that downloads and reorganises dataset 2.
# A raw string is required: the script escapes spaces in folder names with a
# backslash ("igneous\ rocks"), and in a normal Python string literal "\ " is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
# The bytes written to disk are unchanged.
dataset2_script = r'''#!/bin/bash

# dataset 2 processing
wget --quiet -O data/0_raw/igneous-metamorphic-sedimentary-rocks-and-minerals.zip -nc https://huggingface.co/datasets/udayl/rocks/resolve/main/igneous-metamorphic-sedimentary-rocks-and-minerals.zip
unzip -qn data/0_raw/igneous-metamorphic-sedimentary-rocks-and-minerals.zip -d data/1_extracted/
mv data/1_extracted/Rock_Dataset data/1_extracted/dataset2

rm -rf data/1_extracted/dataset2/minerals

mv data/1_extracted/dataset2/igneous\ rocks/Basalt data/1_extracted/dataset2/
mv data/1_extracted/dataset2/igneous\ rocks/granite data/1_extracted/dataset2/
mv data/1_extracted/dataset2/metamorphic\ rocks/marble data/1_extracted/dataset2/
mv data/1_extracted/dataset2/metamorphic\ rocks/quartzite data/1_extracted/dataset2/
mv data/1_extracted/dataset2/sedimentary\ rocks/Limestone data/1_extracted/dataset2/
mv data/1_extracted/dataset2/sedimentary\ rocks/Sandstone data/1_extracted/dataset2/
mv data/1_extracted/dataset2/sedimentary\ rocks/coal data/1_extracted/dataset2/

mv data/1_extracted/dataset2/granite data/1_extracted/dataset2/Granite
mv data/1_extracted/dataset2/marble data/1_extracted/dataset2/Marble
mv data/1_extracted/dataset2/quartzite data/1_extracted/dataset2/Quartzite
mv data/1_extracted/dataset2/coal data/1_extracted/dataset2/Coal

rm -rf data/1_extracted/dataset2/igneous\ rocks
rm -rf data/1_extracted/dataset2/metamorphic\ rocks
rm -rf data/1_extracted/dataset2/sedimentary\ rocks
'''

# Persist the script inside the package so it can be run later.
with open('rocks_classifier/scripts/dataset2.sh', 'w') as script_file:
    script_file.write(dataset2_script)

In [None]:
%%bash
#|eval: false
#|code-fold: true

#!/bin/bash

# Fetches the dataset 2 archive into data/0_raw, extracts it into
# data/1_extracted/dataset2, keeps only the rock-class folders handled below
# and gives them capitalised names.

# dataset 2 processing
# -nc: do not download again when the archive is already present.
wget --quiet -O data/0_raw/igneous-metamorphic-sedimentary-rocks-and-minerals.zip -nc https://huggingface.co/datasets/udayl/rocks/resolve/main/igneous-metamorphic-sedimentary-rocks-and-minerals.zip
# -q quiet, -n never overwrite files that were already extracted.
unzip -qn data/0_raw/igneous-metamorphic-sedimentary-rocks-and-minerals.zip -d data/1_extracted/
mv data/1_extracted/Rock_Dataset data/1_extracted/dataset2

# The minerals folder is discarded.
rm -rf data/1_extracted/dataset2/minerals

# Lift the selected class folders out of their rock-type parents
# (the backslash escapes the space in the parent folder name).
mv data/1_extracted/dataset2/igneous\ rocks/Basalt data/1_extracted/dataset2/
mv data/1_extracted/dataset2/igneous\ rocks/granite data/1_extracted/dataset2/
mv data/1_extracted/dataset2/metamorphic\ rocks/marble data/1_extracted/dataset2/
mv data/1_extracted/dataset2/metamorphic\ rocks/quartzite data/1_extracted/dataset2/
mv data/1_extracted/dataset2/sedimentary\ rocks/Limestone data/1_extracted/dataset2/
mv data/1_extracted/dataset2/sedimentary\ rocks/Sandstone data/1_extracted/dataset2/
mv data/1_extracted/dataset2/sedimentary\ rocks/coal data/1_extracted/dataset2/

# Capitalise the lower-case folder names.
mv data/1_extracted/dataset2/granite data/1_extracted/dataset2/Granite
mv data/1_extracted/dataset2/marble data/1_extracted/dataset2/Marble
mv data/1_extracted/dataset2/quartzite data/1_extracted/dataset2/Quartzite
mv data/1_extracted/dataset2/coal data/1_extracted/dataset2/Coal

# Remove the rock-type parents together with whatever is left inside them.
rm -rf data/1_extracted/dataset2/igneous\ rocks
rm -rf data/1_extracted/dataset2/metamorphic\ rocks
rm -rf data/1_extracted/dataset2/sedimentary\ rocks

# Download and verify the data 

```bash
python rocks_classifier/data/download.py
```

In [None]:
#| exports
#| code-fold: true


class download_datasets:
    """Download the datasets with the project's shell scripts and check their file counts.

    `data_dict` maps each dataset id to the shell script that fetches it
    (`"script"`) and to the number of files expected under
    `data/1_extracted/dataset<id>` (`"filecount"`).
    """

    data_dict = {
        1: {"script": "rocks_classifier/scripts/dataset1.sh", "filecount": 2083},
        2: {"script": "rocks_classifier/scripts/dataset2.sh", "filecount": 546},
    }

    @timer_func  # | hide_line
    def run_scripts(self):
        """
        Download the datasets using scripts.

        A dataset whose extraction folder already exists is not downloaded
        again; its file count is verified instead. Otherwise the dataset's
        script is run with `sh`, and a non-zero exit status is reported.
        """

        for dataset_id in self.data_dict:
            if self.files_exists(dataset_id):
                print(f"Dataset{dataset_id} already exists.")
                self.verify_files(dataset_id)
            else:
                print(f"Downloading dataset {dataset_id}...")
                script = self.data_dict[dataset_id]["script"]
                status = os.system(f"sh {script}")
                # Surface failures instead of dropping the exit status silently.
                if status != 0:
                    print(f"Script {script} exited with status {status}.")

    def _dataset_dir(self, dataset_id):
        """Return the folder the dataset with id `dataset_id` is extracted to."""
        return os.path.join("data", "1_extracted", f"dataset{dataset_id}")

    def files_exists(self, dataset_id):
        """Return True when the dataset's extraction folder exists, else False.

        This is a pure check: verification is left to the caller, so an
        existing dataset is no longer counted and reported twice.
        """
        return os.path.exists(self._dataset_dir(dataset_id))

    def verify_files(self, dataset_id):
        """Verify the file count of a dataset.

        Compares the count reported by `find_filepaths` for the dataset folder
        with the expected `"filecount"` and prints it.

        Raises:
            AssertionError: if the two counts differ.
        """
        dataset_dir = self._dataset_dir(dataset_id)
        # Only the second element (the count) of the helper's result is needed.
        _, count = find_filepaths(dataset_dir)
        expected = self.data_dict[dataset_id]["filecount"]
        assert count == expected, (
            f"dataset{dataset_id}: expected {expected} files in {dataset_dir}, found {count}"
        )
        print(f"Total Files in dataset{dataset_id}:- {count}.\n")
        

In [None]:
%%bash
#| hide
rocks_clean_data

In [None]:
#| hide

# Fetch every dataset that is not yet extracted; existing ones are only verified.
dataset_downloader = download_datasets()
dataset_downloader.run_scripts()

Downloading dataset 1...
data/1_extracted/Dataset -> data/1_extracted/dataset1
Downloading dataset 2...
Function 'run_scripts' executed in 7.6447s


In [None]:
#| hide

# Sanity check: the working directory must contain both the data folder and the package.
# The directory is listed once, and the builtin `dir` is no longer shadowed.
missing_dirs = {'data', 'rocks_classifier'} - set(os.listdir())
assert not missing_dirs, f"Missing in {os.getcwd()}: {sorted(missing_dirs)}"

In [None]:
#| hide
# Run nbdev's exporter, which handles the cells tagged for export.
from nbdev import nbdev_export as export_notebooks
export_notebooks()