# TODO format with PEP8

In [1]:
# !pip install X

In [2]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np
import requests
import re

# Acquiring the NIH Pill image data from the web

## Explaining Imports:

| Import name | Usage |
| - | - |
| pathlib.Path | folder/directory management |
| pandas | data management |
| numpy | data management |
| requests | accessing the web |
| re | (regex) string simplification |

## Downloading checkpoint:

This is a safety measure in case your internet connection breaks during the downloading process (just like mine did), leaving only a portion of the desired data downloaded. The CHECKPOINT lets you resume the downloading process from the N-th photo. Note that the CHECKPOINT must be lower than the number of photos to download; otherwise, the download will not start.

In [3]:
# Index of the first image to download; download_imgs starts its loop at this row.
# Default is 0 (start from the beginning); the folders are only (re)created when it is 0.
CHECKPOINT = 0

## Constant variables:

In [4]:
# File listing the chosen pill categories
CATEGORIES_FILE = ".\\tablets_list.txt"

# Regex for the usp suffixes of the pill names
# NOTE(review): 'p*' means "zero or more p", so a name ending in "us" matches as well - confirm this is intended
USP_REGEX = "usp*$|USP*$|Usp*$|usp dosepack*$"

# Root folder of the downloaded images
IMG_PATH = ".\\images"

# Reserved folders inside the 'images' folder; should be created/deleted manually.
# The order matters, because the entries are also addressed by index.
RESERVED = [
    Path(f"{IMG_PATH}\\{reserved_name}")
    for reserved_name in (".cropped", ".backup", ".train", ".valid", ".test")
]

### prepare_for_csv function:

Used to prepare the dataframe to save it as a *\*.csv* file, consisting only of desired pills from the *tablets_list.txt* file.

In [5]:
def prepare_for_csv(df: np.ndarray, cats: list):
    """
    Extract the desired rows from the numpy array and format them for a *.csv file.

    A row is kept only if it has both of these traits:
        - its image path (column 2) contains ".JPG"
        - its name (column 1) equals one of our desired categories

    For every kept row:
        - column 0 is replaced with a sequential id, starting at 0
        - the name loses its special characters and its usp suffix, its runs of
          spaces/asterisks become underscores, and it is lower-cased
        - the image path is prefixed with the NIH Pill image data url
    Names and urls are wrapped in double quotes.

    Parameters:
        df: 2D array with three columns: (overwritten by the id), name, image path.
        cats: names of the desired categories.

    Returns:
        An object array whose first row is the '"Id"', '"Name"', '"Image"' header,
        followed by one row per kept image. If no row is valid, only the header
        row is returned.
    """

    # A set is searched in constant time; the list would be scanned for every row
    wanted = set(cats)

    # Rows are collected in a list, because np.append copies the whole array on every call.
    # Starting with the header also keeps the result well-formed when nothing is kept.
    rows = [['"Id"', '"Name"', '"Image"']]

    for row in df:

        name, image = row[1], row[2]

        # Non-string cells (e.g. NaN for an empty spreadsheet cell) cannot hold an image path
        if not isinstance(image, str):
            continue

        # Keep only valid rows
        if (name not in wanted) or (".JPG" not in image):
            continue

        # Simplify the name
        name = re.sub("[^A-Za-z0-9 *]+", "", name)
        name = re.sub(USP_REGEX, "", name).rstrip()
        name = f"\"{re.sub('[ *]+', '_', name)}\"".lower()

        # Fix the url
        url = f'"https://data.lhncbc.nlm.nih.gov/public/Pills/{image}"'

        # The header occupies rows[0], so the id of the next row is len(rows) - 1
        rows.append([len(rows) - 1, name, url])

    return np.array(rows, dtype=object)

## Execution Part 1:

Used to prepare the data for the downloading process (i.e., creating a data list of the desired photos to download).

In [6]:
# Read the desired categories from the categories file, one category per line
cats = Path(CATEGORIES_FILE).read_text().split("\n")

In [7]:
# Load the reference dataframe from the directory_of_images.xlsx spreadsheet
ref_df = pd.read_excel(".\\directory_of_images.xlsx")

In [8]:
# Prepare the new dataframe:
# keep only the "Part", "Name" and "Image" columns as a numpy array,
# then filter and clean its rows (prepare_for_csv overwrites the first column with ids)
df = ref_df[["Part", "Name", "Image"]].to_numpy()
df = prepare_for_csv(df, cats)

In [9]:
# Save the dataframe as a *.csv file
# (comma-separated, every cell written as plain text with "%s")
np.savetxt(".\\data_list.csv", df, delimiter=",", fmt="%s")

### make_img_path function:

Used to create an image path out of a given name.

In [10]:
def make_img_path(name: str) -> Path:
    """
    Build the pathlib.Path object of the entry called `name` inside the 'images' folder.
    """
    joined = "{}\\{}".format(IMG_PATH, name)
    return Path(joined)

### create_folders function:

Used to create folders out of a prepared dataframe.

In [11]:
def create_folders(df: pd.DataFrame):
    """
    Create one folder per pill name for the downloading process.

    Previous data is cleared first. Inside the 'images' folder:
        - files are deleted
        - every folder that is not in RESERVED is emptied
        - emptied folders that do not match a name from `df` are removed
    Folders listed in RESERVED are never touched.
    If the 'images' folder does not exist, it is created instead.

    Parameters:
        df: dataframe with a "Name" column; every unique name becomes a folder.
    """

    # Imported here so that this cell does not depend on a change to the imports cell
    import shutil

    # Convert names to paths; the set drops duplicates and makes the lookup below cheap
    cat_paths = {make_img_path(name) for name in df["Name"]}

    img_root = Path(IMG_PATH)

    # Clear previous data
    # or create the 'images' folder if it does not exist
    if img_root.is_dir():

        # The listings are copied with list(), because entries are deleted inside the loops
        for entry in list(img_root.iterdir()):

            # Delete the file; entry should be a directory
            if entry.is_file():
                entry.unlink()
                continue

            # Reserved folders are managed manually
            if entry in RESERVED:
                continue

            # Delete the contents
            for obj in list(entry.iterdir()):
                # Path.unlink() does not remove directories
                # (on Windows it fails with error 5), so they are removed as a whole tree.
                # A link to a directory is only unlinked, its target is left alone.
                if obj.is_dir() and not obj.is_symlink():
                    shutil.rmtree(obj)
                else:
                    obj.unlink(missing_ok=True)

            # Delete useless folders
            if entry not in cat_paths:
                entry.rmdir()
    else:
        img_root.mkdir(parents=True)

    # Create folders for all categories
    for cat in cat_paths:
        cat.mkdir(parents=True, exist_ok=True)

### download_imgs function:

Used to download the data from the web (i.e., the NIH Pill image data).

In [12]:
def download_imgs(df: pd.DataFrame, silent=False, timeout=30):
    """
    Download images listed in the dataframe, starting at row CHECKPOINT.

    The image of row i is saved as '<i>.jpg' in the folder named after the row's "Name".
    Rows whose folder does not exist are skipped without sending a request, and
    responses with a status other than 200 are not saved.

    Parameters:
        df: dataframe with the "Name" (folder name) and "Image" (url) columns.
        silent: optional; silences the output.
        timeout: optional; seconds to wait for the server before a request is given up.

    Raises:
        requests.RequestException: when a request fails or times out. Unless silenced,
            the row to resume from is printed first.
    """

    names = df["Name"]
    urls = df["Image"]

    # Open a session
    with requests.Session() as sess:

        for i in range(CHECKPOINT, len(df)):

            # Set path and url variables
            path = f"{IMG_PATH}\\{names[i]}"
            url = urls[i]

            if not Path(path).is_dir():
                continue

            # Get the response; without a timeout a stalled connection would wait forever
            try:
                response = sess.get(url, timeout=timeout)
            except requests.RequestException:
                if not silent:
                    print(f"Download failed at file {i}.jpg; set CHECKPOINT = {i} to resume.")
                raise

            # Report the files that could not be downloaded instead of skipping them silently
            if response.status_code != 200:
                if not silent:
                    print(f"File {i}.jpg was not saved (HTTP status {response.status_code}).")
                continue

            # Download the image
            with open(f"{path}\\{i}.jpg", "wb") as d_file:
                d_file.write(response.content)

            if not silent:
                print(f"File {i}.jpg was saved in {path}.")

## Execution Part 2:

In [13]:
# Extract data from the data_list.csv file
# (the first line of the file is used as the column names)
df = pd.read_csv(".\\data_list.csv")

In [14]:
# Create folders for the downloading process
# (skipped when resuming from a checkpoint, because create_folders clears the previously downloaded data)
if CHECKPOINT == 0:
    create_folders(df)

In [15]:
# Download images to created folders
# (there is nothing to download when the checkpoint is not lower than the number of rows)
if CHECKPOINT < len(df):
    download_imgs(df)

File 0.jpg was saved in .\images\amiloride_hcl_and_hydrochlorothiazide_tablets.
File 1.jpg was saved in .\images\amiloride_hcl_and_hydrochlorothiazide_tablets.
File 2.jpg was saved in .\images\amiloride_hcl_and_hydrochlorothiazide_tablets.


#### Warning!

Unfortunately, the database used here, as well as the other outside image sources used in our project, consists of uncropped images; consequently, all images downloaded through this *\*.ipynb* script should be manually cropped and put into the **data\\images\\.cropped\\** folder. Not cropping the images will result in worse performance of the final model.

As a reminder, all cropped data should be available to download from the link located in the main *README.md* file.

## Renaming images prepared for training

Use this codeblock in case you made any changes to the image set, such as renaming files or adding more.

In [16]:
def _renumber_images(folder: Path) -> None:
    """
    Rename the files directly inside `folder` so that they are called 0.jpg, 1.jpg, ...

    Files that already carry one of these names stay in place; every other file takes
    the lowest name that is still free, so no existing file is overwritten.
    """
    # Sorted, so that the result does not depend on the order in which iterdir() lists the folder
    files = sorted(path for path in folder.iterdir() if path.is_file())
    targets = [folder / f"{num}.jpg" for num in range(len(files))]

    # Files that are already correctly named keep their slot
    in_place = set(files).intersection(targets)
    free_targets = (target for target in targets if target not in in_place)

    for file in files:
        if file not in in_place:
            # There are exactly as many free targets as files left to rename
            file.rename(next(free_targets))


# RESERVED[2] is the '.train' folder; presumably it holds one folder of images per category
if RESERVED[2].is_dir():
    for category_dir in RESERVED[2].iterdir():
        # Only folders are renumbered; files placed directly in '.train' are left alone
        if category_dir.is_dir():
            _renumber_images(category_dir)