# This notebook extracts images from the Dermnet website for all categories listed in the first table

Imports

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin 
import re
import os.path
import pickle
import urllib.request

In [2]:
#Constants - path definitions
# Landing page; the links in its first table are treated as the skin disease classes.
HOME_PAGE = "http://www.dermnet.com/dermatology-pictures-skin-disease-pictures/"
# Base address used to turn the hrefs found while scraping into absolute urls.
DOMAIN_PAGE = "http://www.dermnet.com/"
# Local folder that holds the pickled url dictionary and one sub-folder of images per class.
IMAGE_DIR = "E:/Major Project 2022/downloaded_dataset"

Getting images from the Dermnet webpage

In [3]:
def openPg(url):
    """Opens a web page for parsing.

    Spaces in the address are percent-encoded before the request is made.

    Args:
        url: a web address.

    Returns:
        BeautifulSoup object to parse.
    """
    print("Url open")
    url = url.replace(" ", "%20")
    # Parse inside the context manager so the HTTP response is closed as soon
    # as the document has been read; a long crawl would otherwise leave one
    # open connection behind per page.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "lxml")
    return soup

getClassImages - A function that returns a list of all images for a given skin disease class.
getClassCategories - A function that returns a list of all subcategory links for a given skin disease class.


In [4]:
def getClassImages(class_url):
    """Gathers the image links of every category page under one class page.

    Args:
        class_url: a web address for a skin disease class.

    Returns:
        A list with the image links of all categories of the class, in the
        order the categories are returned by getClassCategories.
    """
    print(class_url)
    collected = []
    for category_url in getClassCategories(class_url):
        collected += getCategoryImages(category_url)
    return collected

def getClassCategories(class_url):
    """Returns all category urls for a skin disease class.

    The category links are the anchors inside the first table of the class
    page. Anchors without an href are ignored.

    Args:
        class_url: a web address for a skin disease class.

    Returns:
        categories: A list containing absolute category urls. Empty when the
        page has no table.
    """
    soup = openPg(class_url)
    table = soup.find("table")
    if table is None:
        # A page without the expected table should not abort the whole crawl.
        print("No category table found at %s" % class_url)
        return []

    categories = []
    for link in table.find_all("a"):
        href = link.get('href')
        if not href:
            # urljoin(base, None) returns the bare domain, which would then be
            # crawled as if it were a category page.
            continue
        categories.append(urljoin(DOMAIN_PAGE, href))
    return categories

getCategoryImages - Helper functions to get all images within a category.

For a given category, we parse through paginated pages of thumbnail images and add the full image path to a list.


In [5]:
def getCategoryImages(cat_url):
    """Collects the image urls of a category, following its paginated links.

    Args:
        cat_url: a category web address.

    Returns:
        A list containing the image urls found on the category page itself
        followed by those found on each paginated thumbnail page.
    """
    image_urls = []
    # images on the category landing page come first
    addPgImages(cat_url, image_urls)
    # then every additional thumbnail page (the list is empty when there are none)
    for page_url in getAllThumbPgs(cat_url):
        addPgImages(page_url, image_urls)
    return image_urls

def getAllThumbPgs(cat_url):
    """Lists the paginated links of a category page, if it has any.

    Args:
        cat_url: a category web address.

    Returns:
        A list of absolute addresses, one per anchor that is a direct child of
        the "pagination" div, excluding the anchor labelled 'Next'. Empty when
        the page has no such div.
    """
    soup = openPg(cat_url)
    pager = soup.find("div", "pagination")
    if not pager:
        # single-page category
        return []
    return [
        urljoin(DOMAIN_PAGE, node['href'])
        for node in pager
        if node.name == 'a' and node.string != 'Next'
    ]

def addPgImages(url, image_list):
    """Finds all image links in a webpage and adds them to the image list.

    Every div with class "thumbnails" is inspected; the src of its image has
    the text 'Thumb' removed to obtain the full-size image link. Thumbnail
    divs without an image or without a src attribute are skipped.

    Args:
        url: a web address for a paginated category page.
        image_list: a list of image urls; extended in place.

    Returns:
        Nothing.
    """
    soup = openPg(url)
    for thumb in soup.find_all("div", "thumbnails"):
        img_tag = thumb.img
        thumb_link = img_tag.get('src') if img_tag is not None else None
        if not thumb_link:
            # An empty or malformed thumbnail div must not abort the crawl.
            continue
        # use full image link instead of thumbnail link
        image_list.append(re.sub(r'Thumb', "", thumb_link))
            

The Dermnet webpage is structured as follows:

    Home Page (Root)
        Class Label Links
            Class Categories Links
                Paginated pages of images

We traverse through the webpage hierarchy and build a flattened dictionary of (class: list of image links)

In [6]:
#createImageDict - Function to load or create a dictionary of classes and their associated image links.

In [7]:
def createImageDict(dict_file):
    """Create image dictionary and serialize to disk (pickle). Unpickle to dictionary object if already exists.

    When dict_file exists and can be unpickled, its content is returned
    without touching the network. Otherwise the site is crawled starting at
    HOME_PAGE and the result is pickled to dict_file (creating its parent
    folder if needed). A failure to save is reported but does not discard the
    crawled dictionary.

    Args:
        dict_file: Absolute path + filename of pickle file object.

    Returns:
        image_dict: dictionary mapping each class name to its list of image urls.
    """
    # load dictionary object hierarchy if pickled file exists
    if os.path.exists(dict_file):
        print("Loading image dictionary %s" % dict_file)
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at a file written by this notebook.
        with open(dict_file, 'rb') as f:
            try:
                img_dict = pickle.load(f)
            except Exception:
                # Corrupt or truncated cache: fall through and rebuild it.
                # Exception (not a bare except) lets Ctrl-C propagate.
                print("Failure to load: %s. Creating dictionary. " % dict_file)
            else:
                print("Loaded image dictionary.")
                return img_dict

    #create dictionary by parsing Dermnet
    #open website root directory and get class links
    soup = openPg(HOME_PAGE)
    class_links = soup.find("table").find_all("a")

    print("Populating image dictionary...")
    img_dict = {}
    for link in class_links:
        print("class links")
        abs_link = urljoin(DOMAIN_PAGE, link.get('href'))
        print("class links1")
        # .string is None when the anchor wraps more than one child node;
        # fall back to the concatenated text so re.sub does not raise.
        label = link.string
        if label is None:
            label = link.get_text()
        class_name = re.sub(r'[^a-z0-9A-Z\s]+', '', label)
        #add to final dictionary {class_name: list of image links}
        print("class links 2")
        img_dict[class_name] = getClassImages(abs_link)
        print("class links 3")

    print("Image dictionary populated. Total classes: %s" % len(img_dict))

    #save dictionary to pickle file
    try:
        # Opening the file is part of the guarded section: a missing folder
        # must not throw away the result of the whole crawl.
        target_dir = os.path.dirname(dict_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(dict_file, 'wb') as f:
            pickle.dump(img_dict, f)
    except Exception:
        print("Failure to save dictionary %s. \nPlease investigate. " % dict_file)
    else:
        print("Saved image dictionary to %s" % dict_file)

    return img_dict

##Main Function

In [8]:
## load existing class-to-image url dictionary, or scrape website.
img_dictionary = createImageDict(os.path.join(IMAGE_DIR,'imageUrls.p'))

## Downloading pictures from dictionary
# Each class gets its own folder under IMAGE_DIR; images already present on
# disk are skipped so the cell can be re-run to resume an interrupted download.
for key, class_imgs in img_dictionary.items():

    print("Processing class: %s" %key)

    #create class folders, if it doesn't exist
    class_path = os.path.join(IMAGE_DIR,key)
    if not os.path.exists(class_path):
        print("Creating dir in: %s" %class_path)
        # makedirs also creates IMAGE_DIR itself when it is missing
        os.makedirs(class_path)
    else:
        print("Found Dir")

    #check if more images to be added to class dir
    num_dirImgs = len([name for name in os.listdir(class_path) if os.path.isfile(os.path.join(class_path, name))])
    if num_dirImgs != len(class_imgs):
        # enumerate gives the true position; list.index() is O(n) per image
        # and returns the first match when a url occurs more than once.
        for position, img in enumerate(class_imgs):

            img_name = os.path.basename(img)
            file_name = os.path.join(class_path,img_name)

            if os.path.isfile(file_name):
                print("Skipping: " + img_name + " has already downloaded.")
                continue

            #download image
            if position % 10 == 0:
                print("Downloading image #%s: %s" %(position,img_name))

            img_url = img.replace(" ","%20")
            print(img_url)
            try:
                # Read the whole image first so a failed request does not
                # leave an empty file that a later run would skip.
                with urlopen(img_url) as response:
                    data = response.read()
                with open(file_name, 'wb') as out_file:
                    out_file.write(data)
            except Exception as err:
                # Keep going with the next image, but show the real cause;
                # Exception (not a bare except) lets Ctrl-C stop the run.
                print("Error in Parsing this image, due to url issue")
                print("  %s" % err)

    print("Download complete for: %s" %key)

print("Scraping Complete.")

Loading image dictionary E:/Major Project 2022/downloaded_dataset\imageUrls.p
Loaded image dictionary.
Processing class: Acne and Rosacea Photos
Found Dir
Skipping: acne-closed-comedo-002.jpg has already downloaded.
Skipping: acne-closed-comedo-003.jpg has already downloaded.
Skipping: acne-closed-comedo-1.jpg has already downloaded.
Skipping: acne-closed-comedo-1.jpg has already downloaded.
Skipping: acne-closed-comedo-10.jpg has already downloaded.
Skipping: acne-closed-comedo-11.jpg has already downloaded.
Skipping: acne-closed-comedo-12.jpg has already downloaded.
Skipping: acne-closed-comedo-13.jpg has already downloaded.
Skipping: acne-closed-comedo-14.jpg has already downloaded.
Skipping: acne-closed-comedo-15.jpg has already downloaded.
Skipping: acne-closed-comedo-16.jpg has already downloaded.
Skipping: acne-closed-comedo-17.jpg has already downloaded.
Skipping: acne-closed-comedo-18.jpg has already downloaded.
Skipping: acne-closed-comedo-19.jpg has already downloaded.
Skipp

Skipping: perioral-dermatitis-94.jpg has already downloaded.
Skipping: perioral-dermatitis-95.jpg has already downloaded.
Skipping: perioral-dermatitis-96.jpg has already downloaded.
Skipping: perioral-dermatitis-97.jpg has already downloaded.
Skipping: perioral-dermatitis-98.jpg has already downloaded.
Skipping: perioral-dermatitis-99.jpg has already downloaded.
Skipping: anal-Comedones-1.jpg has already downloaded.
Skipping: anal-Comedones-2.jpg has already downloaded.
Skipping: fordyce-spots-1.jpg has already downloaded.
Skipping: fordyce-spots-2.jpg has already downloaded.
Skipping: fordyce-spots-3.jpg has already downloaded.
Skipping: fordyce-spots-4.jpg has already downloaded.
Skipping: fordyce-spots-5.jpg has already downloaded.
Skipping: fordyce-spots-6.jpg has already downloaded.
Skipping: fordyce-spots-7.jpg has already downloaded.
Skipping: fordyce-spots-lip.jpg has already downloaded.
Skipping: Sebaceous-glands-Areola-1.jpg has already downloaded.
Skipping: Sebaceous-glands

Skipping: 05atopicFlid080105.jpg has already downloaded.
Skipping: 05atopicFoot0801051.jpg has already downloaded.
Skipping: 05atopicFoot0801052.jpg has already downloaded.
Skipping: 05AtopicFossa.jpg has already downloaded.
Skipping: 05AtopicFossa1.jpg has already downloaded.
Skipping: 05AtopicFossa1q.jpg has already downloaded.
Skipping: 05AtopicFossa2q.jpg has already downloaded.
Skipping: 05AtopicHand.jpg has already downloaded.
Skipping: 05AtopicHand1.jpg has already downloaded.
Skipping: 05AtopicHand1q.jpg has already downloaded.
Skipping: 05AtopicHand2.jpg has already downloaded.
Skipping: 05AtopicHand3.jpg has already downloaded.
Skipping: 05AtopicHand4.jpg has already downloaded.
Skipping: 05AtopicHandq.jpg has already downloaded.
Skipping: 05atopicKnee080105.jpg has already downloaded.
Skipping: 05atopicKnee0801051.jpg has already downloaded.
Skipping: 05atopicKnee0801052.jpg has already downloaded.
Skipping: 05AtopicLichenification.jpg has already downloaded.
Skipping: 05ato

Found Dir
Skipping: AIDS-1.jpg has already downloaded.
Skipping: AIDS-11.jpg has already downloaded.
Skipping: AIDS-12.jpg has already downloaded.
Skipping: AIDS-14.jpg has already downloaded.
Skipping: AIDS-15.jpg has already downloaded.
Skipping: AIDS-16.jpg has already downloaded.
Skipping: AIDS-17.jpg has already downloaded.
Skipping: AIDS-19.jpg has already downloaded.
Skipping: AIDS-2.jpg has already downloaded.
Skipping: AIDS-20.jpg has already downloaded.
Skipping: AIDS-21.jpg has already downloaded.
Skipping: AIDS-22.jpg has already downloaded.
Skipping: AIDS-23.jpg has already downloaded.
Skipping: AIDS-25.jpg has already downloaded.
Skipping: AIDS-26.jpg has already downloaded.
Skipping: AIDS-27.jpg has already downloaded.
Skipping: AIDS-28.jpg has already downloaded.
Skipping: AIDS-29.jpg has already downloaded.
Skipping: AIDS-3.jpg has already downloaded.
Skipping: AIDS-31.jpg has already downloaded.
Skipping: AIDS-35.jpg has already downloaded.
Skipping: AIDS-36.jpg has a

Download complete for: Nail Fungus and other Nail Disease
Processing class: Poison Ivy Photos and other Contact Dermatitis
Found Dir
Download complete for: Poison Ivy Photos and other Contact Dermatitis
Processing class: Psoriasis pictures Lichen Planus and related diseases
Found Dir
Skipping: Axillary-Granular-Parakeratosis-1.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-2.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-3.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-4.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-5.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-6.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-7.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-8.jpg has already downloaded.
Skipping: Axillary-Granular-Parakeratosis-9.jpg has already downloaded.
Skipping: erythrokeratodermia-variabilis-1.jpg has already download

Skipping: pityriasis-rosea-172.jpg has already downloaded.
Skipping: pityriasis-rosea-173.jpg has already downloaded.
Skipping: pityriasis-rosea-174.jpg has already downloaded.
Skipping: pityriasis-rosea-175.jpg has already downloaded.
Skipping: pityriasis-rosea-176.jpg has already downloaded.
Skipping: pityriasis-rosea-177.jpg has already downloaded.
Skipping: pityriasis-rosea-178.jpg has already downloaded.
Skipping: pityriasis-rosea-179.jpg has already downloaded.
Skipping: pityriasis-rosea-18.jpg has already downloaded.
Skipping: pityriasis-rosea-180.jpg has already downloaded.
Skipping: pityriasis-rosea-181.jpg has already downloaded.
Skipping: pityriasis-rosea-182.jpg has already downloaded.
Skipping: pityriasis-rosea-183.jpg has already downloaded.
Skipping: pityriasis-rosea-184.jpg has already downloaded.
Skipping: pityriasis-rosea-185.jpg has already downloaded.
Skipping: pityriasis-rosea-186.jpg has already downloaded.
Skipping: pityriasis-rosea-187.jpg has already downloaded

Skipping: 08sebKerm1122046.jpg has already downloaded.
Skipping: 08sebKerm11220467.jpg has already downloaded.
Skipping: 08sebKerm11220468.jpg has already downloaded.
Skipping: 08sebKerm1122049.jpg has already downloaded.
Skipping: 08TineaAmiantacea.jpg has already downloaded.
Skipping: 08TineaAmiantacea1.jpg has already downloaded.
Skipping: 13tineaAmiantacea082106.jpg has already downloaded.
Skipping: 13tineaAmiantacea0821061.jpg has already downloaded.
Skipping: seborrheic-dermatitis-10.jpg has already downloaded.
Skipping: seborrheic-dermatitis-100.jpg has already downloaded.
Skipping: seborrheic-dermatitis-101.jpg has already downloaded.
Skipping: seborrheic-dermatitis-102.jpg has already downloaded.
Skipping: seborrheic-dermatitis-103.jpg has already downloaded.
Skipping: seborrheic-dermatitis-104.jpg has already downloaded.
Skipping: seborrheic-dermatitis-106.jpg has already downloaded.
Skipping: seborrheic-dermatitis-107.jpg has already downloaded.
Skipping: seborrheic-dermatit

Skipping: epidermal-nevus-51.jpg has already downloaded.
Skipping: epidermal-nevus-52.jpg has already downloaded.
Skipping: epidermal-nevus-53.jpg has already downloaded.
Skipping: epidermal-nevus-54.jpg has already downloaded.
Skipping: epidermal-nevus-55.jpg has already downloaded.
Skipping: epidermal-nevus-56.jpg has already downloaded.
Skipping: epidermal-nevus-57.jpg has already downloaded.
Skipping: epidermal-nevus-58.jpg has already downloaded.
Skipping: epidermal-nevus-59.jpg has already downloaded.
Skipping: epidermal-nevus-6.jpg has already downloaded.
Skipping: epidermal-nevus-60.jpg has already downloaded.
Skipping: epidermal-nevus-61.jpg has already downloaded.
Skipping: epidermal-nevus-62.jpg has already downloaded.
Skipping: epidermal-nevus-63.jpg has already downloaded.
Skipping: epidermal-nevus-64.jpg has already downloaded.
Skipping: epidermal-nevus-65.jpg has already downloaded.
Skipping: epidermal-nevus-66.jpg has already downloaded.
Skipping: epidermal-nevus-67.jpg

Skipping: skin-tags-polyps-70.jpg has already downloaded.
Skipping: skin-tags-polyps-71.jpg has already downloaded.
Skipping: skin-tags-polyps-72.jpg has already downloaded.
Skipping: skin-tags-polyps-73.jpg has already downloaded.
Skipping: skin-tags-polyps-74.jpg has already downloaded.
Skipping: skin-tags-polyps-75.jpg has already downloaded.
Skipping: skin-tags-polyps-76.jpg has already downloaded.
Skipping: skin-tags-polyps-77.jpg has already downloaded.
Skipping: skin-tags-polyps-78.jpg has already downloaded.
Skipping: skin-tags-polyps-79.jpg has already downloaded.
Skipping: skin-tags-polyps-8.jpg has already downloaded.
Skipping: skin-tags-polyps-80.jpg has already downloaded.
Skipping: skin-tags-polyps-81.jpg has already downloaded.
Skipping: skin-tags-polyps-82.jpg has already downloaded.
Skipping: skin-tags-polyps-83.jpg has already downloaded.
Skipping: skin-tags-polyps-84.jpg has already downloaded.
Skipping: skin-tags-polyps-85.jpg has already downloaded.
Skipping: skin-

Skipping: candidiasis-diaper-14.jpg has already downloaded.
Skipping: candidiasis-diaper-15.jpg has already downloaded.
Skipping: candidiasis-diaper-16.jpg has already downloaded.
Skipping: candidiasis-diaper-17.jpg has already downloaded.
Skipping: candidiasis-diaper-18.jpg has already downloaded.
Skipping: candidiasis-diaper-19.jpg has already downloaded.
Skipping: candidiasis-diaper-2.jpg has already downloaded.
Skipping: candidiasis-diaper-20.jpg has already downloaded.
Skipping: candidiasis-diaper-21.jpg has already downloaded.
Skipping: candidiasis-diaper-22.jpg has already downloaded.
Skipping: candidiasis-diaper-23.jpg has already downloaded.
Skipping: candidiasis-diaper-24.jpg has already downloaded.
Skipping: candidiasis-diaper-25.jpg has already downloaded.
Skipping: candidiasis-diaper-26.jpg has already downloaded.
Skipping: candidiasis-diaper-27.jpg has already downloaded.
Skipping: candidiasis-diaper-28.jpg has already downloaded.
Skipping: candidiasis-diaper-29.jpg has a

Skipping: tinea-hand-dorsum-32.jpg has already downloaded.
Skipping: tinea-hand-dorsum-33.jpg has already downloaded.
Skipping: tinea-hand-dorsum-34.jpg has already downloaded.
Skipping: tinea-hand-dorsum-35.jpg has already downloaded.
Skipping: tinea-hand-dorsum-36.jpg has already downloaded.
Skipping: tinea-hand-dorsum-37.jpg has already downloaded.
Skipping: tinea-hand-dorsum-38.jpg has already downloaded.
Skipping: tinea-hand-dorsum-39.jpg has already downloaded.
Skipping: tinea-hand-dorsum-4.jpg has already downloaded.
Skipping: tinea-hand-dorsum-40.jpg has already downloaded.
Skipping: tinea-hand-dorsum-41.jpg has already downloaded.
Skipping: tinea-hand-dorsum-42.jpg has already downloaded.
Skipping: tinea-hand-dorsum-43.jpg has already downloaded.
Skipping: tinea-hand-dorsum-44.jpg has already downloaded.
Skipping: tinea-hand-dorsum-45.jpg has already downloaded.
Skipping: tinea-hand-dorsum-46.jpg has already downloaded.
Skipping: tinea-hand-dorsum-47.jpg has already downloaded

# All images from the Dermnet website are saved in downloaded_dataset via web scraping