# Opengameart.org CC0 Image Download

This notebook downloads and cleans data from opengameart.org.

#### Step 1: Get Collection Links
#### Step 2: Get File Links
#### Step 3: Download Files
#### Step 4: Copy and Rename Files

In [1]:
import requests, bs4, os, json, zipfile, io, cv2, numpy as np
from statistics import mean
from collections import Counter
from datetime import datetime
from distutils.dir_util import copy_tree

In [47]:
def est_time_remaining(i, total, times, last_time, a='Page'):
    """
    Calculates the time remaining on a function given time and iteration count.
    Parameters:
        i: iteration number
        total: total iterations
        times: list of run times
        last_time: last datetime
        a: things iterated over
    Returns:
        times, now
    """
    now = datetime.now()
    times.append((now-last_time).microseconds/1000000)
    avg_time = round(mean(times), 2)
    ETA = round(avg_time*(total-i))  
    print("{}/{} {}(s) Scraped. Avg. Time: {}. Est. Remaining: {}".format(i, total, a, avg_time, ETA), end='\r')
    return times, now

## Step 1: Get Collection Links

In [3]:
def get_links(dim, pages=2):
    """
    Gets every collection link from opengameart.org
    parameters:
        dim: dimension integer
        pages: number of pages o scrape
    Returns:
        list of links
    """
    times = []; last_time = datetime.now()
    if 'data_1.json' not in os.listdir():
        if dim == 2:
            base = "https://opengameart.org/art-search-advanced?keys=&title=&field_art_tags_tid_op=or&field_art_tags_tid=&name=&field_art_type_tid%5B%5D=9&field_art_licenses_tid%5B%5D=4&sort_by=count&sort_order=DESC&items_per_page=144&Collection=&page={}"
        if dim == 3:
            base = "https://opengameart.org/art-search-advanced?keys=&title=&field_art_tags_tid_op=or&field_art_tags_tid=&name=&field_art_type_tid%5B%5D=10&field_art_licenses_tid%5B%5D=4&sort_by=count&sort_order=DESC&items_per_page=144&Collection=&page={}"
        links_all = []
        for page in list(range(pages)):
            r = requests.get(base.format(str(page)))
            if r.status_code==200:
                soup = bs4.BeautifulSoup(r.content, 'lxml')
                links = []
                for s in soup.find_all('div', {'class':'field-item even'}):
                    try:
                        href = s.find('a')['href']
                        if '/content' in href:
                            links.append(href)
                    except:
                        pass
                links_all+=links; links_all=list(set(links_all))
            times, last_time = est_time_remaining(page+1, pages, times, last_time)
    return links_all

In [4]:
links_all = get_links(2)
len(links_all)

2/2 Page(s) Scraped. Avg. Time: 0.58. Est. Remaining: 0

289

## Step 2: Get File Links

In [15]:
def get_file_links(links_all):
    """For each collection, gets the links of the files to download
    Parameters:
        links_all: list of collection links
    Returns:
        list of file links
    """
    files = []; i = 1
    total = len(links_all); times = []; last_time=datetime.now()
    for link in links_all:
        base='https://opengameart.org'
        try:
            r=requests.get(base+link)
            if r.status_code == 200:
                soup = bs4.BeautifulSoup(r.content, 'lxml')
                try:
                    file_path = soup.find('span', {'class':'file'}).find('a')['href']
                    files.append([link, file_path])
                except:
                    pass
                try:
                    for div in soup.find('div', {'class':'group-right right-column'}).find('div', {'class':'field-items'}).find_all('div'):
                        files.append([link,div.find('a')['href']])
                except:
                    pass
        except:
            pass
        times, last_time = est_time_remaining(i, total, times, last_time); i+=1
    return files

In [19]:
file_links = get_file_links(links_all[:3])
len(file_links)

3/3 Page(s) Scraped. Avg. Time: 0.54. Est. Remaining: 0

14

## Step 3: Download Files

In [50]:
def download_files(files):
    """Downloads every file to the local directory (requires about 10gb storage for all collections)
    Parameters:
        files: list of files to download
    Returns:
        None
    """
    try:
        os.makedirs('SpriteFiles')
    except:
        return "Directory Exists"
    len(links_all); times = []; last_time=datetime.now()
    i=1; l=len(files)
    for file_pair in files:
        if 'zip' in file_pair[-1]:
            try:
                os.makedirs('SpriteFiles/'+file_pair[0].split('/')[-1])
            except:
                pass
            try:
                r = requests.get(file_pair[-1])
                z = zipfile.ZipFile(io.BytesIO(r.content))
                z.extractall('SpriteFiles/'+file_pair[0].split('/')[-1])
            except:
                print(file_pair[-1])
                
        if 'png' in file_pair[-1]:
            try:
                os.makedirs('SpriteFiles/'+file_pair[0].split('/')[-1])
            except:
                pass
            
            try:
                r = requests.get(file_pair[-1], stream=True)
                if r.status_code == 200:
                    with open('SpriteFiles/'+file_pair[0].split('/')[-1]+'/test.png', 'wb') as f:
                        for chunk in r:
                            f.write(chunk)
            except:
                pass

            
        times, last_time = est_time_remaining(i, l, times, last_time, a='Files'); i+=1

In [52]:
download_files(file_links[:2])

4/2 Files(s) Scraped. Avg. Time: 0.63. Est. Remaining: -1

## Step 4: Copy and Rename Files  to be Sorted

In [None]:
# note your mappings may differ due to newly created collections. 
# use the mapping file for categorizing the sprites (if desired)
def copy_rename():
    """This function takes the downloaded raw sprite files and moves them into another folder.
    Then, it renames the files based on an index (lots of duplicates like sword). 
    Only use this function before sorting into categories for the first time. 
    Parameters:
        None
    Returns:
        None  
    """
    try:
        os.makedirs('SpriteFiles2D')
        fromDirectory = "SpriteFiles2DBase"
        toDirectory = "SpriteFiles2D"
        copy_tree(fromDirectory, toDirectory)

        ind = 0; key = {}
        for root_folder in os.listdir('SpriteFiles2D'):
            os.makedirs('RenamedSprites\\'+root_folder)
            for r, d, filenames in os.walk('SpriteFiles2D\\'+root_folder):
                for filename in filenames:
                    old_path = os.path.join(r,filename)
                    if '.png' in filename:
                        new_path = "RenamedSprites\\"+root_folder+"\\"+f"{ind:05d}"+".png"
                        os.rename(old_path, new_path)

                        key[old_path] = f"{ind:05d}"

                        ind+=1
                    else:
                        os.remove(old_path)
                        
        inv_key =  {v: k for k, v in key.items()}
        mappings = {'filepath_id':key, 'id_filepath':inv_key}
        
        with open('mappings.json', 'w') as outfile:
            json.dump(mappings, outfile)
    except:
        pass