In [None]:
# Mount Google Drive inside the Colab runtime. The input/output directories
# used later in this notebook live under /content/drive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted -- confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
import glob
import subprocess
import pandas as pd

from concurrent.futures import as_completed, ThreadPoolExecutor, ProcessPoolExecutor
from tqdm.notebook import tqdm

In [None]:
# Root directory searched for the subtile GeoTIFFs that still need merging
# (globbed as <root>/<state>/naip/<year>/*.tif in the driver cells).
IN_DIR_ROOT = '/content/drive/Shareddrives/stand_mapping/data/interim/training_tiles'
# Root directory for merged tiles; the driver cells also glob it to find
# tiles that were already produced. merge_subtiles() keeps its own local copy
# of this same path.
OUT_DIR_ROOT = '/content/drive/Shareddrives/stand_mapping/data/processed/training_tiles'

In [None]:
def merge_subtiles(tile_id, state, year):
    """Merge every subtile of one tile into a single GeoTIFF via gdal_merge.py.

    The subtile paths are looked up in the notebook-level ``subtiles``
    DataFrame (indexed by tile id, with a ``path`` column). The merged raster
    is written to
    ``<OUT_DIR_ROOT>/<state.lower()>/naip/<year>/<tile_id>_naip_<year>.tif``.

    Parameters
    ----------
    tile_id : str
        Index label in ``subtiles`` identifying the tile to merge.
    state : str
        State name; lower-cased to build the output sub-directory.
    year : int or str
        Year, used in both the output directory and the output file name.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the ``gdal_merge.py`` call, with stdout and stderr captured.
        A non-zero return code is reported with a printed message rather than
        an exception.

    Raises
    ------
    KeyError
        If ``tile_id`` is not present in the index of ``subtiles``.
    """
    # Local copy shadows the notebook-level constant of the same name.
    OUT_DIR_ROOT = '/content/drive/Shareddrives/stand_mapping/data/processed/training_tiles'

    # Select with a *list* label so the result is always a Series of paths.
    # With a scalar label, a tile that has exactly one subtile comes back as a
    # single row and ['path'] is then a plain str, which has no `.values`.
    infiles = subtiles.loc[[tile_id], 'path'].tolist()

    outfile = '_'.join([tile_id, 'naip', str(year)]) + '.tif'
    outdir = os.path.join(OUT_DIR_ROOT, state.lower(), 'naip', str(year))
    # Create the destination directory up front; exist_ok keeps this safe when
    # several worker threads target the same directory.
    os.makedirs(outdir, exist_ok=True)
    outpath = os.path.join(outdir, outfile)

    # Remember whether an output was already there so a failure never deletes
    # a file this call did not create.
    existed_before = os.path.exists(outpath)

    proc = subprocess.run(
        [
            'gdal_merge.py',
            '-o', outpath,
            '-co', 'TILED=YES',
            '-co', 'BLOCKXSIZE=256',
            '-co', 'BLOCKYSIZE=256',
            '-co', 'COMPRESS=LZW',
            *infiles,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if proc.returncode != 0:
        print(f'{tile_id} had an error.')
        # A failed merge may leave a partially written file behind. The driver
        # cells treat any *.tif in the output directory as "already run", so
        # remove it to let a later run retry this tile.
        if not existed_before and os.path.exists(outpath):
            try:
                os.remove(outpath)
            except OSError as err:
                print(f'{tile_id}: could not remove partial output ({err}).')
    return proc

In [None]:
def parallel_merge(to_run, workers=8):
    """Run ``merge_subtiles`` for many tiles concurrently in a thread pool.

    Parameters
    ----------
    to_run : sequence of tuple
        One argument tuple per job; each is unpacked into ``merge_subtiles``.
    workers : int, optional
        Passed to ``ThreadPoolExecutor`` as the worker count (default 8).

    Returns
    -------
    list or None
        The ``merge_subtiles`` return values in completion order, or ``None``
        when ``to_run`` is empty.
    """
    job_count = len(to_run)
    print('\n', 'Merging {:,d} tiles'.format(job_count))

    # Nothing to do: skip creating the pool and the progress bar.
    if not job_count:
        return

    with ThreadPoolExecutor(workers) as pool:
        futures = [pool.submit(merge_subtiles, *args) for args in to_run]
        # as_completed yields futures as they finish, so the progress bar
        # advances once per finished job.
        progress = tqdm(as_completed(futures), total=len(futures))
        finished = [future.result() for future in progress]
    return finished

In [None]:
STATE = 'oregon'
# Merge the subtiles for each year, skipping tiles whose merged output
# already exists so the cell can be re-run to resume interrupted work.
for YEAR in [2009, 2011, 2014, 2016]:
    tifs_to_merge = glob.glob(os.path.join(IN_DIR_ROOT, STATE, 'naip', str(YEAR), '*.tif'))
    already_run_tifs = glob.glob(os.path.join(OUT_DIR_ROOT, STATE, 'naip', str(YEAR), '*.tif'))
    # The tile id is the part of the file name before the first underscore.
    cell_ids = [os.path.basename(x).split('_')[0] for x in tifs_to_merge]
    # A set gives constant-time membership tests in the filter below.
    already_run_cell_ids = {os.path.basename(x).split('_')[0] for x in already_run_tifs}
    # merge_subtiles() reads this global to look up the subtile paths of a tile.
    subtiles = pd.DataFrame(data=tifs_to_merge, index=cell_ids, columns=['path'])
    # Order-preserving de-duplication; pd.unique() on a plain list is
    # deprecated in recent pandas releases.
    unique_tiles = list(dict.fromkeys(cell_ids))

    to_run = [(cell_id, STATE, YEAR) for cell_id in unique_tiles if cell_id not in already_run_cell_ids]
    print(YEAR, end='. ')
    res = parallel_merge(to_run)

2009. 
 Merging 0 tiles
2011. 
 Merging 0 tiles
2014. 
 Merging 0 tiles
2016. 
 Merging 0 tiles


In [None]:
STATE = 'washington'
# Merge the subtiles for each year, skipping tiles whose merged output
# already exists so the cell can be re-run to resume interrupted work.
for YEAR in [2009, 2011, 2015, 2017]:
    tifs_to_merge = glob.glob(os.path.join(IN_DIR_ROOT, STATE, 'naip', str(YEAR), '*.tif'))
    already_run_tifs = glob.glob(os.path.join(OUT_DIR_ROOT, STATE, 'naip', str(YEAR), '*.tif'))
    # The tile id is the part of the file name before the first underscore.
    cell_ids = [os.path.basename(x).split('_')[0] for x in tifs_to_merge]
    # A set gives constant-time membership tests in the filter below.
    already_run_cell_ids = {os.path.basename(x).split('_')[0] for x in already_run_tifs}
    # merge_subtiles() reads this global to look up the subtile paths of a tile.
    subtiles = pd.DataFrame(data=tifs_to_merge, index=cell_ids, columns=['path'])
    # Order-preserving de-duplication; pd.unique() on a plain list is
    # deprecated in recent pandas releases.
    unique_tiles = list(dict.fromkeys(cell_ids))

    to_run = [(cell_id, STATE, YEAR) for cell_id in unique_tiles if cell_id not in already_run_cell_ids]
    print(YEAR, end='. ')
    res = parallel_merge(to_run)

2009. 
 Merging 3 tiles


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


2011. 
 Merging 0 tiles
2015. 
 Merging 0 tiles
2017. 
 Merging 3 tiles


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


