In [1]:
import sys
PROJECT_DIR='/home/ericp/tree_canopy_fcn/repo'
sys.path.append(PROJECT_DIR)
from importlib import reload

In [2]:
import re
import pandas as pd
import image_kit.io as io
import mproc

---

In [3]:
LIDAR_SUBFOLDER='USGS_LPC_CA_LosAngeles_2016_LAS_2018'
DSET_PATH=f'{PROJECT_DIR}/datasets/los_angeles-naip-lidar_USGS_LPC_CA_LosAngeles_2016_LAS_2018.STATS.csv'
MAX_BLACK_PIXEL=512*4

---

In [4]:
def lidar_path(rgbn_path,lidar_subfolder=LIDAR_SUBFOLDER,lidar_prefix='hag'):
    parts=rgbn_path.split('/')
    root="/".join(parts[:-1])
    root=re.sub('naip','lidar',root)
    if lidar_subfolder:
        root=f'{root}/{lidar_subfolder}'
    name=parts[-1]
    name=re.sub('naip',lidar_prefix,name)
    name=re.sub('_20[0-9]{2}-','-',name)
    return f'{root}/{name}'


def dset_type(rgbn_path):
    name=rgbn_path.split('-')[-1]
    return re.sub('.tif$','',name)


def stat_row(row_dict):
    r=row_dict.copy()
    im=io.read(row_dict['rgbn_path'],return_profile=False)
    r['means']=im.mean(axis=(1,2))
    r['stdevs']=im.std(axis=(1,2))
    r['black_pixel_count']=(im[:3].sum(axis=0)==0).sum()
    try:
        im=io.read(row_dict['hag_path'],return_profile=False)
        r['lidar_shape']=im.shape
    except:
        r['lidar_shape']=None
    return r

---

In [5]:
!ls /DATA/download-csvs/*.csv

/DATA/download-csvs/naip-los_angeles.csv


In [6]:
dsets_df=pd.read_csv('/DATA/download-csvs/naip-los_angeles.csv')
dsets_df.columns=['tile_key', 'year', 'rgbn_path', 'error', 'error_msg']
dsets_df['hag_path']=dsets_df.rgbn_path.apply(lidar_path)
dsets_df['dset_type']=dsets_df.rgbn_path.apply(dset_type)
dsets_df.shape[0]

3387

---

In [7]:
dsets_dicts=dsets_df.to_dict('records')

In [8]:
%time out=mproc.map_with_threadpool(stat_row,dsets_dicts,max_processes=64)
df=pd.DataFrame(out)
print(df.shape)

CPU times: user 1min 4s, sys: 8.17 s, total: 1min 12s
Wall time: 17.6 s
(3387, 11)


---

In [9]:
test=df.lidar_shape.isna()
print('NB Missing Lidar:',df[test].shape[0])
df=df[~test]
print('=>',df.shape)

NB Missing Lidar: 31
=> (3356, 11)


In [10]:
test=df.lidar_shape==(1, 512, 512)
print('NB Bad Lidar Shape:',df[test].shape[0])
df=df[~test]
print('=>',df.shape)

NB Bad Lidar Shape: 3346
=> (10, 11)


In [11]:
test=(df.black_pixel_count>MAX_BLACK_PIXEL)
print('NB BLACK > MAX BLACK PIXS:',df[test].shape[0])
df=df[~test]
print('=>',df.shape)

NB BLACK > MAX BLACK PIXS: 0
=> (10, 11)


---

In [12]:
COLS=[
    'dset_type',
    'tile_key',
    'year',
    'means',
    'stdevs',
    'black_pixel_count',
    'rgbn_path',
    'hag_path']
df=df[COLS]
df.to_csv(
    DSET_PATH,
    index=False)

---

In [13]:
print(f'MEANS={df.means.mean(axis=0).tolist()}')
print(f'STDEVS={df.stdevs.mean(axis=0).tolist()}')

MEANS=[97.33728790283203, 93.58218002319336, 89.59181938171386, 109.35328521728516]
STDEVS=[32.67245474001072, 28.142484246235437, 24.125673156344252, 21.711092083738954]


---

In [14]:
df.sample(3)

Unnamed: 0,dset_type,tile_key,year,means,stdevs,black_pixel_count,rgbn_path,hag_path
2248,train,480:16:1.0:11:-270:7778,2016,"[46.63837814331055, 53.684268951416016, 59.593...","[37.96750196540931, 32.015391039285596, 25.495...",0,/DATA/imagery/los_angeles/v1/naip/naip_480:16:...,/DATA/imagery/los_angeles/v1/lidar/USGS_LPC_CA...
3033,valid,480:16:1.0:11:-144:7842,2016,"[135.64130020141602, 132.26757049560547, 127.1...","[42.67174082074695, 39.28968447695191, 36.4302...",0,/DATA/imagery/los_angeles/v1/naip/naip_480:16:...,/DATA/imagery/los_angeles/v1/lidar/USGS_LPC_CA...
1470,train,480:16:1.0:11:-155:7825,2016,"[110.82851028442383, 100.07578659057617, 91.69...","[25.161990113250905, 20.326465615577472, 15.52...",0,/DATA/imagery/los_angeles/v1/naip/naip_480:16:...,/DATA/imagery/los_angeles/v1/lidar/USGS_LPC_CA...
