In [1]:
import pandas as pd
#import geopandas as gpd
import rasterio as rio
from rasterio.plot import show
from rasterio.warp import transform
from rasterio.transform import (xy, rowcol)
from rasterio.windows import Window
import itertools as it
import rioxarray as rx
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import time
import timeit


In [None]:
xs = np.array([3327493.433, 16177493.43])
ys = np.array([7389201.61, -580798.392])

with rio.open("data/TCL_DD_2022_20230407.tif") as src:

    pt = src.xy(0, 0) # latitude is N/S AKA y, 

    print(f'Number of bands: {src.count}')
    print(f'Image resolution: ({src.height}, {src.width})')
    print(f'Coordinate Reference System: ({src.crs}')

    fig, ax = plt.subplots()

    extent = [src.bounds[0], src.bounds[2], src.bounds[1], src.bounds[3]]
    ax = show(src, extent=extent, ax=ax, cmap="pink")

    # https://epsg.io/4326
    # WGS 84 -- WGS84 - World Geodetic System 1984, used in GPS

    # 18.18517 55.56466
    # 29.766234, 69.421527 (latitude, longitude) pakistan
    # -15.28575, -55.94627 (latitude, longitude) brazil
    lat = -15.28575
    long = -55.94627

    to_crs = src.crs
    from_crs = rio.crs.CRS.from_epsg(4326)
    x, y = transform(from_crs, to_crs, [long], [lat])

    # expect 1 - 5... but mention is made of Zero or Minor Loss
    # 1. CommodityDriven.Deforestation
    # 2. Shifting.Agriculture
    # 3. TreeFarm.ForestryOther
    # 4. Wildfire
    # 5. Urban

    for val in src.sample([(x, y)]): 
        print(val)
    

In [None]:
xr.set_options(display_expand_attrs=False, display_expand_data=False)
ds = xr.tutorial.load_dataset("air_temperature")



In [None]:
print(ds)
ds.air.plot()

In [None]:
# rasterio.features.dataset_features # works on raster value rather than mask...
# data.attrs["units"] = "metres/sec"
# rioxarray combines xarray and rasterio similar to how geopandas combines functionality from pandas and fiona
# and if you want to connect between geopandas and xarray, you can use geocube...

lat = -22.00027
long = -58.99658

# we can convert to and from pixel data with xy and rowcol respectively
# nearest neighbout in raster space would be the shortest distance in pixel space
# for certain values
# convert query from GPS to rowcol
# mask values not of interest... find rowcol of closest
# convert from rowcol to GPS

# shapely.ops.nearest_points(geom1, geom2)
# but we use rowcol values that can then be converted back to xy AKA GPS
# this way we would have 40000 x 40000 Point rather than pixel values
# distance calculation on projection may be distorted...
# use ‘haversine’ distance



with rx.open_rasterio('data/Hansen_GFC-2022-v1.10_lossyear_20S_060W.tif').squeeze() as xda:
    
    #print(xda)
    
    #xda.rio.write_crs(3347, inplace=True)
    #print(xda.spatial_ref)

    #xda.rio.set_spatial_dims("lon", "lat", inplace=True)

    # band is lossyear
    # x is long
    # y is lat

    to_crs = xda.rio.crs
    from_crs = rio.crs.CRS.from_epsg(4326)
    x, y = transform(from_crs, to_crs, [long], [lat])

    # If you use "projected coordinate system", no problem. The distance that 
    # you get is the distance on the map (not on the spherical earth). When 
    # using "geographic coordinate system - GCS", the distance that you get 
    # will be the shortest distance in 3D space.

    # This uses the ‘haversine’ formula to calculate the great-circle distance
    # between two points – that is, the shortest distance over the earth’s surface
    #  – giving an ‘as-the-crow-flies’ distance between the points (ignoring any 
    # hills they fly over, of course!).

    #xarray_distance.data = np.sqrt((lat - spec_lat)[:,None]**2 + ((lon - spec_lon)**2)
    
    # get value from grid
    value = xda.sel(x=x[0], y=y[0], method="nearest")
    # ... is unspecified or infinite number of arguments!
    print(value)

    # An empty (tuple) index is a full scalar index into a zero dimensional array. x[()] returns a scalar
    # if x is zero dimensional and a view otherwise. On the other hand x[...] always returns a view.

    # When an ellipsis (...) is present but has no size (i.e. replaces zero :) the 
    # result will still always be an array. A view if no advanced index is present, otherwise a copy.

    #value.values[()]





In [2]:
# We want to grab treecover2000 per asset/location, and pixels per lossyear
# The result is a simple DataFrame as we do not use shapely geometry...

lat = -22.00027
long = -58.99658
offset = 16 # 16 * 2 + 1 == 33, odd number allows for single pixel centre, sides are close to 1km

# The term "kernel" in the context of convolutional neural networks (CNNs) comes from image processing 
# and mathematics, specifically from the field of signal processing. In image processing, a kernel is a 
# small matrix used for blurring, sharpening, edge detection, or other image processing operations.
# kernel -- a small matrix of weights --

# data/Hansen_GFC-2022-v1.10_treecover2000_10S_050W.tif 
# data/Hansen_GFC-2022-v1.10_treecover2000_10S_060W.tif 
# data/Hansen_GFC-2022-v1.10_treecover2000_20S_050W.tif 
# data/Hansen_GFC-2022-v1.10_treecover2000_20S_060W.tif

# data/Hansen_GFC-2022-v1.10_lossyear_10S_050W.tif 
# data/Hansen_GFC-2022-v1.10_lossyear_10S_060W.tif 
# data/Hansen_GFC-2022-v1.10_lossyear_20S_050W.tif 
# data/Hansen_GFC-2022-v1.10_lossyear_20S_060W.tif

assets = pd.read_csv('data/assets_for_deforestation.csv', sep='\t')
assets.shape, assets.uid_gem.unique().size, assets.head()
# TODO: should not be needed...
assets.drop_duplicates('uid_gem', inplace=True)
assets = assets.set_index('uid_gem')
assets.shape, assets.index.unique().size, assets.head()



((23847, 10),
 23847,
           latitude  longitude                 sector capacity_unit  \
 uid_gem                                                              
 L905180 -52.936100 -70.828800     wind power/onshore            mw   
 L100003 -51.546015 -72.231256  coal plant/bituminous            mw   
 L905447 -47.299500 -66.993100     wind power/onshore            mw   
 L905494 -46.634300 -68.392500     wind power/onshore            mw   
 L905533 -46.592500 -67.648600     wind power/onshore            mw   
 
                             asset_name  start_year_first capacity_first  \
 uid_gem                                                                   
 L905180  Vientos Patagonicos wind farm            2020.0           10.0   
 L100003       Río Turbio power station            2022.0          120.0   
 L905447         Bicentenario wind farm            2019.0          101.0   
 L905494         Los Hercules wind farm            2021.0           97.0   
 L905533         Canado

In [3]:


uid_gems = assets.index.unique()
lossyears = range(2001, 2023)

index = pd.MultiIndex.from_tuples(tuples=it.product(uid_gems, lossyears), names=('uid_gem', 'lossyear'))
xda = rx.open_rasterio('data/Hansen_GFC-2022-v1.10_lossyear_20S_060W.tif').squeeze()


In [40]:

#print(xda)

to_crs = xda.rio.crs
from_crs = rio.crs.CRS.from_epsg(4326)
xs, ys = transform(from_crs, to_crs, assets.longitude, assets.latitude)
#print(xda.x.values)
rows, cols = rowcol(xda.rio.transform(), xs, ys)
assets['row'] = rows
assets['col'] = cols
# we may have coordinates beyond the extent of the DataArray...
local_assets = assets[(assets.row >= 0) & (assets.col >= 0)].copy()
display(local_assets)
    
# rio accessors...
# isel_window(window: Window, pad: bool = False) → Dataset | DataArray[source]
# slice_xy(minx: float, miny: float, maxx: float, maxy: float) → Dataset | DataArray[
# xdsc = xds.rio.clip_box, also specifying another CRS

# rows, cols should be same shape and order as df...
#print(local_assets.row)
#print(local_assets.col)

def bar(r): # Optional[xarray.DataArray]
    da = xda.isel(x=slice(r.col-offset-1, r.col+offset), y=slice(r.row-offset-1, r.row+offset))
    if da.size > 0:
        return da
    else:
        return None

def myfunc(row, col, xdarray, size):
    s1 = slice(col-size-1, col+size)
    s2 = slice(row-size-1, row+size)
    foo = xdarray.isel(x=s1, y=s2)
    unique, counts = np.unique(foo.data, return_counts=True)
    years = unique + 2000
    result = dict(zip(years, counts))
    #display(result)
    return result

#vfunc = np.vectorize(myfunc)

print(f'-> {myfunc(1000, 1000, xda, offset)}')
print(f'-> {type(myfunc(1000, 1000, xda, offset))}')

#display(local_assets)

np_row = local_assets.row.to_numpy()
np_col = local_assets.col.to_numpy()

print(type(np_row))
print(type(np_col))
print(np_row.shape)
print(np_col.shape)


Unnamed: 0_level_0,latitude,longitude,sector,capacity_unit,asset_name,start_year_first,capacity_first,start_year,capacity,number_units,row,col
uid_gem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
L906177,-45.761900,169.919700,wind power/onshore,mw,Mahinerangi wind farm,2011.0,36.0,[2011.0],['36.0'],1,103047,919678
L906149,-42.161400,146.688200,wind power/onshore,mw,Cattle Hill wind farm,2020.0,144.0,[2020.0],['144.0'],1,88645,826752
L906143,-41.781600,145.042200,wind power/onshore,mw,Granville Harbour wind farm,2020.0,112.0,[2020.0],['112.0'],1,87126,820168
M1532,-41.612538,148.125448,coal mine/surface,mt per year,Cullenswood Coal Mine,2002.0,0.05,[2002.0],['0.05'],1,86450,832501
L906088,-41.258400,174.668800,wind power/onshore,mw,West Wind wind farm,2009.0,143.0,[2009.0],['143.0'],1,85033,938675
...,...,...,...,...,...,...,...,...,...,...,...,...
L200457,-20.218700,-47.653900,bioenergy,mw,Buriti power station,2011.0,74.0,[2011.0],['74.0'],1,874,49384
L807567,-20.198500,-50.034500,solar power/pv,"mw (peak value, grid connected, or unknown)",Pedranópolis PV Solar Complex,2022.0,30.0,[2022.0],['30.0'],1,794,39862
L200441,-20.087100,-46.039900,bioenergy,mw,Bambuí power station,2014.0,30.0,[2014.0],['30.0'],1,348,55840
L200535,-20.070900,-45.542900,bioenergy,mw,LDC Bioenergia Lagoa da Prata power station,2009.0,40.0,"[2009.0, 2012.0]","['40.0', '45.0']",2,283,57828


-> {2000: 1084, 2007: 2, 2014: 3}
-> <class 'dict'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(557,)
(557,)


In [48]:

#my_lambda = lambda r: xda.isel(x=slice(r[12]-offset-1, r[12]+offset), y=slice(r[11]-offset-1, r[11]+offset))
# important to specify 'otypes' to avoid 'only size-1 arrays can be converted to Python scalars'
# vectorize is just a for loop really so mostly syntactic sugar...
# excluded needs both positional and keyword members to be flexible... 
#otypes=[dict], 
#, excluded=['xda', 'offset']
#, otypes=[dict]
#del my_func
my_func = np.vectorize(myfunc, excluded=['xdarray','size'], cache=False)
                       #, signature='(n, n, (), ()) -> n')


In [64]:

# Nota bene: np.vectorize is consistently a little quicker than apply... %timeit 
result = my_func(row=np_row, col=np_col, xdarray=xda, size=offset)
#%timeit local_assets.apply(lambda r: myfunc(r.row,r.col,xda,offset), axis=1)

#display(result)
print(result.shape)
print(type(result.dtype))

poo = pd.Series(result)
display(poo[poo.notna()])


(557,)
<class 'numpy.dtype[object_]'>


0                         {}
1                         {}
2                         {}
3                         {}
4                         {}
               ...          
552                       {}
553    {2000: 1084, 2021: 5}
554                       {}
555                       {}
556             {2000: 1089}
Length: 557, dtype: object

In [79]:

local_assets['region'] = pd.Series(result, index=local_assets.index)
columns = local_assets.columns.drop('region')
print(columns)
# expand to columns i.e. to wide format... indexed by uid_gem
#display(local_assets[local_assets.index == 'L800190'])
#display(local_assets[local_assets.region.notna()])
#display(local_assets.tail())
#bar = local_assets['region'].apply(pd.Series)
#display(bar.tail())
local_assets_lossyears = pd.concat([local_assets['region'].apply(pd.Series)], axis=1)
# remove nodata before we shift to long format
# TODO: where does the colum 0 come from?
#local_assets.drop(['region'], inplace=True, axis=1)
display(local_assets_lossyears.tail())
#local_assets_lossyears.drop([2000, 0], inplace=True, axis=1)
local_assets_lossyears.shape, local_assets_lossyears.columns
#local_assets_lossyears[local_assets_lossyears[2021].notna()].head(30)
#local_assets_lossyears[local_assets_lossyears.index == 'L800190']


Index(['latitude', 'longitude', 'sector', 'capacity_unit', 'asset_name',
       'start_year_first', 'capacity_first', 'start_year', 'capacity',
       'number_units', 'row', 'col'],
      dtype='object')


Unnamed: 0_level_0,2000,2008,2012,2011,2013,2003,2004,2005,2006,2007,...,2010,2014,2009,2017,2019,2021,2022,2016,2018,2020
uid_gem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L200457,,,,,,,,,,,...,,,,,,,,,,
L807567,1084.0,,,,,,,,,,...,,,,,,5.0,,,,
L200441,,,,,,,,,,,...,,,,,,,,,,
L200535,,,,,,,,,,,...,,,,,,,,,,
L200480,1089.0,,,,,,,,,,...,,,,,,,,,,


((557, 22),
 Index([2000, 2008, 2012, 2011, 2013, 2003, 2004, 2005, 2006, 2007, 2001, 2002,
        2010, 2014, 2009, 2017, 2019, 2021, 2022, 2016, 2018, 2020],
       dtype='int64'))

In [87]:

# the id variables need to uniquely identify each row... 
# TODO: it does not like region as a dict
#foo = pd.wide_to_long(local_assets, stubnames=range(2001, 2023), i=columns, j='lossyear').reset_index()
# value_vars=range(2001, 2023) # takes all except id_vars when not given...
#display(columns)
#display(local_assets.columns)
years = local_assets_lossyears.columns
display(years)
display(local_assets_lossyears)
foo = pd.melt(local_assets_lossyears, value_vars=years, var_name='lossyear', value_name='count', ignore_index=False)
display(foo.tail())
#display(local_assets[local_assets.index == 'L800190'])
#display(foo[foo.index == 'L800190'])


Index([2000, 2008, 2012, 2011, 2013, 2003, 2004, 2005, 2006, 2007, 2001, 2002,
       2010, 2014, 2009, 2017, 2019, 2021, 2022, 2016, 2018, 2020],
      dtype='int64')

Unnamed: 0_level_0,2000,2008,2012,2011,2013,2003,2004,2005,2006,2007,...,2010,2014,2009,2017,2019,2021,2022,2016,2018,2020
uid_gem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L906177,,,,,,,,,,,...,,,,,,,,,,
L906149,,,,,,,,,,,...,,,,,,,,,,
L906143,,,,,,,,,,,...,,,,,,,,,,
M1532,,,,,,,,,,,...,,,,,,,,,,
L906088,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L200457,,,,,,,,,,,...,,,,,,,,,,
L807567,1084.0,,,,,,,,,,...,,,,,,5.0,,,,
L200441,,,,,,,,,,,...,,,,,,,,,,
L200535,,,,,,,,,,,...,,,,,,,,,,


Unnamed: 0_level_0,lossyear,count
uid_gem,Unnamed: 1_level_1,Unnamed: 2_level_1
L200457,2020,
L807567,2020,
L200441,2020,
L200535,2020,
L200480,2020,


In [94]:
bar = foo.groupby(['uid_gem', 'lossyear']).first()
bar = bar.reindex(index)
bar.ffill(inplace=True)
#bar.loc['L800190', ]
bar.reset_index(inplace=True)
#display(bar[bar.uid_gem == 'L800190'])

# pivot on uid_gem and 
far = bar.pivot(index='uid_gem', columns='lossyear', values='count')
display(far.columns)
display(local_assets.columns)
# combine far with local_assets based on index...
local_assets_with_lossyears = pd.merge(local_assets, far, validate='one_to_one', left_on='uid_gem', right_on='uid_gem')

display(local_assets_with_lossyears)

local_assets_with_lossyears.to_csv('data/geotiff-sample.csv')

# Index contains duplicate entries, cannot reshape
#foo.pivot(index='uid_gem', columns='start_year_first')


#len(uid_gem), len(lossyears)


Index([2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
      dtype='int64', name='lossyear')

Index(['latitude', 'longitude', 'sector', 'capacity_unit', 'asset_name',
       'start_year_first', 'capacity_first', 'start_year', 'capacity',
       'number_units', 'row', 'col', 'region'],
      dtype='object')

Unnamed: 0_level_0,latitude,longitude,sector,capacity_unit,asset_name,start_year_first,capacity_first,start_year,capacity,number_units,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
uid_gem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L906177,-45.761900,169.919700,wind power/onshore,mw,Mahinerangi wind farm,2011.0,36.0,[2011.0],['36.0'],1,...,,,,,,,,,,
L906149,-42.161400,146.688200,wind power/onshore,mw,Cattle Hill wind farm,2020.0,144.0,[2020.0],['144.0'],1,...,,,,,,,,,,
L906143,-41.781600,145.042200,wind power/onshore,mw,Granville Harbour wind farm,2020.0,112.0,[2020.0],['112.0'],1,...,,,,,,,,,,
M1532,-41.612538,148.125448,coal mine/surface,mt per year,Cullenswood Coal Mine,2002.0,0.05,[2002.0],['0.05'],1,...,,,,,,,,,,
L906088,-41.258400,174.668800,wind power/onshore,mw,West Wind wind farm,2009.0,143.0,[2009.0],['143.0'],1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L200457,-20.218700,-47.653900,bioenergy,mw,Buriti power station,2011.0,74.0,[2011.0],['74.0'],1,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
L807567,-20.198500,-50.034500,solar power/pv,"mw (peak value, grid connected, or unknown)",Pedranópolis PV Solar Complex,2022.0,30.0,[2022.0],['30.0'],1,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,5.0,5.0
L200441,-20.087100,-46.039900,bioenergy,mw,Bambuí power station,2014.0,30.0,[2014.0],['30.0'],1,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
L200535,-20.070900,-45.542900,bioenergy,mw,LDC Bioenergia Lagoa da Prata power station,2009.0,40.0,"[2009.0, 2012.0]","['40.0', '45.0']",2,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [None]:
xda