In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import rasterio as rio
from rasterio import features
from rasterio.windows import Window
import matplotlib.pyplot as plt
import numpy as np
import pprint as pp
import itertools as it
from tqdm import tqdm
import time


In [None]:
from rasterio.plot import show
from rasterio.warp import transform

# Input rasters. Only `foo` is opened in this cell; `lossyear` and `datamask`
# are just path strings here.
lossyear = 'data/Hansen_GFC-2022-v1.10_lossyear_50N_000E.tif'
datamask = 'data/Hansen_GFC-2022-v1.10_datamask_50N_000E.tif'
foo = 'data/TCL_DD_2022_20230407.tif'

# Two x/y coordinate pairs; not used by this cell.
# NOTE(review): presumably corner coordinates in a projected CRS, kept for use with
# rasterio.warp.transform - confirm.
xs = np.array((3327493.433, 16177493.43))
ys = np.array((7389201.61, -580798.392))

# The dataset handle is deliberately left open: the next cell reads `src.transform`.
src = rio.open(foo)
# First band of the raster as a 2-D array.
band = src.read(1)



In [None]:
# Vectorise every pixel whose value is not 12.
# NOTE(review): 12 looks like a background / no-data class - confirm against the dataset.
mask = np.not_equal(band, 12)
# Lazy generator of (GeoJSON-like geometry, pixel value) pairs, georeferenced
# through the dataset's affine transform.
shapes = features.shapes(band, mask=mask, transform=src.transform)
first_shape = next(shapes)
pp.pprint(first_shape) # first element
    

({'coordinates': [[(1.4645000000000001, 50.0),
                   (1.4645000000000001, 49.99975),
                   (1.46525, 49.99975),
                   (1.46525, 49.9995),
                   (1.4655, 49.9995),
                   (1.4655, 50.0),
                   (1.4645000000000001, 50.0)]],
  'type': 'Polygon'},
 5.0)

In [None]:
# rasterio.features.dataset_features # works on raster value rather than mask...

# Vectorise the pixels of band 1 whose value is 1 into polygons and collect them in a
# GeoDataFrame with the pixel value stored in the 'deforestation' column.
with rio.open('data/TCL_DD_2022_20230407.tif') as src:
    # TODO: check src.count for number of bands...
    print(src.meta)
    band = src.read(1)
    mask = band == 1
    # Each item is a GeoJSON-like feature dict (an object implementing the
    # __geo_interface__ feature layout). Passing the dataset transform makes the
    # coordinates georeferenced instead of raw row/column pixel offsets.
    results = (
        {'properties': {'deforestation': v}, 'geometry': s}
        for s, v in features.shapes(band, mask=mask, transform=src.transform)
    )
    geoms = list(results)
    gdf = gpd.GeoDataFrame.from_features(geoms)
    # Tag the frame with the raster's CRS and reproject to EPSG:4326 (GPS lon/lat).
    # Skipped when nothing was vectorised or the raster declares no CRS.
    if geoms and src.crs is not None:
        gdf = gdf.set_crs(src.crs).to_crs(epsg=4326)

gdf.head()


geometry	deforestation
0	POLYGON ((2870.000 104.000, 2870.000 105.000, ...	4.0
1	POLYGON ((2777.000 110.000, 2777.000 111.000, ...	4.0
2	POLYGON ((2778.000 111.000, 2778.000 112.000, ...	4.0
3	POLYGON ((2730.000 113.000, 2730.000 114.000, ...	4.0
4	POLYGON ((2737.000 113.000, 2737.000 114.000, ...	4.0

In [None]:
# rasterio.features.dataset_features # works on raster value rather than mask...

# Vectorise the non-zero pixels of a window of the lossyear raster into polygons.
with rio.open('data/Hansen_GFC-2022-v1.10_lossyear_20S_060W.tif') as src:
    # TODO: check src.count for number of bands...
    print(src.meta)
    # Window(col_off, row_off, width, height): a 6000 x 6000 pixel cut-out.
    band = src.read(1, window=Window(2100, 2000, 6000, 6000))
    #band = src.read(1)
    print(band.shape)
    # Keep only pixels with a non-zero value.
    mask = band != 0
    # Each item is a GeoJSON-like feature dict (an object implementing the
    # __geo_interface__ feature layout).
    # NOTE: no transform is passed, so polygon coordinates are pixel offsets inside
    # the window and the resulting frame has no CRS. Later cells compute areas from
    # these coordinates, so the areas are in pixel units.
    # TODO: result should be in EPSG 4326 i.e. GPS
    results = (
        {'properties': {'lossyear': v}, 'geometry': s}
        # connectivity, 4 on edges, 8 on edges and corners...
        for s, v in features.shapes(band, mask=mask, connectivity=8)
    )
    geoms = list(results)
    gdf = gpd.GeoDataFrame.from_features(geoms)
    # Independent copy used by the following cells (replaces the concat of an
    # empty GeoDataFrame with gdf, which produced the same rows).
    rdf = gdf.copy()

print(f'Chosen Window results in GeoDataFrame of .shape: {rdf.shape}')
rdf.head()


In [None]:
# Map of the vectorised polygons, coloured by their 'lossyear' value, with a legend.
rdf.plot(legend=True, column='lossyear')

In [None]:
# Spatial self-join: one row per (polygon, intersecting polygon) pair. The join only
# pairs rows up - sjoin does not modify the geometry...
intersects = rdf.sjoin(rdf, predicate="intersects", how="left")
# How many join rows each left-hand polygon produced.
pair_counts = intersects.index.value_counts()
intersects.shape, pair_counts

In [None]:
# Move the left-hand polygon index of the self-join into an explicit 'index_left'
# column, next to the 'index_right' column the join produced.
# Guarded so that re-running the cell does not add a second 'index_left' column.
if 'index_left' not in intersects.columns:
    intersects = intersects.reset_index().rename(columns={'index': 'index_left'})
intersects.head()


In [None]:
# Nota bene: Robert Norris - dissolving on "index_left" removes the self-intersection
# and aggregates index_right, but it also aggregates all other values into lists and
# spends far too much time in unary_union on geometry, the result of which we do not need.
#temp = intersects.dissolve("index_left", aggfunc=lambda x: x.tolist(),)

# Instead: keep one row per polygon (its self-pair) and attach the sorted array of
# every polygon index it intersects, aggregated from 'index_right' only.
groups = intersects.groupby('index_left')
is_self_pair = intersects['index_left'].eq(intersects['index_right'])
temp = intersects.loc[is_self_pair].set_index('index_left')

neighbour_lists = groups['index_right'].aggregate(lambda members: members.tolist())
temp['indices'] = neighbour_lists
temp['indices'] = temp['indices'].map(np.sort)

temp = temp.rename_axis(None)
# The raster stores the year as an offset; turn it into a calendar year.
temp['lossyear'] = temp['lossyear_left'].astype("int") + 2000
temp = temp.drop(columns=['index_right', 'lossyear_left', 'lossyear_right'])

temp.head()


In [None]:

group_id = 'group'

# Give every polygon the id of the connected cluster it belongs to: two polygons are
# in the same cluster when they intersect directly or through a chain of intersecting
# polygons. temp['indices'] holds, per polygon, the indices of all polygons it
# intersects (itself included).
# See https://stackoverflow.com/questions/73566774/group-by-and-combine-intersecting-overlapping-geometries-in-geopandas
#
# A single pass of "reuse the first already-labelled neighbour, otherwise open a new
# group" is not enough: it never merges two groups that are only later found to be
# connected. Traced example of that failure:
#
#                  indices  group
# 0                    [0]      0
# 1                    [1]      1
# 2                 [2, 3]      2
# 3           [2, 3, 4, 6]      2
# 4        [3, 4, 6, 7, 9]      2
# 5                 [5, 6]      5   <- should be 2
# 6  [3, 4, 5, 6, 7, 8, 9]      5   <- should be 2
#
# A union-find (disjoint-set) pass merges clusters whenever a link between them is
# seen, so rows 2-6 above all end up in group 2. The group id is the index label of
# the first row of the cluster.

start = time.time()

indices = temp['indices'].to_numpy()
labels = temp.index.to_numpy()
# Map an index label to its row position so the parent table can be a plain list.
position_of = {label: position for position, label in enumerate(labels)}
# parent[p] == p marks p as the representative (root) of its cluster.
parent = list(range(len(labels)))


def _find_root(node):
    """Return the root position of `node`'s cluster, compressing the path walked."""
    root = node
    while parent[root] != root:
        root = parent[root]
    while parent[node] != root:
        following = parent[node]
        parent[node] = root
        node = following
    return root


for position, neighbours in tqdm(enumerate(indices), total=len(temp)):
    root = _find_root(position)
    for label in neighbours:
        other = _find_root(position_of[label])
        if other == root:
            continue
        # Always keep the smaller position as root, so a cluster's root is its first row.
        if other < root:
            root, other = other, root
        parent[other] = root

groups = pd.Series(
    [labels[_find_root(position)] for position in range(len(labels))],
    index=temp.index,
)
end = time.time()
print(f'Loop over {len(temp)} took {end-start}s')

temp[group_id] = groups.copy()

# Merge the geometries that share both a group and a lossyear into one row each, so
# disjoint geometry from the same lossyear inside a group becomes a MULTIPOLYGON...
dissolve_keys = [group_id, 'lossyear']
temp2 = temp.dissolve(by=dissolve_keys)
#temp.reset_index(inplace=True)
print(f'intersects.dissolve on group_id, lossyear in GeoDataFrame of .shape: {temp2.shape}')

temp2.head(30)
#temp['indices']




In [None]:
# Peek at the first five rows of the dissolved frame.
temp2.head(n=5)

In [None]:
# Expand the frame to one row per (group, year) for every year 2001..2022; the
# combinations that have no geometry in temp2 come out as all-missing rows.
lossyears = range(2001, 2023)
group_ids = temp2.index.get_level_values(0).unique()

#temp3 = temp2.set_index([group_id, 'lossyear'])
group_year_pairs = [(gid, year) for gid in group_ids for year in lossyears]
index = pd.MultiIndex.from_tuples(group_year_pairs, names=[group_id, 'lossyear'])
temp3 = temp2.reindex(index)

temp3.head(23)

In [None]:
# Beware of chained indexing: it may call __getitem__ before __setitem__, which will fail on NaN/None etc.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

#temp.index.names, temp.index.levels, temp.index.codes
#temp.isna() # shows only rows where at least one column value is set...
#temp.fillna(0)
#temp.loc[(1, 2018), 'geometry'] = None
#temp.loc[(1, 2017), 'index_right'] = 666
#temp.loc[:,].isnull().sum()
#temp.loc[temp.loc[:,].isna(), 'geometry']
#temp.geometry.fillna()
#temp.head()
#temp.loc[1,2019].geometry = None #shapely.geometry.Polygon([])


In [None]:
# Years without any geometry for a group get an empty polygon, so that area and
# union computations below work on every row.
missing_geometry = temp3['geometry'].isna()
temp3.loc[missing_geometry, 'geometry'] = Polygon([])

# Per-row area and its running total within each group.
temp3['area'] = temp3.geometry.area
temp3['cum_area'] = temp3.groupby(group_id)['area'].cumsum()


def running_union(accumulated, geometry):
    """Union of everything accumulated so far with the next geometry."""
    return accumulated.union(geometry)


# Running union of each group's geometries, in index order.
for gid in tqdm(group_ids.to_numpy()):
    yearly_geometry = temp3.loc[gid, 'geometry']
    temp3.loc[gid, 'cum_geometry'] = list(it.accumulate(yearly_geometry, func=running_union))

temp3 = temp3.reset_index()

# Discard the rows whose own geometry has zero area (the empty-polygon fillers).
zero_area_rows = temp3.index[(temp3.area == 0).to_numpy()]
temp3 = temp3.drop(index=zero_area_rows)

print(f'...and a final GeoDataFrame of .shape: {temp3.shape}')

temp3.to_csv('data/geoply-sample.csv')
temp3.head(30)