##### Prerequisites 

# A case study on deforestation through illegal amber mining in Ukraine


##### Abstract

## 1. Introduction

## 2. Data and Methods

### 2.1 Data

### 2.2 Methods
#### 2.2.1 Preparation

In [1]:
# standard library and third party libs
import os
import re
import rasterio
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
from collections import namedtuple
from shapely.geometry import Point
from sklearn.ensemble import RandomForestClassifier
from bokeh.plotting import show, figure, gridplot, output_notebook

# custom libs
from src.landsat import LandsatArchive
from src.utils import (get_data_dir,
                       ndvi,
                       RANDOM,
                       l7_radiance,
                       l7_reflectance,
                       l8_reflectance,
                       write,
                       clip_raster,
                       reproject_from,
                       draw_raster_sample,)


# make source data folders
# each dotted entry maps to a nested directory, e.g. data.core -> data/core
directories = """
data
data.core
data.proc
data.prep
data.arch
"""

for item in directories.split():
    path = os.sep.join(item.split('.'))
    # exist_ok keeps the cell re-runnable when the folders already exist,
    # while genuine failures (no write permission, a regular file in the
    # way) surface here instead of being silently swallowed and showing up
    # later as an unrelated I/O error
    os.makedirs(path, exist_ok=True)

# convenient access to data dir: built from the absolute path of ./data;
# the cells below use the attributes DIRS.core, DIRS.proc, DIRS.prep and DIRS.arch
data_root = Path('data').resolve()
DIRS = get_data_dir(str(data_root))

# WGS84 (EPSG:4326) as an init-style CRS dict
# NOTE(review): newer pyproj releases deprecate the {'init': ...} form — confirm
# against the pinned versions before changing it
WGS84 = {'init': 'epsg:4326'}

# area of interest; it is used to clip the raster after reprojection to WGS84
Bounds = namedtuple('Bounds', ['left', 'bottom', 'right', 'top'])
AIO = Bounds(left=25.6, bottom=51.0, right=27.6, top=51.8)

# total number of threads to use (passed as n_jobs to the classifier)
THREADS = 12

# force bokeh notebook output
output_notebook()

#### 2.2.2 Tree cover reference layer

In [2]:
# load the Landsat 7 scene: read the scene folder if it is already there,
# otherwise read the downloaded archive and pass the folder as extract_to
# (presumably the archive is unpacked there — see LandsatArchive.read)
l7_2000_dir = DIRS.core / 'l7_2000'
l7_2000_archive = DIRS.arch / 'LE07_L1TP_184024_20000612_20170211_01_T1.tar.gz'

if not l7_2000_dir.is_dir():
    l7_2000 = LandsatArchive.read(l7_2000_archive, extract_to=l7_2000_dir)
else:
    l7_2000 = LandsatArchive.read(l7_2000_dir)

print(l7_2000)


        Spacecraft: LANDSAT_7
        Sensor: ETM
        Date acquired: 2000-06-12
        Cloud cover: 1.0
        Quality: 9
        


In [3]:
# compute band metrics and compose band stack
# the band number is parsed from the file name of each Landsat 7 band image
regex = re.compile(r'.*LE07.*_B(\d)\.TIF')

# these two lookups do not depend on the band, so read them once up front
ESD = l7_2000.metadata.get('IMAGE_ATTRIBUTES', 'EARTH_SUN_DISTANCE')
SE = l7_2000.metadata.get('IMAGE_ATTRIBUTES', 'SUN_ELEVATION')

# the order here defines the layer order of the stack; NDVI is appended last
band_names = 'red green blue nir'.split()

band_stack = []
for name in band_names:
    img = l7_2000[name]
    try:
        img_data = img.read(1)

        match = regex.match(img.name)
        if match is None:
            # fail with a readable message instead of an AttributeError on None
            raise ValueError('cannot parse band number from file name: %s' % img.name)
        band_idx = match.group(1)

        # per-band calibration constants from the scene metadata
        RMIN = l7_2000.metadata.get('MIN_MAX_RADIANCE', 'RADIANCE_MINIMUM_BAND_%s' % band_idx)
        RMAX = l7_2000.metadata.get('MIN_MAX_RADIANCE', 'RADIANCE_MAXIMUM_BAND_%s' % band_idx)
        QCMIN = l7_2000.metadata.get('MIN_MAX_PIXEL_VALUE', 'QUANTIZE_CAL_MIN_BAND_%s' % band_idx)
        QCMAX = l7_2000.metadata.get('MIN_MAX_PIXEL_VALUE', 'QUANTIZE_CAL_MAX_BAND_%s' % band_idx)

        # digital numbers -> radiance -> reflectance
        radiance = l7_radiance(img_data, QCMIN, QCMAX, RMIN, RMAX, src_nodata=0)
        reflectance = l7_reflectance(radiance, ESD, SE, int(band_idx), src_nodata=0.0)

        band_stack.append(reflectance)
    finally:
        # release the band image even when one of the steps above raises
        img.close()

# look the inputs up by name rather than by magic position in the stack
ndvi_data = ndvi(band_stack[band_names.index('red')],
                 band_stack[band_names.index('nir')])

band_stack.append(ndvi_data)
rgbn_data = np.array(band_stack, dtype=np.float32)

In [4]:
# reproject and clip
# the band stack is written with the georeferencing of the red band
img = l7_2000['red']
crs, transform = img.crs, img.transform
img.close()

# 1) persist the stack in the scene's native CRS
rgbn_path = write(rgbn_data, str(DIRS.proc / 'rgbn.tif'),
                  crs=crs, transform=transform, driver='GTiff')

# 2) reproject to WGS84, 3) cut out the area of interest;
# note that `transform` now refers to the clipped raster
reproject = reproject_from(rgbn_path, WGS84, str(DIRS.proc / 'rgbn_reproject.tif'))
clip, transform = clip_raster(reproject, AIO)

# 4) persist the clipped stack; its path is reused by the sampling cell
clip_path = write(clip, str(DIRS.proc / 'rgbn_clip.tif'),
                  crs=WGS84, transform=transform, driver='GTiff')

In [5]:
# sample image
# read pixel data and transform inside a context manager so the file handle
# is released; ref_data and transform are reused by the classification cell
with rasterio.open(clip_path, 'r') as ref_img:
    ref_data = ref_img.read()
    transform = ref_img.transform

sample = draw_raster_sample(ref_data, affine=transform, samples=500,
                            columns=['red', 'green', 'blue', 'nir', 'ndvi'])

# one point per sampled pixel, built from the x/y columns of the sample
points = [Point(x, y) for x, y in zip(sample['x'], sample['y'])]

# share the sample's index so geometries stay attached to their own rows
# even if that index is not a plain 0..n-1 range
geometry = gpd.GeoSeries(points, index=sample.index)
geo_df = gpd.GeoDataFrame(sample, geometry=geometry)
geo_df.crs = WGS84

# NOTE(review): this goes to DIRS.proc, while the training cell reads
# DIRS.prep / 'samples.shp' — presumably a labelled copy; TODO confirm
geo_df.to_file(str(DIRS.proc / 'samples.shp'))

In [6]:
# train rf
# NOTE(review): the samples are read from DIRS.prep, whereas the sampling cell
# writes DIRS.proc / 'samples.shp', and a `label` column is required here that
# the sampling cell does not create — presumably the points are labelled by
# hand in between; TODO document that step
samples = gpd.read_file(str(DIRS.prep / 'samples.shp'))

# random split: rows whose uniform draw is <= 0.75 go to training, the rest to testing
samples['is_train'] = RANDOM.uniform(size=len(samples)) <= 0.75
train = samples[samples.is_train]
test = samples[~samples.is_train]

# first five columns are the predictors — assumed to be red, green, blue,
# nir, ndvi in the band order of the raster classified later; TODO confirm
features = train.columns[:5]
labels = list(train['label'])

clf = RandomForestClassifier(n_estimators=1000,
                             n_jobs=THREADS,
                             random_state=RANDOM,
                             verbose=1)
clf.fit(train[features], labels)

# predictions for the held-out rows (not evaluated in this cell)
assessment = clf.predict(test[features])

[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.7s finished
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.2s finished


In [7]:
# classification
# ref_data is assumed to be (band, row, col), as returned by the raster read()
n_bands, n_rows, n_cols = ref_data.shape

# flatten the cube to one row per pixel and one column per band; because the
# array is transposed first, pixels are enumerated column by column
px_matrix = ref_data.T.reshape((n_rows * n_cols, n_bands))
df = pd.DataFrame.from_records(px_matrix, columns=features)

classified = clf.predict(df[features])

# undo the column-wise flattening: (col, row) grid, transposed back to (row, col)
classified = np.reshape(classified, (n_cols, n_rows)).T
classified = classified.astype(np.uint8)

treecover_2000 = write(classified, str(DIRS.proc / 'treecover_2000.tif'),
                       driver='GTiff', crs=WGS84, transform=transform,
                       compress='lzw')

[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:   13.5s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   31.9s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   57.9s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:  1.2min finished


## 3. Results

## 4. Discussion