# Data Processing Notebook

## Imports and Setup

In [1]:
import os
import pandas as pd
import numpy as np

import geopandas as gpd
import rasterio as rio

# Put ../utils at the front of the module search path so that the project-local
# `geoutils` module can be imported (assumes the notebook is run from its own folder).
import sys
sys.path.insert(0, '../utils')
import geoutils

# Silence everything below ERROR on the root logger and suppress ALL Python
# warnings for the rest of the session. This keeps cell output short, but it
# also hides deprecation and data-quality warnings from every imported library.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

# Reload imported modules before each cell runs, so edits to the local helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Locations

In [2]:
# Local directory layout: every downloaded or generated file lives under `data_dir`.
# The trailing slashes are significant because other cells build file paths by
# plain string concatenation (e.g. `sentinel_dir + image_file`).
data_dir = "../data/"
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
sentinel_dir = data_dir + 'sentinel2/'

# `exist_ok=True` makes the creation idempotent and avoids the check-then-create
# race of `if not os.path.exists(...)`. `data_dir` is listed first for clarity,
# although `os.makedirs` would create it as a parent anyway.
for directory in (data_dir, sentinel_dir, pos_mask_dir, neg_mask_dir):
    os.makedirs(directory, exist_ok=True)

## Download Files from GCS

In [3]:
# Copy the input files from Google Cloud Storage into the local data folders.
# `{name}` is IPython interpolation: it expands to the value of the Python
# variable `name` before the shell command is executed.
# Raster images (DEFLATED_* and CROPPED_* file sets) -> sentinel_dir
!gsutil -q -m cp gs://immap-gee/DEFLATED_gee_*.tif {sentinel_dir}
!gsutil -q -m cp gs://immap-gee/CROPPED_gee_*.tif {sentinel_dir}
# Informal-settlement mask GeoPackages -> pos_mask_dir
!gsutil -q -m cp gs://immap-masks/informal_settlement_masks/*.gpkg {pos_mask_dir}
# Negative-sample mask GeoPackages -> neg_mask_dir
!gsutil -q -m cp gs://immap-masks/negative_sample_masks/*.gpkg {neg_mask_dir}
# Administrative boundaries -> data_dir
!gsutil -q -m cp gs://immap-masks/admin_boundaries/admin_bounds.gpkg {data_dir} 
# NOTE: this message is printed unconditionally; it does not verify that the
# gsutil commands above actually succeeded.
print('Operations completed.')

Operations completed.


## Create Area Dictionary

In [3]:
# Build one record per study area holding the paths of its vector masks and of
# its yearly raster images found in `sentinel_dir`.
area_dict = {}
for area in ['maicao', 'riohacha', 'uribia']:
    # A file belongs to an area when the area name is a substring of its file
    # name; the DEFLATE / CROPPED marker decides which list it goes into.
    filenames = os.listdir(sentinel_dir)
    deflated_paths = sorted(
        sentinel_dir + name
        for name in filenames
        if area in name and 'DEFLATE' in name
    )
    cropped_paths = sorted(
        sentinel_dir + name
        for name in filenames
        if area in name and 'CROPPED' in name
    )

    area_dict[area] = {
        'pos_mask_gpkg': '{}{}_mask.gpkg'.format(pos_mask_dir, area),
        'neg_mask_gpkg': '{}{}-samples.gpkg'.format(neg_mask_dir, area),
        'images': deflated_paths,
        'images_cropped': cropped_paths,
    }

# Display one record as a sanity check.
area_dict['maicao']

{'pos_mask_gpkg': '../data/pos_masks/maicao_mask.gpkg',
 'neg_mask_gpkg': '../data/neg_masks/maicao-samples.gpkg',
 'images': ['../data/sentinel2/DEFLATED_gee_maicao_2016.tif',
  '../data/sentinel2/DEFLATED_gee_maicao_2017.tif',
  '../data/sentinel2/DEFLATED_gee_maicao_2018.tif',
  '../data/sentinel2/DEFLATED_gee_maicao_2019.tif',
  '../data/sentinel2/DEFLATED_gee_maicao_2020.tif'],
 'images_cropped': ['../data/sentinel2/CROPPED_gee_maicao_2016.tif',
  '../data/sentinel2/CROPPED_gee_maicao_2017.tif',
  '../data/sentinel2/CROPPED_gee_maicao_2018.tif',
  '../data/sentinel2/CROPPED_gee_maicao_2019.tif',
  '../data/sentinel2/CROPPED_gee_maicao_2020.tif']}

## Generate Target Raster Masks

### Positive Labels (Informal Settlements)

In [4]:
# For every area, call the project helper `geoutils.generate_mask` with the
# first cropped image, the positive (informal settlement) GeoPackage and an
# output path, then record that output path in the area's record.
for area in area_dict:
    entry = area_dict[area]
    reference_image = entry['images_cropped'][0]
    pos_gpkg = entry['pos_mask_gpkg']
    # Output path: same as the GeoPackage path with 'gpkg' swapped for 'tiff'.
    pos_tiff = pos_gpkg.replace('gpkg', 'tiff')

    geoutils.generate_mask(
        tiff_file=reference_image,
        shape_file=pos_gpkg,
        output_file=pos_tiff,
        plot=False,
    )
    entry['pos_mask_tiff'] = pos_tiff

### Negative Labels (Formal Settlements and Unoccupied Land)

In [5]:
# Build the negative-label masks: keep only the 'unoccupied land' and
# 'formal settlement' polygons from each area's samples file, write them to a
# '-masks' GeoPackage and pass that file to `geoutils.generate_mask`.
negative_classes = ['unoccupied land', 'formal settlement']

# Bound up front so the final expression of this cell cannot raise NameError
# when no area has a negative-samples file.
target_dict = None

for area, value in area_dict.items():
    tiff_file = value['images_cropped'][0]
    samples_file = value['neg_mask_gpkg']
    # The raster keeps the '-samples' stem (e.g. maicao-samples.tiff).
    target_file = samples_file.replace('gpkg', 'tiff')

    if not os.path.isfile(samples_file):
        # Best effort: an area without negative samples is skipped. The raster
        # path is deliberately NOT registered, so later cells cannot pick up a
        # missing or stale file from a previous run.
        print('No negative samples file for {} ({}); skipping.'.format(area, samples_file))
        continue

    gdf = gpd.read_file(samples_file)
    # Normalise the label case before filtering; the lower-cased labels are
    # also what gets written to the filtered GeoPackage.
    gdf['class'] = gdf['class'].str.lower()
    gdf = gdf[gdf['class'].isin(negative_classes)]

    shape_file = samples_file.replace('samples', 'masks')
    gdf.to_file(shape_file, driver='GPKG')

    # The helper returns a pair; only its second element is kept here.
    _, target_dict = geoutils.generate_mask(
        tiff_file=tiff_file,
        shape_file=shape_file,
        output_file=target_file,
        plot=False
    )
    # Register the raster only after it has actually been generated.
    area_dict[area]['neg_mask_tiff'] = target_file

# Second return value of the last `generate_mask` call (None if none ran).
target_dict

{'formal settlement': 2, 'unoccupied land': 3, 'informal settlement': 1}

## Generate Training Set

In [6]:
from tqdm import tqdm

# Assemble one table per area (one row per pixel, one column per band and year,
# plus a 'target' label column) and concatenate the areas into `data`.
data_all = []
for area in area_dict:
    print('Reading {}...'.format(area))

    # Read the first band of each mask as a flat array. The `with` blocks close
    # the raster handles as soon as the pixels are in memory.
    with rio.open(area_dict[area]['pos_mask_tiff']) as pos_src:
        pos_mask = pos_src.read(1).ravel()
    with rio.open(area_dict[area]['neg_mask_tiff']) as neg_src:
        neg_mask = neg_src.read(1).ravel()

    # Merge positive and negative labels. Adding the two arrays would invent
    # wrong class ids wherever a pixel is labelled in both masks (e.g. 1 + 2 = 3),
    # so the positive label wins on overlap instead. The result is identical
    # to the sum when the masks do not overlap.
    # NOTE(review): positive-over-negative precedence is an assumption - confirm.
    mask = np.where(pos_mask != 0, pos_mask, neg_mask)

    band_columns = dict()

    # Iterate over each yearly image
    image_list = area_dict[area]['images_cropped']
    for image_file in tqdm(image_list, total=len(image_list)):
        # Text between the last underscore and the following dot,
        # e.g. 'CROPPED_gee_maicao_2016.tif' -> '2016'.
        year = image_file.split('_')[-1].split('.')[0]

        # Read each band (rasterio band indexes are 1-based)
        with rio.open(image_file) as raster:
            for band_number in range(1, raster.count + 1):
                column = 'band{}_{}'.format(band_number, year)
                band_columns[column] = raster.read(band_number).ravel()

    # Cast to a pandas DataFrame and attach the labels
    area_data = pd.DataFrame(band_columns)
    area_data['target'] = mask

    # Drop rows whose values (all bands plus target) sum to zero.
    # Presumably these are empty / no-data pixels - TODO confirm.
    area_data = area_data[area_data.values.sum(axis=1) != 0]
    data_all.append(area_data)

# Concatenate all areas
data = pd.concat(data_all)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Reading maicao...


100%|██████████| 5/5 [00:40<00:00,  8.04s/it]


Reading riohacha...


100%|██████████| 5/5 [01:02<00:00, 12.54s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

Reading uribia...


100%|██████████| 5/5 [00:09<00:00,  1.83s/it]


Data dimensions: (53378064, 61)


Unnamed: 0,band1_2016,band2_2016,band3_2016,band4_2016,band5_2016,band6_2016,band7_2016,band8_2016,band9_2016,band10_2016,...,band4_2020,band5_2020,band6_2020,band7_2020,band8_2020,band9_2020,band10_2020,band11_2020,band12_2020,target
11131,0.1492,0.1331,0.1373,0.1587,0.1735,0.2247,0.2654,0.2486,0.3075,0.0377,...,0.1442,0.1754,0.2019,0.2289,0.2352,0.2579,0.2738,0.3625,0.2616,0
16695,0.1492,0.1359,0.1543,0.1761,0.1799,0.2381,0.2838,0.2657,0.3186,0.0377,...,0.1632,0.1908,0.2187,0.2398,0.253,0.2711,0.2738,0.3587,0.2642,0
16696,0.1492,0.1395,0.1514,0.1644,0.1799,0.2381,0.2838,0.2652,0.3186,0.0377,...,0.1498,0.1908,0.2187,0.2398,0.2464,0.2711,0.2738,0.3587,0.2642,0


## Save and Upload Final Dataset

In [11]:
# Keep only the labelled rows (target != 0), write them to CSV without the
# index column, then report the size and class balance of what was saved.
output_file = data_dir + 'dataset.csv'
is_labelled = data['target'] != 0
data = data[is_labelled]
data.to_csv(output_file, index=False)

class_counts = data['target'].value_counts()
print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(class_counts))
print('Class distribution (normalized):\n{}'.format(class_counts / len(data)))

Data dimensions: (334524, 61)
Class distribution:
3    248172
2     71796
1     14556
Name: target, dtype: int64
Class distribution (normalized):
3    0.741866
2    0.214621
1    0.043513
Name: target, dtype: float64


In [12]:
# Upload the CSV at `output_file` (interpolated by IPython) to the
# gs://immap-training/ bucket.
!gsutil -m cp {output_file} gs://immap-training/

Copying file://../data/dataset.csv [Content-Type=text/csv]...
\ [1/1 files][141.5 MiB/141.5 MiB] 100% Done                                    
Operation completed over 1 objects/141.5 MiB.                                    
