# Load and analyse data

## Import Packages
in this section all packages that are needed are imported and if needed installed

In [None]:
!pip install rasterio

In [2]:
import rasterio as rio
import numpy as np
import pandas as pd
import glob
from rasterio.plot import show
import seaborn as sns
import matplotlib.pyplot as plt
import re
import os
from scipy.ndimage.filters import uniform_filter
from scipy.ndimage.measurements import variance
from sklearn import preprocessing
from astropy.coordinates import get_sun, AltAz, EarthLocation
from astropy.time import Time
import astropy.coordinates as coord
import astropy.units as u
from datetime import datetime
import tqdm
from tqdm import tqdm
pd.options.mode.chained_assignment = None  # default='warn'


## Load dat from Google Cloud Storage

In [3]:
from google.colab import auth
auth.authenticate_user()

In [None]:
!curl https://sdk.cloud.google.com | bash

In [None]:
!gcloud init --skip-diagnostics --project earthengine-geouu # --account vandenelsen.max@gmail.com

In [None]:
!gsutil -m cp gs://frozen-lake-ee/SarExportLakeImgs/*.tif .
!gsutil -m cp gs://frozen-lake-ee/SarExport/*.csv .

In [7]:
# list of countries 
studyareas = ['Alaska','Canada','Finland','Russia']

# Split water and ice into different lists 
classes = ['Water', 'Ice']
root_dir = '/content/'

# create empty list for each class and fill them with files 

for studyarea in studyareas:
  sourceWater = []
  sourceIce = []

  for cls in classes:
    dir = root_dir + studyarea + cls + '*.tif'
    files = glob.glob(dir)
    for file in files:
      src = rio.open(file)
      if 'Water' in file:
        sourceWater.append(src)
      else: 
        sourceIce.append(src)
  w = sourceWater[0].read(5)
  print(len(w[w==1]))
  print('Total number of water images of', studyarea, len(sourceWater))
  print('Total number of ice images of', studyarea, len(sourceIce))

563106
Total number of water images of Alaska 39
Total number of ice images of Alaska 42
423291
Total number of water images of Canada 56
Total number of ice images of Canada 40
661422
Total number of water images of Finland 230
Total number of ice images of Finland 97
1227923
Total number of water images of Russia 48
Total number of ice images of Russia 43


## Functions
This section of the notebook contains all the functions that are needed to prepare the data and transform it into a pandas dataframe. The functions have the following purpose, respectively:
- Min max normalization function
- Function to apply lee speckle filter
- function to transform image into dataframe
- function to append all dataframes into a large dataframe that contains all the images

In [23]:
################# LEE FILTER ###################
def getBinMean(img, size):
  imgZero = np.nan_to_num(img)
  
  # Create binary map
  imgBin = img.copy()
  imgBin[~np.isnan(img)] = 1
  imgBin[np.isnan(img)] = 0

  # Calculate moving window 
  mwMean = uniform_filter(imgZero, (size, size))
  ratio = uniform_filter(imgBin, (size, size))

  # correct for ratios 
  output = np.divide(mwMean, ratio, out=np.zeros_like(mwMean), where=ratio!=0)

  return output 

def lee_filter(img, size):
  mu_k = getBinMean(img, size)

  imgsqr = img**2
  sqrMu_k = getBinMean(imgsqr, size)

  sigma_k = sqrMu_k - mu_k**2
  sigma = np.nanvar(img)
  W = sigma_k/(sigma_k+sigma)

  R_hat = mu_k + W * (img - mu_k)

  return R_hat 

################# MOVING WINDOW VARIANCE ###################
def window_variance(img, size):
  mean = getBinMean(img, size)
  imgsqr = img**2
  sqrmean = getBinMean(imgsqr, size)
  variance = sqrmean - mean**2
  return variance

################# IMAGE TO DATAFRAME ###################
def img_to_dataframe(img, listOfBandNames, moving_window):
  # Read dataset and define shape
  data = img.read()
  (nbands, rows, cols) = data.shape

  # apply lee filter to VV and VH
  for i in range(2):
    data[i] = lee_filter(data[i], moving_window)

    # determine neigbors 
    #left = np.roll(data[i], 1, axis = 1).reshape(1, rows,cols)
    #right = np.roll(data[i], -1, axis = 1).reshape(1, rows,cols)
    #up = np.roll(data[i], 1, axis = 0).reshape(1, rows,cols)
    #down = np.roll(data[i], -1, axis = 0).reshape(1, rows,cols)
    windowVar = window_variance(data[i], moving_window).reshape(1, rows,cols)
    data = np.concatenate([data, windowVar], axis = 0)

  # make id band
  n = rows*cols
  unique = np.arange(n)
  idBand = unique.reshape(1, rows,cols)

  # add ID band to image data
  dataID = np.append(data, idBand, axis = 0)

  # reshape array to fit in dataframe
  newRows = rows*cols
  newCols = dataID.shape[0] 
  dataRes = dataID.reshape(newCols, newRows).transpose()

  #include ID
  colnames = listOfBandNames
  addCols = ['VVwindowVar', 
             'VHwindowVar',
             'pixelId']
  df = pd.DataFrame(dataRes, columns = colnames + addCols)

  return df


def imgCollection_to_df(list_of_imgs, ImageData, listOfBandNames, listOfDataTypes, window, samplesize):
  
  df = pd.DataFrame([])
  for k in tqdm(range(len(list_of_imgs))):
    open_df = img_to_dataframe(list_of_imgs[k], listOfBandNames, window)
    df_filter = open_df.loc[open_df['lake'] == 1]
    
    df_sample = df_filter.sample(n=samplesize)

    string = list_of_imgs[k].name
    df_sample['imgId'] = re.split('_|/', string)[3]
    df_sample['countryState'] = re.split('_|/', string)[2]

    df = pd.concat([df, df_sample], ignore_index = True)

  #sample_size = int((sample_ratio * df.shape[0])/2)
  #df_sample = df.groupby(['ice']).sample(n=sample_size)

  bandNames = listOfBandNames + ['pixelId', 'imgId', 'countryState']
  dataTypes = listOfDataTypes + [int, int, str]
  convertdict = dict(zip(bandNames, dataTypes))
  df = df.astype(convertdict)

  dfMerge = pd.merge(df, imgInfo, on = ['imgId', 'countryState'], how = 'left')

  dfMerge['VV/VH'] = abs(dfMerge['VV'])/abs(dfMerge['VH'])
  dfMerge['windRes'] = (dfMerge['windU']**2) + (dfMerge['windV']**2)**(1/2)
  u = dfMerge.iloc[:,5]
  v = dfMerge.iloc[:,6]
  dfMerge['windDir'] = np.mod(180+np.rad2deg(np.arctan2(u, v)),360)

  dfMerge = dfMerge.drop(columns = ['countryState'], axis = 1)



  return dfMerge

## Get final processed train, test and validationsets 
### Load CSV with information

In [9]:
csvFiles = glob.glob('/content/features*csv')

imgInfo = pd.DataFrame([])

for csv in csvFiles:
  data = pd.read_csv(csv)
  imgInfo = pd.concat([imgInfo, data], ignore_index = True)

imgInfo = imgInfo.drop(['.geo', 'system:index'], axis = 1)

imgInfo['Date'] =  pd.to_datetime(imgInfo['Date'], format='%Y-%m-%d %H:%M:%S')
imgInfo['Date'] = imgInfo.Date.dt.round('1s')
imgInfo['month'] = imgInfo.Date.dt.to_period('M')

imgInfo[imgInfo['country'] == 'Canada']

barData = imgInfo.groupby(['country', 'state', 'month'], as_index = False).imgID.count()

imgInfo['countryState'] = imgInfo['country'] + imgInfo['state']

imgInfo = imgInfo.drop(['lon', 'lat', 'state'], axis = 1)
imgInfo.rename(columns = {'imgID':'imgId'}, inplace = True)
imgInfo.dtypes

Date            datetime64[ns]
country                 object
imgId                    int64
month                period[M]
countryState            object
dtype: object

### Test/train split 
First approach, split data in train test based on the images. 

In [19]:
# Split water and ice into different lists 
classes = ['Water', 'Ice']
root_dir = '/content/'

# set ratios 
test_ratio = 0.2

# create empty list for each class and fill them with files 
sourceWater = []
sourceIce = []

for cls in classes:
  dir = root_dir + '*' + cls + '*.tif'
  files = glob.glob(dir)
  for file in files:
    src = rio.open(file)
    if 'Water' in file:
      sourceWater.append(src)
    else: 
      sourceIce.append(src)

#Sample Training and testing images 
np.random.seed(1)
trainWater, testWater = np.split(np.array(sourceWater),
                                 [int(len(sourceWater)* (1 - test_ratio))])

trainIce, testIce = np.split(np.array(sourceIce),
                             [int(len(sourceIce)* (1 - test_ratio))])

train = np.append(trainWater, trainIce)
test = np.append(testWater, testIce)
print('Total number of train images', len(train))
print('Total number of test images', len(test))


Total number of train images 475
Total number of test images 120


### Load all images and apply function
Second approach, load all images and transform images to dataframe and sample pixels per image.

In [None]:
tifImgsPath = '*.tif'
files = glob.glob(os.path.join("/content/", tifImgsPath))

source = []

for file in files:
  src = rio.open(file)
  source.append(src)

print('Total number of images: ',len(source))

Total number of images:  595


In [20]:
train[1].read().shape == (7,2000,2000)

for i in range(len(train)):
  if train[i].read().shape == (7,2000,2000):
    print(train[i])

In [25]:
# input parameters function
bands = ['VV', 'VH', 'angle', 'ice', 'lake', 'windU', 'windV', 'cont', 'corr', 'ent']
dtypes = [float, float, float, int, int, float, float, float, float, float]
window = 3
imageSample = 2000

In [28]:
data = imgCollection_to_df(train, imgInfo, bands, dtypes, window, imageSample)

100%|██████████| 475/475 [26:40<00:00,  3.37s/it]


In [30]:
print('dimensions of the data:', data.shape)
print(data.head())

dimensions of the data: (950000, 20)
          VV         VH      angle  ice  lake     windU     windV  \
0 -25.610180 -32.770473  35.804420    0     1  1.676041  2.994339   
1 -21.608246 -28.077782  36.051094    0     1  1.709244  2.948441   
2 -18.697680 -24.187178  35.815872    0     1  1.676041  2.994339   
3 -19.537649 -27.114115  36.027145    0     1  1.709244  2.948441   
4 -26.375553 -30.735609  36.068623    0     1  1.709244  2.948441   

          cont      corr       ent  VVwindowVar  VHwindowVar  pixelId  imgId  \
0  1047.055176  0.660649  4.168513     7.610413     7.816162  3779274    170   
1   154.628479  0.501768  4.229198     0.072815     0.307861  2316635    170   
2  1436.851562  0.769786  4.342926     3.617493     3.084595  3745241    170   
3   175.116760  0.706706  4.099168     0.659851     2.446777  1112834    170   
4   946.104614  0.581719  4.281530     1.873474     2.241760  2344576    170   

                 Date  country    month     VV/VH   windRes     win

### Export dataframe to drive

In [29]:
from google.colab import drive
drive.mount('drive')

root_path = '/content/drive/My Drive/Thesis/Data/'
fileName =  'Train' + 'ImageSample' + str(imageSample) + '.csv'

data.to_csv(os.path.join(root_path, fileName), encoding='utf-8', index=False)

Mounted at drive


### Visualisations 

In [None]:
sns.set(font_scale=1)
sns.set_style("white")

p = sns.catplot(
    data=barData,
    x="month",
    y="imgID",
    col="country",
    col_wrap=2,
    kind="bar",
    hue="state",
    sharex=True,     
    dodge=False,
    sharey=True,
)

for axes in p.axes.flat:
    axes.set_xticklabels(
        axes.get_xticklabels(), rotation=65, horizontalalignment="right"
    )

# plt.tight_layout()
p.set_xlabels("")

p.fig.set_figwidth(25)
p.fig.set_figheight(10)