<h1> <center> Exploiting Sentinel-1 imagery time series to detect grasslands in northern Brazil tropical plains</center> </h1>
<h3> <center> Part 3 - Classification </center> </h3>
<center> Arian Ferreira Carneiro </center>
<center>Willian Vieira de Oliveira </center>

## Import required packages

In [None]:
import numpy as np
import pandas as pd
from osgeo import gdal, gdal_array
#from osgeo import osr
from numpy import genfromtxt

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#import matplotlib.pyplot as plt
#import time

## Input parameters

In [5]:
write_files = 'YES'

### Files that contain the pixel values of the area to be classified, the samples and their respective classes

In [2]:
# Directories for the input files
dir_NL_pixels = "OUTPUT/NL.csv"
dir_NL_samples = "OUTPUT/NL_AllSamples_pValues.csv"
dir_NL_classes = "OUTPUT/NL_AllSamples_classes.csv"

dir_Ratio_pixels = "OUTPUT/Ratio.csv"
dir_Ratio_samples = "OUTPUT/Ratio_AllSamples_pValues.csv"
dir_Ratio_classes = "OUTPUT/Ratio_AllSamples_classes.csv"

dir_RGI_pixels = "OUTPUT/RGI.csv"
dir_RGI_samples = "OUTPUT/RGI_AllSamples_pValues.csv"
dir_RGI_classes = "OUTPUT/RGI_AllSamples_classes.csv"

dir_VH_pixels = "OUTPUT/VH.csv"
dir_VH_samples = "OUTPUT/VH_AllSamples_pValues.csv"
dir_VH_classes = "OUTPUT/VH_AllSamples_classes.csv"

dir_VV_pixels = "OUTPUT/VV.csv"
dir_VV_samples = "OUTPUT/VV_AllSamples_pValues.csv"
dir_VV_classes = "OUTPUT/VV_AllSamples_classes.csv"

# Lists of the directories
filenames_C1 = ['NL', 'Ratio', 'RGI', 'VH', 'VV']
list_pixels_C1 = [dir_NL_pixels, dir_Ratio_pixels, dir_RGI_pixels, dir_VH_pixels, dir_VV_pixels]
list_samples_C1 = [dir_NL_samples, dir_Ratio_samples, dir_RGI_samples, dir_VH_samples, dir_VV_samples]
list_classes_C1 = [dir_NL_classes, dir_Ratio_classes, dir_RGI_classes, dir_VH_classes, dir_VV_classes]

dir_output_pixels = "OUTPUT/Classification_pixelValues/"

### Files that contain the metrics computed for the pixels to be classified, the samples and their respective classes

In [6]:
# Directories for the input files
dir_NL_metrics = "OUTPUT/NL_metrics.csv"
dir_NL_samples = "OUTPUT/NL_AllSamples_metrics.csv"
dir_NL_classes = "OUTPUT/NL_AllSamples_classes.csv"

dir_Ratio_metrics = "OUTPUT/Ratio_metrics.csv"
dir_Ratio_samples = "OUTPUT/Ratio_AllSamples_metrics.csv"
dir_Ratio_classes = "OUTPUT/Ratio_AllSamples_classes.csv"

dir_RGI_metrics = "OUTPUT/RGI_metrics.csv"
dir_RGI_samples = "OUTPUT/RGI_AllSamples_metrics.csv"
dir_RGI_classes = "OUTPUT/RGI_AllSamples_classes.csv"

dir_VH_metrics = "OUTPUT/VH_metrics.csv"
dir_VH_samples = "OUTPUT/VH_AllSamples_metrics.csv"
dir_VH_classes = "OUTPUT/VH_AllSamples_classes.csv"

dir_VV_metrics = "OUTPUT/VV_metrics.csv"
dir_VV_samples = "OUTPUT/VV_AllSamples_metrics.csv"
dir_VV_classes = "OUTPUT/VV_AllSamples_classes.csv"

# Lists of the directories
filenames_C2 = ['NL', 'Ratio', 'RGI', 'VH', 'VV']
list_metrics_C2 = [dir_NL_metrics, dir_Ratio_metrics, dir_RGI_metrics, dir_VH_metrics, dir_VV_metrics]
list_samples_C2 = [dir_NL_samples, dir_Ratio_samples, dir_RGI_samples, dir_VH_samples, dir_VV_samples]
list_classes_C2 = [dir_NL_classes, dir_Ratio_classes, dir_RGI_classes, dir_VH_classes, dir_VV_classes]

dir_output_metrics = "OUTPUT/Classification_metrics/"

## Auxiliary functions

### Function to write each classification map to file

In [3]:
def Write_GeoTiff(file, filename, Nrows, Ncols, geotransform, projection):
    driver = gdal.GetDriverByName('GTiff')
    
    dataset_output = driver.Create(filename, Ncols, Nrows, 1, gdal.GDT_Float32)
    dataset_output.GetRasterBand(1).WriteArray(file)
    
    if geotransform is not None:
        gt = list(geotransform)
        dataset_output.SetGeoTransform(tuple(gt))
    dataset_output.SetProjection(projection)
    
    dataset_output = None

## Classification 1: using pixel values

In this procedure, we perform the classification of the data cube considering the time series of pixel values.

In [4]:
#for i in range(1):
for i in range(len(filenames_C1)):
    print("Classifying the ", filenames_C1[i], " data cube...")
    
    pixel_values = pd.read_csv(list_pixels_C1[i])
    sample_values = pd.read_csv(list_samples_C1[i])
    sample_values = sample_values.drop(labels='Class', axis=1) # removing the column 'Class' from the dataframe
    sample_classes = pd.read_csv(list_classes_C1[i])
    
    smp_values = np.float32(sample_values)
    smp_classes = np.float32(sample_classes)
    smp_classes = np.ravel(smp_classes) # converting from column-vector to 1d array (expected by the classifier)
    pixels_values = np.float32(pixel_values)
    
    # ------------------------- SPLITTING THE SAMPLES INTO TRAINING AND TESTING DATASETS -------------------------
    
    # This scikit-learn function separates the datasets in train and test samples
    smp_values_train, smp_values_test, smp_classes_train, smp_classes_test = train_test_split(smp_values, smp_classes, 
                                                                                                test_size=0.3)
    
    rf = RandomForestClassifier(n_estimators=300, min_samples_leaf = 5, max_features = 5, n_jobs=-1, oob_score=True)
    rf.fit(smp_values_train, smp_classes_train) # X and Y parameters (as recognized by the classifier)
    
    Y_train_pred = rf.predict(smp_values_train) # Classifies the training samples (resubstitution validation technique)
    acc_train = accuracy_score(smp_classes_train, Y_train_pred) # Computes the accuracy
    print('    Overall accuracy (resubst): ' + str(round(acc_train,4)))
    #print('-------------------')
    
    Y_test_pred = rf.predict(smp_values_test) # Classifies the testing samples (hold-out validation technique)
    acc_test = accuracy_score(smp_classes_test, Y_test_pred) # Computes the accuracy
    print('    Overall accuracy (holdout): ' + str(round(acc_test,4)))
    confusion_pred = pd.crosstab(Y_test_pred, smp_classes_test, rownames=['Pred'], colnames=['Actual'], 
                                 margins=False, margins_name="Total") # confusion matrix for the testing dataset
    
    confusion_pred.loc['Accuracies','OA_Resubs'] = acc_train
    confusion_pred.loc['Accuracies','OA_Holdout'] = acc_test

    # -----------------------------------------------------------------------------------------------------------

    class_pred = rf.predict(pixels_values) #classification of all pixels
    
    imgPath = "DATA/ee_export/" + filenames_C1[i] + ".tif"
    example_img = gdal.Open(imgPath)

    Nrows = example_img.RasterYSize
    Ncols = example_img.RasterXSize
    GeoTransform = example_img.GetGeoTransform()
    Projection = example_img.GetProjection()

    example_img = None

    classif_array = np.reshape(class_pred, (Nrows-6, Ncols-6))

    # In this study, we disconsidered the border pixels (3-pixel length). Therefore, we need to adjust 
    # the array to follow the same characteristics of the example image (GeoTranform and Projection)
    classif_array_adjusted = np.empty((Nrows, Ncols), np.float32)
    classif_array_adjusted[3:Nrows-3, 3:Ncols-3] = classif_array

    # Writing the classification product and the confusion matrix to files
    if (write_files == 'YES'):
        filename_map = dir_output_pixels + filenames_C1[i] + "_Pixels_RFclassification.tif"
        filename_matrix = dir_output_pixels + filenames_C1[i] + "_Pixels_RFconfusionMatrix.csv"

        try:
            # Classification map
            Write_GeoTiff(classif_array_adjusted, filename_map, Nrows, Ncols, GeoTransform, Projection)
            
            # Confusion matrix
            confusion_pred.to_csv(filename_matrix, sep=',', index=True, header=True, index_label='Pred/Actual', 
                                  encoding='utf-8-sig')
            print("    The products were written to file!")
            
        except Exception as e:
            print(str(e))
    
    print("\n")

Classifying the  NL  data cube...
    Overall accuracy (resubst): 0.9499
    Overall accuracy (holdout): 0.861
    The products were written to file!


Classifying the  Ratio  data cube...
    Overall accuracy (resubst): 0.9337
    Overall accuracy (holdout): 0.7587
    The products were written to file!


Classifying the  RGI  data cube...
    Overall accuracy (resubst): 0.9391
    Overall accuracy (holdout): 0.7172
    The products were written to file!


Classifying the  VH  data cube...
    Overall accuracy (resubst): 0.9507
    Overall accuracy (holdout): 0.8542
    The products were written to file!


Classifying the  VV  data cube...
    Overall accuracy (resubst): 0.9383
    Overall accuracy (holdout): 0.8475
    The products were written to file!




## Classification 2: using basic metrics

In this procedure, we perform the classification of the data cube considering the metrics computed for the time series.

In [7]:
#for i in range(1):
for i in range(len(filenames_C2)):
    print("Classifying the ", filenames_C2[i], " data cube...")
    
    pixel_metrics = pd.read_csv(list_metrics_C2[i])
    sample_metrics = pd.read_csv(list_samples_C2[i])
    sample_metrics = sample_metrics.drop(labels='Class', axis=1) # removing the column 'Class' from the dataframe
    sample_classes = pd.read_csv(list_classes_C2[i])
    
    smp_metrics = np.float32(sample_metrics)
    smp_classes = np.float32(sample_classes)
    smp_classes = np.ravel(smp_classes) # converting from column-vector to 1d array (expected by the classifier)
    pixels_metrics = np.float32(pixel_metrics)
    
    # ------------------------- SPLITTING THE SAMPLES INTO TRAINING AND TESTING DATASETS -------------------------
    
    # This scikit-learn function separates the datasets in train and test samples
    smp_metrics_train, smp_metrics_test, smp_classes_train, smp_classes_test = train_test_split(smp_metrics, smp_classes, 
                                                                                                test_size=0.3)
    
    rf = RandomForestClassifier(n_estimators=300, min_samples_leaf = 5, max_features = 5, n_jobs=-1, oob_score=True)
    rf.fit(smp_metrics_train, smp_classes_train) # X and Y parameters (as recognized by the classifier)
    
    Y_train_pred = rf.predict(smp_metrics_train) # Classifies the training samples (resubstitution validation technique)
    acc_train = accuracy_score(smp_classes_train, Y_train_pred) # Computes the accuracy
    print('    Overall accuracy (resubst): ' + str(round(acc_train,4)))
    #print('-------------------')
    
    Y_test_pred = rf.predict(smp_metrics_test) # Classifies the testing samples (hold-out validation technique)
    acc_test = accuracy_score(smp_classes_test, Y_test_pred) # Computes the accuracy
    print('    Overall accuracy (holdout): ' + str(round(acc_test,4)))
    confusion_pred = pd.crosstab(Y_test_pred, smp_classes_test, rownames=['Pred'], colnames=['Actual'], 
                                 margins=False, margins_name="Total") # confusion matrix for the testing dataset
    
    confusion_pred.loc['Accuracies','OA_Resubs'] = acc_train
    confusion_pred.loc['Accuracies','OA_Holdout'] = acc_test

    # -----------------------------------------------------------------------------------------------------------

    class_pred = rf.predict(pixels_metrics) #classification of all pixels
    
    imgPath = "DATA/ee_export/" + filenames_C2[i] + ".tif"
    example_img = gdal.Open(imgPath)

    Nrows = example_img.RasterYSize
    Ncols = example_img.RasterXSize
    GeoTransform = example_img.GetGeoTransform()
    Projection = example_img.GetProjection()

    example_img = None

    classif_array = np.reshape(class_pred, (Nrows-6, Ncols-6))

    # In this study, we disconsidered the border pixels (3-pixel length). Therefore, we need to adjust 
    # the array to follow the same characteristics of the example image (GeoTranform and Projection)
    classif_array_adjusted = np.empty((Nrows, Ncols), np.float32)
    classif_array_adjusted[3:Nrows-3, 3:Ncols-3] = classif_array

    # Writing the classification product and the confusion matrix to files
    if (write_files == 'YES'):
        filename_map = dir_output_metrics + filenames_C2[i] + "_Metrics_RFclassification.tif"
        filename_matrix = dir_output_metrics + filenames_C2[i] + "_Metrics_RFconfusionMatrix.csv"

        try:
            # Classification map
            Write_GeoTiff(classif_array_adjusted, filename_map, Nrows, Ncols, GeoTransform, Projection)
            
            # Confusion matrix
            confusion_pred.to_csv(filename_matrix, sep=',', index=True, header=True, index_label='Pred/Actual', 
                                  encoding='utf-8-sig')
            print("    The products were written to file!")
            
        except Exception as e:
            print(str(e))
    
    print("\n")

Classifying the  NL  data cube...
    Overall accuracy (resubst): 0.911
    Overall accuracy (holdout): 0.8407
    The products were written to file!


Classifying the  Ratio  data cube...
    Overall accuracy (resubst): 0.8803
    Overall accuracy (holdout): 0.666
    The products were written to file!


Classifying the  RGI  data cube...
    Overall accuracy (resubst): 0.8758
    Overall accuracy (holdout): 0.64
    The products were written to file!


Classifying the  VH  data cube...
    Overall accuracy (resubst): 0.9118
    Overall accuracy (holdout): 0.8301
    The products were written to file!


Classifying the  VV  data cube...
    Overall accuracy (resubst): 0.8994
    Overall accuracy (holdout): 0.8127
    The products were written to file!


