## Loading required libraries 
- Required Libraries:
    - tensorflow > 2
    - numpy
    - pandas
    - sklearn
- Run all cells ( cell -> Run All )

In [58]:
import numpy as np
from numpy import mean
from numpy import std
from numpy import dstack
from pandas import read_csv
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
import os
import pandas as pd
from tensorflow.keras.layers import Bidirectional
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load the data in required format
- Specify the base directories of train and test data
- The function **load_dataset** takes the parameter **dataset_type** to decide which dataset to load (train or test)
- The function checks the directory for records, assigns the label to the record as the parent directory (1: wheat, 0: rapeseed) 
- Convert the categorical features in the input data into one hot encodings
- Append each sample to collection of records 
- Return the collection

In [59]:
# load the dataset 

train_dir = '/projects/AAI/SS22/crop-classification/TimeSen2Crop-SmallSet/train'
test_dir = '/projects/AAI/SS22/crop-classification/TimeSen2Crop-SmallSet/test'

def load_dataset(dataset_type = None):
    train_x, train_y, test_x = list(), list(), list()
    if(dataset_type in ('train', 'test')):
        load_dir = None
        if(dataset_type == 'train'):
            load_dir = train_dir
            for f in os.listdir(load_dir):
                if(f == 'wheat'):
                    x_dir = os.path.join(load_dir, f)
                    for x_sample in os.listdir(x_dir):
                        X = read_csv(os.path.join(x_dir, x_sample))
                        flag_dummies = pd.get_dummies(X.iloc[:, -1])
                        #print(flag_dummies.shape)
                        #X.iloc[:,-1] = flag_dummies
                        train_x.append(X)
                        train_y.append(1)
                else:
                    x_dir = os.path.join(load_dir, f)
                    for x_sample in os.listdir(x_dir):
                        X = read_csv(os.path.join(x_dir, x_sample))
                        #flag_dummies = pd.get_dummies(X.iloc[:,-1])
                        #X.iloc[:,-1] = flag_dummies
                        train_x.append(X)
                        train_y.append(0)
            return np.array(train_x), np.array(train_y)
        else:
            load_dir = test_dir
            for f in os.listdir(load_dir):
                X = read_csv(os.path.join(load_dir, f))
                test_x.append(X)
            return np.array(test_x)
        
        

## Perform preprocessing , Train and Validation Split
- Cateogical variable, **flag** among various features of input records contains different inputs across samples.
- Collect all the unique values accross inputs, store them into a list and perform one-hot encoding on the list.
- Store the one hot encodings in a dictionary with corresponding flag key. (lookup dictionary)
- Training data is split into training set and validation set.
- The function **preprocess_data** performs following actions:
    - For Each sample in the data provided,
        - the sample is divided into variables containing numerical and categorical features.
        - The numerical part is standardized, The categorical part is converted to one-hot representation by refering to the lookup dictionary.
        - The processed parts are concatenated and the original sample is updated with the processed sample.
- preprocess the training data, validation data

In [60]:
def preprocess_data(data):
    for i in range(len(data)):
        sample = data[i]
        sample_n = sample[:, :-1] #all the variables with continous values
        
        stdizer = StandardScaler()
        stdizer = stdizer.fit(sample_n)
        sample_n = stdizer.transform(sample_n)
        
        sample_c = sample[:, -1] # categorical values
        

        sample_c_oh = np.array([all_flags_dict[j] for j in sample_c])
        data[i] = np.concatenate([sample_n, sample_c_oh], axis = 1)
        
        
    data = np.array(data)
    return data
    

In [61]:
trainX, trainY = load_dataset(dataset_type='train')
all_flags = set([j for i in trainX for j in i[:, -1]])
all_flags = np.array(list(all_flags))
all_flags_oh = np.array(pd.get_dummies(all_flags))
all_flags_dict = {key: value for (key, value) in zip(all_flags, all_flags_oh)}
trainX = list(trainX)
test_x = list(test_x)
trainY = pd.get_dummies(trainY)

train_x, validation_x, train_y, validation_y = train_test_split(trainX, trainY, test_size=0.1, random_state=42)

train_x_pp = preprocess_data(train_x)
validation_x_pp = preprocess_data(validation_x)

train_x_pp.shape


(199, 23, 13)

In [62]:
#train_x[198]

In [63]:
trainX, trainY = load_dataset(dataset_type='train')
test_x = load_dataset(dataset_type='test')

all_flags = set([j for i in trainX for j in i[:, -1]])
all_flags = np.array(list(all_flags))

all_flags_oh = np.array(pd.get_dummies(all_flags))
all_flags_dict = {key: value for (key, value) in zip(all_flags, all_flags_oh)}

trainX = list(trainX)
test_x = list(test_x)
trainY = pd.get_dummies(trainY)

train_x, validation_x, train_y, validation_y = train_test_split(trainX, trainY, test_size=0.1, random_state=42)


def preprocess_data(data):
    for i in range(len(data)):
        sample = data[i]
        sample_n = sample[:, :-1] #all the variables with continous values
        
        stdizer = StandardScaler()
        stdizer = stdizer.fit(sample_n)
        sample_n = stdizer.transform(sample_n)
        
        sample_c = sample[:, -1] # categorical values
        

        sample_c_oh = np.array([all_flags_dict[j] for j in sample_c])
        #data[i] = np.concatenate([sample_n, sample_c_oh], axis = 1)
        data[i] = sample_n
        
        
    data = np.array(data)
    return data
    
train_x_pp = preprocess_data(train_x)
validation_x_pp = preprocess_data(validation_x)


In [64]:
#train_x_pp

In [65]:
n_timesteps, n_features = train_x_pp.shape[1], train_x_pp.shape[2]
n_timesteps, n_features 

(23, 9)

## Predictive Model
- As the data contains **time series sequence**, the problem can be identified as **time series classification**.
- Since **RNNs**, work well for time series applications, the model was built using **LSTMs**.
- The model is essentially a **Bidirectional Stacked LSTM Neural Network**
- Network architecture is as follows:
    - input - (time steps, features)
    - Bidirectional LSTM layer with 70 LSTM units.
    - Dropout layer with p = 0.5.
    - Batch Normalization Layer
    - Bidirectional LSTM layer with 70 LSTM units.
    - Dropout layer with p = 0.5.
    - Batch Normalization Layer
    - Fully connected layer with 100 neurons
    - Relu activation layer.
    - output - Fully connected layer with 2 neurons for classification with Softmax activation function.
- Loss function: Categorical Cross Entropy Loss
- Optimizer : Adam
- Although it is a binary classification problem, 2 units in output layer were used with softmax instead of one unit simply because of the simplicity. 

In [66]:
def lstm_model(lstm_units, train_x, train_y):
    n_timesteps, n_features = train_x.shape[1], train_x.shape[2]
    
    model = Sequential()
    
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True), input_shape=(n_timesteps,n_features)))
    model.add(Dropout(0.5))
    model.add(tf.keras.layers.BatchNormalization())
    
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dropout(0.5))
    model.add(tf.keras.layers.BatchNormalization())
    
    model.add(Dense(100, activation='relu'))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(Dense(2, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model
    



## Training and Evaluation

- The model is trained with following hyper parameters:
    - epochs : 20, batch size: 32
    
- We perform training until we get the desired accuracy and we set a loop threshold as well, to prevent infinite loop.

- The **predict_crop_type** function is used to output the crop type given the dataset and trained model as parameters.

- The model is evaulated based on **accuracy**, **precision**, **recall** and **f1-score** metrics on the validation dataset.



In [67]:
train_x_pp.shape

(199, 23, 9)

In [68]:
lstm_model = lstm_model(70, train_x_pp, train_y)
accuracy = 0
loop_count = 0
loop_threshold = 10
accuracy_threshold = 0.95
while(accuracy < accuracy_threshold and loop_count < loop_threshold):
    lstm_model.fit(train_x_pp, train_y, epochs=20, batch_size=32, verbose=0)
    _, temp_acc = lstm_model.evaluate(validation_x_pp, validation_y, verbose=1)
    accuracy = temp_acc
    loop_count += 1
    

print(f'Final validation accuracy {accuracy}')

Final validation accuracy 0.95652174949646


In [69]:
def predict_crop_type(x, model):
    predictions = list()
    predictions.append(model.predict(x))
    return predictions

crop_type = {0:'rapeseed', 1:'wheat'}

In [70]:
predictions = predict_crop_type(validation_x_pp, lstm_model)
predictions = np.argmax(predictions, axis = 2)[0]
predictions_labels = np.array([str(crop_type[i]) for i in predictions])

validation_y_np = validation_y.to_numpy()
ground_truth = np.argmax(validation_y_np, axis= 1)
ground_truth_labels = np.array([str(crop_type[i]) for i in ground_truth])

precision = precision_score(ground_truth, predictions)
recall = recall_score(predictions, ground_truth)
f1_score = f1_score(predictions, ground_truth)

print(f'precision: {np.round(precision, decimals=2)}, recall: {np.round(recall, decimals=2)} f1 score: {np.round(f1_score, decimals=2)}')

precision: 0.92, recall: 0.92 f1 score: 0.96


## Comparison of predicted labels and ground truth labels
- The prediction is performed on validation data
- The table below shows the predictions compared to the ground truth.

In [71]:
comparison_matrix = np.stack([predictions_labels, ground_truth_labels], axis = 1)
comparison_matrix = pd.DataFrame(comparison_matrix, columns=['predcitions', 'ground truth'])
comparison_matrix



Unnamed: 0,predcitions,ground truth
0,rapeseed,rapeseed
1,rapeseed,rapeseed
2,wheat,wheat
3,rapeseed,rapeseed
4,rapeseed,rapeseed
5,rapeseed,rapeseed
6,wheat,wheat
7,wheat,wheat
8,wheat,wheat
9,wheat,wheat


## Prediction on the test data set
- preprocessing is performed on the test data
- Model predicts the outcomes
- The table shows the predictions for the test data

In [72]:
test_x_pp = preprocess_data(test_x)
test_x_pp

array([[[ 0.84651347,  0.82190026,  0.72522181, ...,  0.96373357,
          1.57808217,  1.70592206],
        [-0.45729739, -0.46955542, -0.51922234, ..., -0.12977125,
         -0.40608103, -0.26084409],
        [-0.49535224, -0.49767129, -0.50511712, ..., -0.61390676,
         -0.16394866, -0.25792893],
        ...,
        [-0.35697098, -0.31819831, -0.24894818, ..., -0.51769055,
          0.76160852,  0.83234361],
        [-0.34313286, -0.29804861, -0.22164776, ..., -0.5426355 ,
          0.81201833,  1.01211226],
        [-0.34053821, -0.31304374, -0.24849317, ..., -0.5920163 ,
          0.78226828,  1.03349015]],

       [[-0.46032361, -0.47623552, -0.51781801, ..., -0.67099601,
         -0.38695342, -0.41191963],
        [-0.46922351, -0.50332209, -0.52550737, ..., -0.73967127,
         -0.39045899, -0.41101785],
        [ 2.71550735,  2.53167902,  2.31855251, ...,  1.84817951,
          2.10900895,  2.23662246],
        ...,
        [-0.34778286, -0.30164137, -0.2229478 , ..., -

In [73]:
predictions_test = predict_crop_type(test_x_pp, lstm_model)
print(predictions_test)
predictions_test = np.argmax(predictions_test, axis = 2)[0]
predictions_test_labels = np.array([str(crop_type[i]) for i in predictions_test])

file_names_test = np.array(os.listdir(test_dir))
prediction_table = np.stack([file_names_test, predictions_test_labels], axis = 1)
prediction_table_df = pd.DataFrame(prediction_table, columns=['Sample', 'Predicted Label'])
prediction_table_df

[array([[1.0508592e-03, 9.9894911e-01],
       [1.0477438e-02, 9.8952252e-01],
       [1.6227259e-03, 9.9837720e-01],
       [5.6860312e-03, 9.9431396e-01],
       [3.3585823e-03, 9.9664140e-01],
       [4.6494088e-04, 9.9953508e-01],
       [5.5279462e-03, 9.9447203e-01],
       [2.0823538e-02, 9.7917652e-01],
       [4.2195231e-01, 5.7804769e-01],
       [1.0973314e-02, 9.8902673e-01]], dtype=float32)]


Unnamed: 0,Sample,Predicted Label
0,10.csv,wheat
1,8.csv,wheat
2,5.csv,wheat
3,9.csv,wheat
4,6.csv,wheat
5,3.csv,wheat
6,1.csv,wheat
7,2.csv,wheat
8,7.csv,wheat
9,4.csv,wheat


## Since almost all of the samples are predicted as ***wheat***, we can assume that the test data belongs to ***wheat*** class.