# Experiment 1 Data Preparation

import libraries

In [1]:
import pandas as pd
import numpy as np
from copy import copy

Define the functions that convert the raw data into a NumPy array of shape (batch, time length, features) and prepare the labels.

In [36]:
def file_to_data(filename):
    """Read a CSV file and build the feature array and the label column.

    Three vegetation-index time series are extracted (NDVI, SAVI, EVI), each
    from 24 columns named ``<index>_mean1`` .. ``<index>_mean24``, and stacked
    along a new last axis.

    Parameters
    ----------
    filename : path (or buffer) accepted by ``pd.read_csv``.

    Returns
    -------
    data : np.ndarray of shape (rows, 24, 3); last axis is NDVI, SAVI, EVI.
    label : pd.Series holding the last column of the file.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from the file.
    """
    # read file
    raw = pd.read_csv(filename)

    # define columns to be extracted; each index has its own column prefix
    # (previously all three lists used 'ndvi_mean', so NDVI was stacked 3x)
    steps = range(1, 24 + 1)
    ndvi_col = ['ndvi_mean' + str(i) for i in steps]  # ndvi_mean1, ..., ndvi_mean24
    savi_col = ['savi_mean' + str(i) for i in steps]  # savi_mean1, ..., savi_mean24
    evi_col = ['evi_mean' + str(i) for i in steps]    # evi_mean1, ..., evi_mean24

    # extract each feature
    ndvi_timeseries = raw[ndvi_col]
    savi_timeseries = raw[savi_col]
    evi_timeseries = raw[evi_col]

    # dimension stack: three (rows, 24) tables -> one (rows, 24, 3) array
    data = np.dstack([ndvi_timeseries, savi_timeseries, evi_timeseries])

    # get only label (last column of the file)
    label = raw.iloc[:, -1]

    return data, label

def data_to_lv1_x_y(data, labels):
    """Return X unchanged together with level-1 labels.

    Works on a copy of ``labels``: codes above 200 (ratoon cane) become 200,
    codes strictly between 100 and 200 (plant cane) become 100, and every
    other code is left as it is.
    """
    lv1 = copy(labels)

    # the two selections are disjoint, so both can be taken before assigning
    is_ratoon = lv1 > 200
    is_plant = (lv1 > 100) & (lv1 < 200)

    lv1[is_ratoon] = 200
    lv1[is_plant] = 100

    return data, lv1

def data_to_lv2_x_y(data, target):
    """Split samples and labels into plant-cane and ratoon-cane subsets.

    Plant cane is every label strictly between 100 and 200; ratoon cane is
    every label above 200. Rows with any other label are dropped.

    Returns ``(X_plant, y_plant, X_ratoon, y_ratoon)``.
    """
    # build each selection once and reuse it for both X and y
    plant_rows = (target > 100) & (target < 200)
    ratoon_rows = target > 200

    X_plant, y_plant = data[plant_rows], target[plant_rows]
    X_ratoon, y_ratoon = data[ratoon_rows], target[ratoon_rows]

    return X_plant, y_plant, X_ratoon, y_ratoon

def label_encode(y):
    """Drop the leading character of every label and return the rest as ints.

    For the three-digit cane codes this turns e.g. 101 -> 1 and 224 -> 24,
    i.e. the cane plantation label in the range 1 - 24.

    Returns a NumPy array with one entry per input label.
    """
    def _strip_first_digit(code):
        # '224' -> '24' -> 24
        return int(str(code)[1:])

    codes = copy(y)
    stripped = list(map(_strip_first_digit, codes))

    return np.array(stripped)

## Preparation step 1 -> raw data extraction

To go from the file to data and labels, use the <code>file_to_data</code> function.

In [3]:
# read the CSV: `data` is the stacked feature array, `labels` is the file's last column
data, labels = file_to_data('training_data_1718.csv')

Shape of data is (batch, sequence length, features)

In [47]:
data.shape

(12860, 24, 3)

and labels are

In [4]:
labels

0         70
1         70
2         70
3         70
4         70
        ... 
12855    224
12856    224
12857    224
12858    224
12859    224
Name: code, Length: 12860, dtype: int64

## Preparation step 2.1 -> X, y level 1 extraction

use <code>data_to_lv1_x_y</code> function to extract X and y

In [5]:
# X is `data` unchanged; y is a copy of `labels` with plant codes merged to 100 and ratoon codes to 200
X, y = data_to_lv1_x_y(data, labels)

Shape of X is (batch, sequence length, features)

In [6]:
X.shape

(12860, 24, 3)

But y still holds the original class codes:

In [7]:
np.unique(y)

array([ 40,  50,  60,  70,  71,  72,  80,  81,  82,  90, 100, 200])

Encode them 

In [24]:
from sklearn.preprocessing import LabelEncoder

# fit an encoder on the level-1 class codes and replace y with the encoded integer classes
lu_encoder = LabelEncoder()

y = lu_encoder.fit_transform(y)

From these original class codes

In [25]:
# the encoder fitted above is named `lu_encoder`; `encoder` is not defined in this notebook
lu_encoder.classes_

array([ 40,  50,  60,  70,  71,  72,  80,  81,  82,  90, 100, 200])

To be new y

In [26]:
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

**For level 1, run the code above and use this X and y directly.**

## Preparation step 2.2 -> X, y level 2 extraction

Level 2 is independent of level 1 (they are not related). We can run the code below without performing the level 1 extraction.

After raw extraction, just use <code>data_to_lv2_x_y</code> function

In [41]:
# split into plant cane (100 < label < 200) and ratoon cane (label > 200); other labels are dropped
X_plant, y_plant, X_ratoon, y_ratoon = data_to_lv2_x_y(data, labels)

The extracted X arrays for each sugarcane type are

In [53]:
# show the shape of the plant-cane and ratoon-cane feature arrays
print('X plant shape ', X_plant.shape)
print('X ratoon shape ', X_ratoon.shape)

X plant shape  (4489, 24, 3)
X ratoon shape  (7878, 24, 3)


Our y_plant holds the labels that are plant cane

In [42]:
np.unique(y_plant)

array([101, 102, 103, 104, 105, 106, 107, 108, 121, 122, 123, 124])

Our y_ratoon holds the labels that are ratoon cane

In [43]:
np.unique(y_ratoon)

array([201, 202, 203, 204, 205, 206, 207, 208, 224])

However, these labels are not suitable for feeding to a model; encode them as simple zero-based integers.

In [44]:
# label_encode drops the leading digit (101 -> 1, 224 -> 24); subtracting 1 makes the classes zero-based
y_plant = label_encode(y_plant)-1
y_ratoon = label_encode(y_ratoon)-1

After encoding, the new y is a zero-based date index, which is still interpretable.

In [45]:
np.unique(y_plant)

array([ 0,  1,  2,  3,  4,  5,  6,  7, 20, 21, 22, 23])

In [46]:
np.unique(y_ratoon)

array([ 0,  1,  2,  3,  4,  5,  6,  7, 23])

For Level 2, run the code above and use:

- Plant date classification: **X_plant, y_plant**
- Ratoon date classification: **X_ratoon, y_ratoon**