# Level 2 - Multinomial Logistic Regression

## 1. Import

In [1]:
import pandas as pd
import numpy as np
from copy import copy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

## 2. Data Prepare

In [2]:
def file_to_data(filename):
    
    # read file
    raw = pd.read_csv(filename)
    
    # define columns to be extracted
    ndvi_col = ['ndvi_mean'+str(i) for i in range(1, 24+1)] # ndvi_mean1, ..., ndvi_mean24
    savi_col = ['ndvi_mean'+str(i) for i in range(1, 24+1)] # ndvi_mean1, ..., savi_mean24
    evi_col = ['ndvi_mean'+str(i) for i in range(1, 24+1)] # ndvi_mean1, ..., evi_mean24

    # extract each feature
    ndvi_timeseries = raw[ndvi_col]
    savi_timeseries = raw[savi_col]
    evi_timeseries = raw[evi_col]
    
    # dimension stack
    data = np.dstack([ndvi_timeseries, savi_timeseries, evi_timeseries])
    
    # get only label
    label = raw.iloc[:, -1]
    
    return data, label

def data_to_lv1_x_y(data, labels):
    
    # export all X
    # but reclass the plant cane -> 100, ratoon cane -> 200
    target = copy(labels)
    target[target > 200] = 200
    target[(target < 200) & (target > 100)] = 100
    
    return data, target

def data_to_lv2_x_y(data, target):
    
    # export X plant which labels are 100-200
    X_plant = data[(target > 100) & (target <200)]
    
    # export X ratoon which labels are >200
    X_ratoon = data[target > 200]
    
    # export y plant which labels are 100-200
    y_plant = target[(target > 100) & (target < 200)]
    
    # export y ratoon which labels are >200
    y_ratoon = target[target > 200]
    
    return X_plant, y_plant, X_ratoon, y_ratoon

def label_encode(y):
    
    target = copy(y)
    # redefine the cane plantation label to be 1 - 24
    new_y = [int(str(label)[1:]) for label in target]
    
    return np.array(new_y)

In [3]:
data_train, labels_train = file_to_data('training_data_1718.csv')
data_test, labels_test = file_to_data('training_data_1819.csv')

In [4]:
X_plant_train, y_plant_train, X_ratoon_train, y_ratoon_train = data_to_lv2_x_y(data_train, labels_train)
X_plant_test, y_plant_test, X_ratoon_test, y_ratoon_test = data_to_lv2_x_y(data_test, labels_test)

In [5]:
y_plant_train.unique()

array([101, 102, 103, 104, 105, 106, 107, 108, 121, 122, 123, 124],
      dtype=int64)

In [6]:
def makelabel(y):
    y = y.to_numpy()
    for i in range(len(y)):
        if y[i] %100 >0 and y[i] %100<=2:
              y[i] = 0 
        elif y[i] %100>2 and y[i] %100<=4:
              y[i] = 1 
        elif y[i] %100>4 and y[i] %100<=6:
              y[i] = 2 
        elif y[i] %100>6 and y[i] %100<=8:
              y[i] = 3                 
        elif y[i] %100>8 and y[i] %100<=10:
              y[i] = 4 
        elif y[i] %100>20 and y[i] %100<=22:
              y[i] = 10 
        elif y[i] %100>22 and y[i] %100<=24:
              y[i] = 11           
    return y
y_plant_train = makelabel(y_plant_train)
y_ratoon_train = makelabel(y_ratoon_train)

y_plant_test = makelabel(y_plant_test)
y_ratoon_test = makelabel(y_ratoon_test)

In [7]:
print(f'numeric class of y train: {np.unique(y_plant_train)}')
print(f'numeric class of y test: {np.unique(y_plant_test)}')

numeric class of y train: [ 0  1  2  3 10 11]
numeric class of y test: [ 0  1  2  3 10 11]


In [8]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [9]:
X_plant_train = X_plant_train.reshape(X_plant_train.shape[0], -1)
X_ratoon_train = X_ratoon_train.reshape(X_ratoon_train.shape[0], -1)
print(f'X plant train: {X_plant_train.shape}')
print(f'X ratoon train: {X_ratoon_train.shape}')

X plant train: (4489, 72)
X ratoon train: (7878, 72)


In [10]:
# instantiating the random over sampler 
ros = RandomOverSampler()
# resampling X, y
X_plant_train, y_plant_train = ros.fit_resample(X_plant_train, y_plant_train)
X_ratoon_train, y_ratoon_train = ros.fit_resample(X_ratoon_train, y_ratoon_train)

print(f'after oversampled-> X plant shape: {X_plant_train.shape}')
print(f'after oversampled-> X ratoon shape: {X_ratoon_train.shape}')

after oversampled-> X plant shape: (10206, 72)
after oversampled-> X ratoon shape: (17650, 72)


In [11]:
from sklearn.model_selection import train_test_split

X_plant_train, X_plant_valid, y_plant_train, y_plant_valid = train_test_split(
    X_plant_train,
    y_plant_train,
    test_size=0.3,
    stratify=y_plant_train
)

X_ratoon_train, X_ratoon_valid, y_ratoon_train, y_ratoon_valid = train_test_split(
    X_ratoon_train,
    y_ratoon_train,
    test_size=0.3,
    stratify=y_ratoon_train
)

In [12]:
y_ratoon_train.shape

(12355,)

In [13]:
xtrain_mix = np.concatenate((X_plant_train, X_ratoon_train), axis=0)
ytrain_mix = np.concatenate((y_plant_train, y_ratoon_train), axis=0)

xvali_mix = np.concatenate((X_plant_valid, X_ratoon_valid), axis=0)
yvali_mix = np.concatenate((y_plant_valid, y_ratoon_valid), axis=0)

In [14]:
print(xtrain_mix.shape, ytrain_mix.shape)

(19499, 72) (19499,)


## 3. Model

In [15]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(xtrain_mix, ytrain_mix)

LogisticRegression(max_iter=1000, multi_class='multinomial')

## 4 Training Evaluate

In [16]:
yhat = model.predict(xvali_mix)
print("=========Classification report=======")
print("Report: ", classification_report(yvali_mix, yhat))

Report:                precision    recall  f1-score   support

           0       0.56      0.63      0.59      1570
           1       0.49      0.39      0.43      1569
           2       0.48      0.38      0.42      1570
           3       0.67      0.70      0.69      1569
          10       0.91      0.98      0.94       510
          11       0.77      0.94      0.85      1569

    accuracy                           0.63      8357
   macro avg       0.65      0.67      0.65      8357
weighted avg       0.61      0.63      0.62      8357



## 5. Validation Dataset

In [17]:
X_plant_test = X_plant_test.reshape(X_plant_test.shape[0], -1)
X_ratoon_test = X_ratoon_test.reshape(X_ratoon_test.shape[0], -1)
print(f'X plant test: {X_plant_train.shape}')
print(f'X ratoon test: {X_ratoon_train.shape}')

X plant test: (7144, 72)
X ratoon test: (12355, 72)


In [18]:
xtest_mix = np.concatenate((X_plant_test, X_ratoon_test), axis=0)
ytest_mix = np.concatenate((y_plant_test, y_ratoon_test), axis=0)

In [19]:
print(xtest_mix.shape)
print(ytest_mix.shape)

(7452, 72)
(7452,)


## 6. Validation Evaluate

In [20]:
yhat = model.predict(xtest_mix)
print("=========Classification report=======")
print("Report: ", classification_report(ytest_mix, yhat))

Report:                precision    recall  f1-score   support

           0       0.37      0.31      0.34       898
           1       0.50      0.44      0.47      2163
           2       0.54      0.43      0.48      2410
           3       0.62      0.70      0.66      1729
          10       0.06      1.00      0.12         3
          11       0.23      0.81      0.36       249

    accuracy                           0.49      7452
   macro avg       0.39      0.62      0.40      7452
weighted avg       0.52      0.49      0.50      7452

