# Level 1 - Multinomial Logistic Regression

## 1. Import

In [1]:
import pandas as pd
import numpy as np
from copy import copy
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# from google.colab import drive
from IPython.display import clear_output

## 2. Data Prepare

In [2]:
def file_to_data(filename):
    
    # read file
    raw = pd.read_csv(filename)
    
    # define columns to be extracted
    ndvi_col = ['ndvi_mean'+str(i) for i in range(1, 24+1)] # ndvi_mean1, ..., ndvi_mean24
    savi_col = ['ndvi_mean'+str(i) for i in range(1, 24+1)] # ndvi_mean1, ..., savi_mean24
    evi_col = ['ndvi_mean'+str(i) for i in range(1, 24+1)] # ndvi_mean1, ..., evi_mean24

    # extract each feature
    ndvi_timeseries = raw[ndvi_col]
    savi_timeseries = raw[savi_col]
    evi_timeseries = raw[evi_col]
    
    # dimension stack
    data = np.dstack([ndvi_timeseries, savi_timeseries, evi_timeseries])
    
    # get only label
    label = raw.iloc[:, -1]
    
    return data, label

def data_to_lv1_x_y(data, labels):
    
    # export all X
    # but reclass the plant cane -> 100, ratoon cane -> 200
    target = copy(labels)
    target[target > 200] = 200
    target[(target < 200) & (target > 100)] = 100
    
    return data, target

def data_to_lv2_x_y(data, target):
    
    # export X plant which labels are 100-200
    X_plant = data[(target > 100) & (target <200)]
    
    # export X ratoon which labels are >200
    X_ratoon = data[target > 200]
    
    # export y plant which labels are 100-200
    y_plant = target[(target > 100) & (target < 200)]
    
    # export y ratoon which labels are >200
    y_ratoon = target[target > 200]
    
    return X_plant, y_plant, X_ratoon, y_ratoon

def label_encode(y):
    
    target = copy(y)
    # redefine the cane plantation label to be 1 - 24
    new_y = [int(str(label)[1:]) for label in target]
    
    return np.array(new_y)

In [3]:
data_train, labels_train = file_to_data('training_data_1718.csv')
data_test, labels_test = file_to_data('training_data_1819.csv')

In [4]:
X_train, y_train = data_to_lv1_x_y(data_train, labels_train)
X_test, y_test = data_to_lv1_x_y(data_test, labels_test)

print(np.unique(y_train))

[ 40  50  60  70  71  72  80  81  82  90 100 200]


In [5]:
from sklearn.preprocessing import LabelEncoder

lu_encoder = LabelEncoder()

y_train = lu_encoder.fit_transform(y_train)
y_test = lu_encoder.transform(y_test)

In [6]:
print(f'numeric class of y train: {np.unique(y_train)}')
print(f'numeric class of y test: {np.unique(y_test)}')

numeric class of y train: [ 0  1  2  3  4  5  6  7  8  9 10 11]
numeric class of y test: [ 0  1  2  3  4  5  6  7  8  9 10 11]


In [7]:
y_train[y_train>9] = 10
y_test[y_test > 9] = 10

In [8]:
print(f'numeric class of y train: {np.unique(y_train)}')
print(f'numeric class of y test: {np.unique(y_test)}')

numeric class of y train: [ 0  1  2  3  4  5  6  7  8  9 10]
numeric class of y test: [ 0  1  2  3  4  5  6  7  8  9 10]


In [9]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [10]:
shape = X_train.shape
print(Counter(y_train))

Counter({10: 12367, 0: 118, 1: 92, 2: 85, 4: 30, 5: 30, 7: 30, 9: 30, 8: 30, 3: 26, 6: 22})


In [11]:
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
X_train.shape

(12860, 72)

In [12]:
# instantiating the random over sampler 
# ros = RandomOverSampler()
# # resampling X, y
# X_train, y_train = ros.fit_resample(X_train, y_train)

# print(f'after oversampled-> X shape: {X_train.shape}')

In [13]:
# add intercept to our X
intercept = np.ones((X_train.shape[0], 1))
X_train   = np.concatenate((intercept, X_train), axis=1)  #add intercept
intercept = np.ones((X_test.shape[0], 1))
X_test    = np.concatenate((intercept, X_test), axis=1)  #add intercept

## 3. Model

In [14]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

## 4. Training Evaluate

In [17]:
yhat = model.predict(X_train)
print("=========Classification report=======")
print("Report: ", classification_report(y_train, yhat, zero_division=True))

Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       118
           1       0.95      1.00      0.97        92
           2       0.98      0.96      0.97        85
           3       1.00      0.08      0.14        26
           4       0.86      0.20      0.32        30
           5       1.00      0.00      0.00        30
           6       1.00      0.95      0.98        22
           7       0.96      0.83      0.89        30
           8       0.93      0.87      0.90        30
           9       0.78      0.23      0.36        30
          10       0.99      1.00      1.00     12367

    accuracy                           0.99     12860
   macro avg       0.95      0.65      0.68     12860
weighted avg       0.99      0.99      0.99     12860



## 5. Testing Evaluate

In [16]:
yhat = model.predict(X_test)
print("=========Classification report=======")
print("Report: ", classification_report(y_test, yhat))

Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       118
           1       0.93      1.00      0.96        92
           2       0.98      0.96      0.97        85
           3       0.67      0.08      0.14        26
           4       1.00      0.20      0.33        30
           5       0.00      0.00      0.00        30
           6       1.00      0.95      0.98        22
           7       0.96      0.83      0.89        30
           8       0.93      0.87      0.90        30
           9       0.88      0.23      0.37        30
          10       0.99      1.00      0.99      7452

    accuracy                           0.98      7945
   macro avg       0.85      0.65      0.68      7945
weighted avg       0.98      0.98      0.98      7945

