In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import numpy as np
import pandas as pd


# make the project's custom helper modules (../model_dev_functions) importable
import sys
sys.path.append('../model_dev_functions')

from model_development import base_model_pred, df_to_arr, model_scores
from feature_engineering import add_NDVI, add_vhvv, get_high_corr_cols
from data_prep import (batch_aggregate_pickle, get_aggregation_from_window,
                       read_multiple_pickles)
from submission_format import prediction_to_submission_df
from temp_acc import temp_accuracy   # evaluate accuracy based on the prediction we used to get 1.0 accuracy score

## Iteration 2
test accuracy = 0.86
- all February, August, December data + NDVI
- window size of 5*5, aggregated by mean
- remove highly correlated features
- stack models

Our model development exploration showed that different models lead to different results, so stacking them might improve performance by making the result more robust. However, stacking reduced our performance instead.

#### Data preparation

In [None]:
# Sentinel-2 training data: February, August and December acquisitions.

# Read every band-data pickle found in the feb_aug_dec-S2 folder.
# NOTE(review): the column list is presumably the set of non-band columns that
# read_multiple_pickles handles separately -- confirm against data_prep.
fad_s2_paths, fad_s2_dfs_list = read_multiple_pickles(
    '../11-datasets/feb_aug_dec-S2',
    ['latitude', 'longitude', 'geometry', 'grouping'],
)

# Aggregate the 5*5 window features of each loaded dataframe by their mean;
# 'Class of Land' is the label column that is used as the training target later on.
fad_s2_df, fad_s2_df_list = batch_aggregate_pickle(
    fad_s2_dfs_list,
    fad_s2_paths,
    '_w5',
    'Class of Land',
    agg_method=lambda x: x.mean(),
)

In [None]:
# Sentinel-2 data for the coordinates listed in the submission template (prediction set).
# NOTE(review): filter_condition='-03-' presumably excludes the March acquisitions so that only
# the February/August/December dates used for training remain -- confirm against read_multiple_pickles.
sub2_fad_paths, sub2_fad_df_list = read_multiple_pickles(
    '../11-datasets/SUBMISSION-template_data',
    ['id', 'latitude', 'longitude', 'geometry', 'target'],
    filter_condition='-03-',
)

# Aggregate the 5*5 windows with the same explicit mean that the training data is aggregated
# with, so the training and prediction features are built identically and do not depend on
# whatever default agg_method batch_aggregate_pickle happens to use.
# There is no label column for the template data, hence None.
template_data_s2_df, template_s2_df_list = batch_aggregate_pickle(
    sub2_fad_df_list,
    sub2_fad_paths,
    '_w5',
    None,
    agg_method=lambda x: x.mean(),
)

#### Data preprocessing 
to add NDVI and remove highly correlated features

In [None]:
def train_2_processing(list_s2_to_add_NDVI, y_in, corr_thresh):
    '''
    Build the training features and labels for iteration 2.

    NDVI features are added through add_NDVI, then every column reported by
    get_high_corr_cols for the given threshold is removed.

    Parameters
    ----------
    list_s2_to_add_NDVI : object handed straight to add_NDVI.
    y_in : labels; a copy is returned so the caller's object is not shared.
    corr_thresh : threshold forwarded to get_high_corr_cols.

    Returns
    -------
    tuple of (reduced feature frame, copied labels, dropped column names).
    The dropped column names are returned so the prediction data can be
    reduced to the same feature set.
    '''
    features = add_NDVI(list_s2_to_add_NDVI)
    dropped_cols = get_high_corr_cols(features, corr_thresh)
    labels = y_in.copy()

    return features.drop(columns=dropped_cols), labels, dropped_cols

# Training matrix and labels; corr_cols2 keeps the dropped column names so that the
# prediction set can be reduced to exactly the same features.
X_train2, y_train2, corr_cols2 = train_2_processing(
    list_s2_to_add_NDVI=fad_s2_df_list,
    y_in=fad_s2_df['Class of Land'],
    corr_thresh=0.95,
)


number of high_corr_cols: 70


In [None]:
def pred_2_processing(list_s2_to_add_NDVI, corr_cols):
    '''
    Build the prediction features for iteration 2.

    NDVI features are added through add_NDVI and the columns named in
    corr_cols (the ones removed from the training features) are dropped.

    Parameters
    ----------
    list_s2_to_add_NDVI : object handed straight to add_NDVI.
    corr_cols : column names to remove.

    Returns
    -------
    The feature frame without the corr_cols columns.
    '''
    return add_NDVI(list_s2_to_add_NDVI).drop(columns=corr_cols)

# Prediction features, reduced to the same columns as the training features
X_pred2 = pred_2_processing(
    list_s2_to_add_NDVI=template_s2_df_list,
    corr_cols=corr_cols2,
)
X_pred2

Unnamed: 0,AOT_w5_0209,B02_w5_0209,B08_w5_0209,WVP_w5_0209,visual_w5_0209,B11_w5_0209,SCL_w5_0209,B01_w5_0209,B09_w5_0209,NDVI_0,...,SCL_w5_1216,B09_w5_1216,NDVI_6,B02_w5_1226,B08_w5_1226,WVP_w5_1226,visual_w5_1226,SCL_w5_1226,B01_w5_1226,NDVI_7
0,204.0,586.320007,2676.879883,4240.600098,51.160000,1421.520020,4.00,787.280029,3860.560059,-0.686394,...,9.00,12270.919922,-0.069629,789.880005,1139.680054,4347.000000,89.760002,2.28,727.640015,-0.129313
1,204.0,379.640015,5017.359863,4734.040039,30.160000,1784.520020,4.00,373.440002,4630.279785,-0.890205,...,8.80,13114.280273,0.015639,895.440002,1312.160034,4347.000000,98.800003,7.00,871.000000,-0.151119
2,204.0,858.719971,4147.040039,4255.000000,75.639999,2619.479980,4.40,1945.239990,4604.359863,-0.697297,...,9.00,15611.559570,-0.073813,896.840027,1298.880005,4347.000000,83.680000,8.92,949.880005,-0.226979
3,204.0,1200.560059,688.080017,4377.000000,112.040001,689.159973,6.08,2009.199951,2318.120117,0.229456,...,9.00,16110.000000,-0.037297,1811.680054,1415.640015,4347.000000,222.440002,8.64,2333.879883,0.226469
4,204.0,286.239990,4420.959961,4496.520020,34.639999,1483.280029,4.00,208.199997,4497.319824,-0.859015,...,7.20,4733.200195,-0.306312,580.119995,4616.479980,4508.640137,45.119999,4.00,549.080017,-0.826212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,204.0,705.000000,258.440002,4377.000000,90.760002,147.440002,6.00,650.880005,117.680000,0.549442,...,10.00,1142.959961,0.487675,5862.720215,5614.560059,4347.000000,255.000000,9.00,4649.240234,-0.010744
246,204.0,2149.760010,1895.760010,4377.000000,188.600006,1425.160034,7.36,1920.959961,2152.199951,-0.011612,...,9.00,16110.000000,-0.039739,1522.479980,1028.160034,4347.000000,181.600006,8.76,2415.040039,0.269378
247,204.0,2886.239990,2818.959961,4377.000000,229.559998,1935.599976,7.88,1835.000000,2008.359985,-0.064964,...,9.00,16110.000000,-0.029654,1568.079956,1149.000000,4347.000000,160.919998,9.12,1867.319946,0.158007
248,204.0,683.359985,2796.239990,4598.720215,61.919998,1583.920044,4.00,696.799988,2879.560059,-0.644441,...,8.44,10499.919922,-0.003553,990.520020,1584.319946,4347.000000,95.360001,10.00,978.039978,-0.258276


#### Model development and prediction

1st layer

In [None]:
# Base (1st-layer) estimators. They are only instantiated here; fitting happens
# inside the helper functions that receive this list.
models = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression(random_state=42),
    # disabled candidate (xgb is not imported in this notebook):
    # xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs = -1),
    lgb.LGBMClassifier(n_estimators=100, random_state=42),
]


def base_model_pred_for_submission(X_train, y_train, X_pred, models):
    '''
    Fit every model on the training data and stack their predictions for X_pred.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to each model's fit().
    X_pred : samples to predict; passed to each model's predict().
    models : iterable of estimators exposing fit(X, y) and predict(X).
        NOTE: the estimators are fitted in place, so the caller's objects are
        modified by this call.

    Returns
    -------
    2D array of shape (number_of_samples_in_X_pred, number_of_models): each row is
    a sample and each column holds one model's predictions (assuming predict()
    returns one value per sample).
    '''
    per_model_preds = []  # one prediction array per model, in the order of `models`
    for model in models:
        model.fit(X_train, y_train)
        per_model_preds.append(model.predict(X_pred))

    # Transpose so that rows are samples and columns are models
    return np.array(per_model_preds).T

# 1st-layer output for the submission samples: one column of predictions per base model
X_pred2_2 = base_model_pred_for_submission(
    X_train=X_train2,
    y_train=y_train2,
    X_pred=X_pred2,
    models=models,
)
X_pred2_2.shape

(250, 3)

2nd layer

In [None]:
from sklearn.model_selection import cross_val_predict
def base_model_pred(X_train, y_train, models, cv=5):
    '''
    Build the training set for the 2nd-layer model from cross-validated
    predictions of each base model on X_train.

    NOTE: this definition shadows the base_model_pred imported from
    model_development at the top of the notebook.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    models : iterable of estimators accepted by cross_val_predict.
    cv : cross-validation setting forwarded to cross_val_predict (default 5,
        the value that was previously hard-coded).

    Returns
    -------
    X : 2D array of shape (number_of_training_samples, number_of_models); each row
        is a sample and each column holds one model's cross-validated predictions.
    y : y_train, returned unchanged as the target for the 2nd-layer model.
    '''
    # One array of cross-validated predictions per model, in the order of `models`
    cv_preds = [cross_val_predict(model, X_train, y_train, cv=cv) for model in models]

    # Transpose so that rows are samples and columns are models
    return np.array(cv_preds).T, y_train

# Train the 2nd-layer (meta) model on the cross-validated predictions of the base models
X_train2_2, y_train2_2 = base_model_pred(X_train2, y_train2, models)

# Use a dedicated estimator for the meta model instead of re-fitting models[0]:
# re-fitting would overwrite the fitted base RandomForest and leave `models` in a state
# that depends on which cells were run. Same hyperparameters and seed as models[0].
meta_model = RandomForestClassifier(random_state=42, n_estimators=100)
meta_model.fit(X_train2_2, y_train2_2)
meta_rf = meta_model.predict(X_pred2_2)

# final prediction
pred2 = meta_rf

In [None]:
# Evaluate once and reuse the result (index 0 is reported as accuracy, index 1 as F1-score)
scores2 = temp_accuracy(pred2)
print(f'accuracy={scores2[0]}, F1-score={scores2[1]}')

accuracy=0.904, F1-score=0.8636363636363636


In [None]:
# Build the submission table from the template CSV and the final predictions
submission_df2 = prediction_to_submission_df(
    '../submission/challenge_1_submission_template.csv',
    pred2,
)
submission_df2

Unnamed: 0,id,target
0,"(10.18019073690894, 105.32022315786804)",Rice
1,"(10.561107033461816, 105.12772097986661)",Rice
2,"(10.623790611954897, 105.13771401411867)",Non Rice
3,"(10.583364246115156, 105.23946127195805)",Non Rice
4,"(10.20744446668854, 105.26844107128906)",Rice
...,...,...
245,"(10.308283266873062, 105.50872812216863)",Non Rice
246,"(10.582910017285496, 105.23991550078767)",Non Rice
247,"(10.581547330796518, 105.23991550078767)",Non Rice
248,"(10.629241357910818, 105.15315779432643)",Rice
