In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import numpy as np
import pandas as pd


# download custom functions
import sys
sys.path.append('../model_dev_functions')

from model_development import base_model_pred, df_to_arr, model_scores
from feature_engineering import add_NDVI, add_vhvv, get_high_corr_cols
from data_prep import (batch_aggregate_pickle, get_aggregation_from_window,
                       read_multiple_pickles)
from submission_format import prediction_to_submission_df
from temp_acc import temp_accuracy   # evaluate accuracy based on the prediction we used to get 1.0 accuracy score

## iteration 3
test score = 0.94 / 0.95

We added Sentinel-1 data to the model-stacking approach used in iteration 2, which improved model performance significantly. The F1-score is 0.94 or 0.95 depending on the normalization method used (MinMaxScaler and RobustScaler perform better). For readability, this notebook only illustrates the experiment with MinMaxScaler.

- Sentinel-2: Feb, Aug, Dec raw data + NDVI 
- Sentinel-1: Sentinel-1 raw data + VH / VV
- drop highly correlated features
- normalization (StandardScaler, MinMaxScaler, RobustScaler)
- stack models



#### Data Preparation

training data

In [None]:
# --- Sentinel-2 training data (february, august and december) ---
# Read the band-data pickles found under feb_aug_dec-S2. The column list is
# forwarded to read_multiple_pickles as-is (presumably the non-feature columns
# it handles specially -- confirm in data_prep).
fad_s2_paths, fad_s2_dfs_list = read_multiple_pickles(
    '../11-datasets/feb_aug_dec-S2',
    ['latitude', 'longitude', 'geometry', 'grouping'],
)
# Aggregate the 5*5-window ('_w5') features of every loaded dataframe by mean.
fad_s2_df, fad_s2_df_list = batch_aggregate_pickle(
    fad_s2_dfs_list,
    fad_s2_paths,
    '_w5',
    'Class of Land',
    agg_method=lambda window: window.mean(),
)

# --- Sentinel-1 training data ---
# NOTE(review): the original comment described these files as february, august
# and december data, but filter_condition='-03-' looks like it selects March
# files -- confirm which months are actually loaded here.
fad_s1_paths, fad_s1_dfs_list = read_multiple_pickles(
    '../11-datasets/sentinel1a_data',
    ['latitude', 'longitude', 'geometry', 'grouping'],
    filter_condition='-03-',
)
# Same mean aggregation over the 5*5-window features as for Sentinel-2.
fad_s1_df, fad_s1_df_list = batch_aggregate_pickle(
    fad_s1_dfs_list,
    fad_s1_paths,
    '_w5',
    'Class of Land',
    agg_method=lambda window: window.mean(),
)


template data

In [None]:
# --- Sentinel-2 data for the coordinates of the submission template ---
# NOTE(review): the original comment says "february, march and december", while
# filter_condition='-03-' looks like it keeps only March files -- confirm that
# the loaded months match the ones used for training.
sub2_fmad_paths, sub2_fmad_df_list = read_multiple_pickles(
    '../11-datasets/SUBMISSION-template_data',
    ['id', 'latitude', 'longitude', 'geometry', 'target'],
    filter_condition='-03-',
)
# No label column is passed (None), and agg_method is left at its default here,
# whereas the training cell passes an explicit mean -- TODO confirm the default
# of batch_aggregate_pickle is the same aggregation.
template_data_s2_df, template_s2_df_list = batch_aggregate_pickle(
    sub2_fmad_df_list,
    sub2_fmad_paths,
    '_w5',
    None,
)

# --- Sentinel-1 data for the coordinates of the submission template ---
sub1_fad_paths, sub1_fad_df_list = read_multiple_pickles(
    '../11-datasets/SUBMISSION-template_data/sentinel1a_template_data',
    ['id', 'latitude', 'longitude', 'geometry', 'target'],
    filter_condition='-03-',
)
template_data_s1_df, template_s1_df_list = batch_aggregate_pickle(
    sub1_fad_df_list,
    sub1_fad_paths,
    '_w5',
    None,
)

#### model development

In [None]:
def train_3_processing(list_s2_to_add_NDVI, s1_X_in, y_in, corr_thresh):
    """Build the iteration-3 training features and labels.

    The Sentinel-2 input is passed through ``add_NDVI`` (see
    feature_engineering for what it adds), the result is joined column-wise
    with the Sentinel-1 features, and the columns reported by
    ``get_high_corr_cols`` for ``corr_thresh`` are removed.

    Parameters
    ----------
    list_s2_to_add_NDVI
        Sentinel-2 input handed unchanged to ``add_NDVI``.
    s1_X_in : pandas.DataFrame
        Sentinel-1 features, concatenated along the column axis.
    y_in
        Labels; must provide ``.copy()``.
    corr_thresh
        Threshold forwarded to ``get_high_corr_cols``.

    Returns
    -------
    tuple
        ``(features, labels_copy, dropped_columns)`` where ``dropped_columns``
        is whatever ``get_high_corr_cols`` returned, so the same columns can
        be removed from the prediction set.
    """
    s2_features = add_NDVI(list_s2_to_add_NDVI)
    combined = pd.concat([s2_features, s1_X_in], axis=1)

    # Remember which columns were removed so prediction data can mirror it.
    high_corr_cols = get_high_corr_cols(combined, corr_thresh)
    features = combined.drop(columns=high_corr_cols)

    # Copy the labels so the caller's object is not shared with the result.
    labels = y_in.copy()
    return features, labels, high_corr_cols

# Build the unscaled training matrix; labels come from the aggregated
# Sentinel-1 frame and columns flagged at the 0.95 threshold are dropped.
X_train3_prescaled, y_train3, corr_cols3 = train_3_processing(
    fad_s2_df_list,
    add_vhvv(fad_s1_df_list),
    fad_s1_df['Class of Land'],
    corr_thresh=0.95,
)

number of high_corr_cols: 70


In [None]:
def pred_3_processing(list_s2_to_add_NDVI, sub1_X_in, corr_cols):
    """Build the iteration-3 prediction features.

    The Sentinel-2 input is passed through ``add_NDVI``, joined column-wise
    with the Sentinel-1 features, and the columns listed in ``corr_cols``
    are removed.

    Parameters
    ----------
    list_s2_to_add_NDVI
        Sentinel-2 input handed unchanged to ``add_NDVI``.
    sub1_X_in : pandas.DataFrame
        Sentinel-1 features, concatenated along the column axis.
    corr_cols
        Column labels to drop; every label must exist in the combined frame,
        otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The combined features without the ``corr_cols`` columns.
    """
    s2_features = add_NDVI(list_s2_to_add_NDVI)
    combined = pd.concat([s2_features, sub1_X_in], axis=1)
    return combined.drop(columns=corr_cols)

# Build the unscaled prediction matrix, dropping the same columns that were
# removed from the training features (corr_cols3).
X_pred3_prescaled = pred_3_processing(
    template_s2_df_list,
    add_vhvv(template_s1_df_list),
    corr_cols3,
)

In [None]:
# Number of feature columns left in the prediction matrix.
X_pred3_prescaled.shape[1]

111

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the prediction features.
scale = MinMaxScaler().fit(X_train3_prescaled)

X_train3 = scale.transform(X_train3_prescaled)
X_pred3 = scale.transform(X_pred3_prescaled)
# y_train3 is used as-is; the labels are not scaled.

#### model development and prediction

In [None]:
# Base models for the stacking step; each one is seeded with 42.
models = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression(random_state=42),
    # Disabled: xgb is not imported in this notebook.
    # xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    lgb.LGBMClassifier(n_estimators=100, random_state=42),
]


def base_model_pred_for_submission(X_train, y_train, X_pred, models):
    """Fit every base model on the training data and stack its predictions.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to each ``model.fit``.
    y_train : array-like
        Training labels, passed unchanged to each ``model.fit``.
    X_pred : array-like
        Samples to predict, passed unchanged to each ``model.predict``.
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. Each estimator
        is fitted in place, so the caller's objects are modified.

    Returns
    -------
    numpy.ndarray
        The per-model predictions stacked and transposed. With 1-D
        predictions, each row is a sample and each column is one model's
        prediction.

    Raises
    ------
    ValueError
        If ``models`` is empty, since there would be nothing to stack.
    """
    models = list(models)
    if not models:
        raise ValueError("models must contain at least one estimator")

    per_model_predictions = []
    for model in models:
        model.fit(X_train, y_train)
        per_model_predictions.append(model.predict(X_pred))

    # Transpose so that models end up along the column axis.
    return np.array(per_model_predictions).T

# Meta-features for the submission points: one column of predictions per base
# model. Note that this call fits the estimators in `models` in place.
X_pred3_2 = base_model_pred_for_submission(
    X_train=X_train3,
    y_train=y_train3,
    X_pred=X_pred3,
    models=models,
)
X_pred3_2.shape

(250, 3)

In [None]:
# Training set for the meta-learner, produced by base_model_pred from the base
# models (see model_development for how it is generated).
X_train3_2, y_train3_2 = base_model_pred(X_train3, y_train3, models)

# Use a dedicated estimator as the meta-learner instead of refitting
# models[0]: refitting would overwrite one of the base models with a model
# trained on the stacked predictions. Hyperparameters and seed match the base
# random forest.
meta_model = RandomForestClassifier(random_state=42, n_estimators=100)
meta_model.fit(X_train3_2, y_train3_2)
meta_rf = meta_model.predict(X_pred3_2)

# Only the random-forest meta-learner is used in this iteration; the
# logistic-regression variant from iteration 2 is not applied here.
pred3 = meta_rf

In [None]:
# Evaluate once and reuse the result: element 0 is reported as accuracy and
# element 1 as F1-score.
temp_scores = temp_accuracy(pred3)
print(f'accuracy={temp_scores[0]}, F1-score={temp_scores[1]}')

accuracy=0.964, F1-score=0.9528795811518325


In [None]:
# Build the submission dataframe from the template CSV and the final
# predictions, then display it.
submission_df3 = prediction_to_submission_df(
    '../submission/challenge_1_submission_template.csv',
    pred3,
)
submission_df3

Unnamed: 0,id,target
0,"(10.18019073690894, 105.32022315786804)",Rice
1,"(10.561107033461816, 105.12772097986661)",Rice
2,"(10.623790611954897, 105.13771401411867)",Rice
3,"(10.583364246115156, 105.23946127195805)",Non Rice
4,"(10.20744446668854, 105.26844107128906)",Rice
...,...,...
245,"(10.308283266873062, 105.50872812216863)",Non Rice
246,"(10.582910017285496, 105.23991550078767)",Non Rice
247,"(10.581547330796518, 105.23991550078767)",Non Rice
248,"(10.629241357910818, 105.15315779432643)",Rice


In [None]:
# Export is currently disabled; uncomment the line below to write
# submission_df3 to "L1_Submission_3.csv" without the index column.
# submission_df3.to_csv("L1_Submission_3.csv", index=False)