In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd


# make the project's custom helper modules importable
import sys
sys.path.append('../model_dev_functions')

from model_development import base_model_pred, df_to_arr, model_scores
from feature_engineering import add_NDVI, add_vhvv, get_high_corr_cols
from data_prep import (batch_aggregate_pickle, get_aggregation_from_window,
                       read_multiple_pickles)
from submission_format import prediction_to_submission_df
from temp_acc import temp_accuracy


## Iteration 1
test accuracy = 0.89 <br>
- Random Forest
- all February, August, December data
- window size of 5*5, aggregated by mean

 We think that the cropping season is when the rice crop differs the most from the other land cover. We therefore use three months (February, August and December), chosen based on the variance analysis of NDVI for the rice crop. As a baseline, this model only includes Sentinel-2 data. 

<img src='../image_analysis/NDVI variance analysis.png' alt="NDVI variance analysis" />

The variance of NDVI between rice and the mean of non-rice is calculated for each date. From this analysis, we found 3 peaks in the variance and chose the 3 months corresponding to them, i.e. February, August and December. (See LEVEL-1 data preparation for code.)

#### Data preparation

In [None]:
# Sentinel-2 training data for February, August and December

# load every band-data pickle available for the February, August and December dates
# (the list is passed to read_multiple_pickles as its second argument;
#  presumably these are the columns to discard -- confirm in data_prep)
fad_s2_paths, fad_s2_dfs_list = read_multiple_pickles(
    '../11-datasets/feb_aug_dec-S2',
    ['latitude', 'longitude', 'geometry', 'grouping'],
)

# aggregate the features of every loaded dataframe over 5*5 windows ('_w5'), taking the mean
fad_s2_df, fad_s2_df_list = batch_aggregate_pickle(
    fad_s2_dfs_list,
    fad_s2_paths,
    '_w5',
    'Class of Land',
    agg_method=lambda x: x.mean(),
)

# separate the label column from the feature columns
y_train1 = fad_s2_df['Class of Land']
X_train1 = fad_s2_df.drop(columns='Class of Land')

In [None]:
# Sentinel-2 data for the coordinates listed in the submission template
# NOTE(review): filter_condition='-03-' looks like a March date pattern, whereas the training
# data is read from the feb_aug_dec-S2 folder -- confirm which files this selects.
sub2_fad_paths, sub2_fad_df_list = read_multiple_pickles(
    '../11-datasets/SUBMISSION-template_data',
    ['id', 'latitude', 'longitude', 'geometry', 'target'],
    filter_condition='-03-',
)

# same '_w5' aggregation call as for the training data, but passing None where the
# training call passes the label column
# NOTE(review): agg_method is left at its default here while the training call passes an
# explicit mean -- confirm the default matches.
template_data_s2_df, template_s2_df_list = batch_aggregate_pickle(
    sub2_fad_df_list, sub2_fad_paths, '_w5', None
)

# plain array of feature values, used as the prediction input
X_pred1 = template_data_s2_df.to_numpy()

#### Model development and prediction

In [None]:
# train a Random Forest on the prepared training data and predict the submission-template points
model1 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# reuse X_train1 / y_train1 built in the data-preparation cell instead of
# re-deriving the same features and labels from fad_s2_df a second time
model1.fit(X_train1, y_train1)

# X_pred1 is a bare array without column names, so its columns have to be in the
# same order as the columns of X_train1
pred1 = model1.predict(X_pred1)

In [None]:
# evaluate the prediction (differs slightly from the submitted result because of a different random state)
# call temp_accuracy once and reuse the result: index 0 is printed as accuracy, index 1 as F1-score
scores1 = temp_accuracy(pred1)
print(f'accuracy={scores1[0]}, F1-score={scores1[1]}')

accuracy=0.924, F1-score=0.9090909090909091


In [None]:
# build the submission dataframe from the challenge template CSV and the predictions
submission_template_csv = '../submission/challenge_1_submission_template.csv'
submission_df1 = prediction_to_submission_df(submission_template_csv, pred1)

# uncomment to write the submission file to disk
# submission_df1.to_csv("L1_Submission_1.csv", index=False)

# show the resulting dataframe as the cell output
submission_df1

Unnamed: 0,id,target
0,"(10.18019073690894, 105.32022315786804)",Rice
1,"(10.561107033461816, 105.12772097986661)",Rice
2,"(10.623790611954897, 105.13771401411867)",Rice
3,"(10.583364246115156, 105.23946127195805)",Non Rice
4,"(10.20744446668854, 105.26844107128906)",Rice
...,...,...
245,"(10.308283266873062, 105.50872812216863)",Non Rice
246,"(10.582910017285496, 105.23991550078767)",Non Rice
247,"(10.581547330796518, 105.23991550078767)",Non Rice
248,"(10.629241357910818, 105.15315779432643)",Rice
