In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd


# import custom helper functions from the local model_dev_functions directory
import sys
sys.path.append('../model_dev_functions')

from model_development import base_model_pred, df_to_arr, model_scores
from feature_engineering import add_NDVI, add_vhvv, get_high_corr_cols
from data_prep import (batch_aggregate_pickle, get_aggregation_from_window,
                       read_multiple_pickles)
from submission_format import prediction_to_submission_df



## iteration 4
test_score = 1.0

Because the inclusion of Sentinel-1 data in `iteration 3` significantly improved our prediction accuracy, we assume that cloud cover is the most important factor in this classification task. We conducted a cloud cover analysis to find the dates with the least cloud throughout the year and, based on those dates, extracted the corresponding Sentinel-1 and Sentinel-2 data. We also increased the window size to 9*9 to reduce the impact of any cloud covering the window.
- cloud15 data
- drop highly correlated features
- normalization (Robust)
- stack models

#### Cloud cover analysis to extract scenes with low cloud coverage

<img src='../image_analysis/cloud cover analysis.png' alt="Cloud cover analysis for the East and West scenes" />

As we know that the Sentinel data does not cover all groups of training data in a single scene, we break down the analysis into East scene and West scene. East scene consists of all groups from training data other than the group furthest to the West. 

With this analysis, we can select satellite data from dates with less than 15% cloud cover over our training samples. Because the East scene contains most of the data groups (i.e. 6 of 7), it is used as the primary criterion.

Because the training data lies in the same region as the submission template data, it is reasonable to assume that dates with low cloud cover over the training samples also have low cloud cover over the submission template samples.

#### Data Preparation

In [None]:
# Sentinel-2 data (training)
# NOTE(review): the column list passed to read_multiple_pickles is presumably the set of
# metadata columns to exclude from each pickle — confirm against data_prep.
cloud15S2_paths, cloud15S2_list = read_multiple_pickles('../11-datasets/ready_dataset/ready_dataset_cloud15',
                                                     ['latitude', 'longitude', 'geometry', 'grouping'])

cloud15_s2_df, cloud15_s2_df_list = batch_aggregate_pickle(cloud15S2_list, cloud15S2_paths, '_w9', None,
                                                           agg_method=lambda x: x.mean())

# Binary target encoding shared by both sensors.
label_map = {'Rice': 1, 'Non Rice': 0}
cloud_15_s2_y = pd.read_pickle(cloud15S2_paths[0])['Class of Land'].map(label_map).values

# Sentinel-1 data (training)
cloud15S1_paths, cloud15S1_list = read_multiple_pickles('../11-datasets/ready_dataset/ready_dataset_cloud15S1',
                                                     ['latitude', 'longitude', 'geometry', 'grouping'])

cloud15_s1_df, cloud15_s1_df_list = batch_aggregate_pickle(cloud15S1_list, cloud15S1_paths, '_w9')
# Read the Sentinel-1 labels from the Sentinel-1 pickles; this line previously re-read the
# Sentinel-2 paths (copy-paste), so the variable did not reflect the data it is named after.
cloud_15_s1_y = pd.read_pickle(cloud15S1_paths[0])['Class of Land'].map(label_map).values

# The two sources are concatenated column-wise below, which is only valid when both
# describe the same samples in the same order — fail loudly if the labels disagree.
assert np.array_equal(cloud_15_s2_y, cloud_15_s1_y), 'Sentinel-1 and Sentinel-2 labels are not aligned'

# concat data from sentinel-2 and sentinel-1
# 'Class of Land' is removed from the Sentinel-1 frame so the target never enters the features.
cloud15_s1s2 = pd.concat([cloud15_s2_df, cloud15_s1_df.drop('Class of Land', axis=1)], axis=1)
cloud15_s1s2

Unnamed: 0,AOT_w9_0115,B02_w9_0115,B03_w9_0115,B04_w9_0115,B08_w9_0115,WVP_w9_0115,visual_w9_0115,B05_w9_0115,B06_w9_0115,B07_w9_0115,...,vh_w9_0520,vv_w9_0520,vh_w9_0601,vv_w9_0601,vh_w9_0818,vv_w9_0818,vh_w9_1205,vv_w9_1205,vh_w9_1229,vv_w9_1229
0,138.0,620.629639,967.333313,463.370361,3964.024658,3326.357910,47.592594,1186.308594,3340.926025,4031.987549,...,0.036744,0.157805,0.031072,0.078085,0.007117,0.038904,0.036330,0.177955,0.006644,0.037547
1,138.0,676.679016,904.086426,496.061737,3089.407471,3345.357910,50.901234,1211.271606,2910.061768,3468.333252,...,0.028177,0.349164,0.027931,0.092954,0.008786,0.089624,0.016413,0.055847,0.004707,0.055702
2,138.0,594.518494,899.370361,453.098755,3599.037109,3260.691406,46.592594,1132.012329,2945.061768,3540.209961,...,0.036251,0.199289,0.031167,0.148355,0.006448,0.020774,0.039174,0.153203,0.004183,0.022632
3,138.0,612.617310,969.024719,457.345673,4026.049316,3344.580322,46.987656,1221.802490,3488.481445,4222.728516,...,0.029057,0.136654,0.024184,0.078517,0.010887,0.050735,0.049119,0.186494,0.009598,0.067469
4,138.0,591.839478,699.641968,391.703705,2411.037109,3597.086426,40.271606,754.666687,1883.036987,2293.296387,...,0.032237,0.182050,0.029203,0.071724,0.011458,0.096539,0.008988,0.039027,0.004649,0.032278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,138.0,3547.037109,3536.123535,3399.456787,5190.913574,3244.000000,255.000000,4135.827148,4902.740723,5308.320801,...,0.077351,0.296637,0.089087,0.326931,0.072536,0.369975,0.058265,0.278032,0.052680,0.255063
596,138.0,3677.234619,3715.728516,3639.283936,5169.777832,3244.000000,255.000000,4065.308594,4804.962891,5194.530762,...,0.072449,0.313707,0.087714,0.358288,0.075839,0.311691,0.066887,0.242163,0.068978,0.226601
597,138.0,3955.530762,3922.024658,3810.567871,5253.185059,3244.000000,255.000000,4083.481445,4751.197754,5104.567871,...,0.071250,0.307097,0.079684,0.337885,0.077614,0.263658,0.065535,0.248294,0.080965,0.230987
598,138.0,3752.493896,3682.567871,3530.469238,4916.493652,3244.000000,255.000000,4017.938232,4671.617188,5010.147949,...,0.086430,0.282350,0.074414,0.339348,0.073026,0.258110,0.065576,0.230867,0.078133,0.258629


In [None]:
# Sentinel-2 data (template)
# The template pickles additionally list 'target' among the columns handed to
# read_multiple_pickles, since the submission template carries no usable label.
cloud15S2t_paths, cloud15S2t_list = read_multiple_pickles('../11-datasets/ready_dataset/ready_dataset_cloud15t',
                                                     ['latitude', 'longitude', 'geometry', 'grouping', 'target'])
# Use the same explicit aggregation as the Sentinel-2 training frame so that training and
# prediction features are guaranteed to be computed identically.
cloud15_s2t_df, cloud15_s2t_df_list = batch_aggregate_pickle(cloud15S2t_list, cloud15S2t_paths, '_w9', None,
                                                             agg_method=lambda x: x.mean())


# Sentinel-1 data (template)
cloud15S1t_paths, cloud15S1t_list = read_multiple_pickles('../11-datasets/ready_dataset/ready_dataset_cloud15S1t',
                                                     ['latitude', 'longitude', 'geometry', 'grouping', 'target'])
# Pair the template frames with the template paths; the training path list
# (cloud15S1_paths) was passed here before, which only worked if both folders
# happened to contain identically named files.
cloud15_s1t_df, cloud15_s1t_df_list = batch_aggregate_pickle(cloud15S1t_list, cloud15S1t_paths, '_w9', None)

# concat data
cloud15_s1s2t = pd.concat([cloud15_s2t_df, cloud15_s1t_df], axis=1)
X_pred6 = cloud15_s1s2t

In [None]:
# Iteration-4 random forest: fixed seed for reproducibility, n_jobs=-1 to use every core.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
model6 = RandomForestClassifier(**rf_params).fit(cloud15_s1s2, cloud_15_s1_y)

# Predict the class of every submission-template sample.
pred6 = model6.predict(X_pred6)

In [None]:
# Combine the challenge submission template with the iteration-4 predictions.
template_csv = '../submission/challenge_1_submission_template.csv'
submission_df6 = prediction_to_submission_df(template_csv, pred6)
# submission_df6.to_csv('submission_csv/L1_Submission_6.csv', index=False)

#### Export final model

In [None]:
# Export is disabled; uncomment both lines below to persist the trained model to disk.
# NOTE(review): joblib.dump writes a pickle-based file, not HDF5, so the '.h5' suffix is
# misleading — consider '.joblib' if this is re-enabled.
#import joblib
# joblib.dump(model6, 'RFCloudless_model.h5')

#### iteration 4.1 (extra exploration to see if we can get the same accuracy)

The predicted value is the same as the one that has 1.0 f1-score. 

- Add NDVI + VH/VV before feature selection

In [None]:
# Sanity check: the feature matrix and the label vector should have the same number of rows.
features_shape = cloud15_s1s2.shape
labels_shape = cloud_15_s1_y.shape
features_shape, labels_shape

((600, 198), (600,))

In [None]:
# Extend each sensor's frames with derived features (add_NDVI on the Sentinel-2 frames,
# add_vhvv on the Sentinel-1 frames) and join them column-wise, for training and template alike.
X_train6_2_all = pd.concat([add_NDVI(cloud15_s2_df_list), add_vhvv(cloud15_s1_df_list)], axis=1)
X_pred6_2_all = pd.concat([add_NDVI(cloud15_s2t_df_list), add_vhvv(cloud15_s1t_df_list)], axis=1)

# Highly correlated columns are identified on the training data only (threshold 0.95) ...
high_corr6_2 = get_high_corr_cols(X_train6_2_all, 0.95)

# ... and the same columns are removed from both frames so their feature sets stay identical.
X_train6_2 = X_train6_2_all.drop(columns=high_corr6_2)
X_pred6_2 = X_pred6_2_all.drop(columns=high_corr6_2)

number of high_corr_cols: 83


In [None]:
# Fit iteration 4.1 on a fresh estimator instead of refitting `model6` in place: refitting
# would silently replace the iteration-4 model (the one the export cell refers to) with a
# model trained on a different feature set. Hyperparameters and seed match `model6`, so the
# predictions are the same as an in-place refit would give.
model6_2 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model6_2.fit(X_train6_2, cloud_15_s1_y)
pred_6_2 = model6_2.predict(X_pred6_2)