In [1]:
from pydap.client import open_url
from datetime import datetime
from calendar import monthrange, month_name

from tqdm import tqdm
tqdm.pandas()

import os
import numpy as np
import pandas as pd
import netCDF4 as nc
import xarray as xr

import time
import pickle
import cdsapi
import math

# Functions (defs) from other notebooks
import ipynb.fs.defs.CreateCollocatedDataFrame as ccdf

# Interpolation
from scipy.interpolate import RegularGridInterpolator, LinearNDInterpolator
import scipy.interpolate.interpnd

# Plotting
from matplotlib import pyplot as plt, figure
from sklearn.metrics import mean_squared_error
from matplotlib.colors import LogNorm
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
from plotly import express as px
import cartopy.crs as ccrs

In [None]:
# Lift pandas' display limits so displayed frames show every column and every row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Set boolean parameters

# When True, the analysis cells draw their histogram / correlation plots.
plot_distributions_boolean = False

# When True, the CatBoost model is written to disk after the grid search.
# The save cell further down reads this flag; it was never defined, so a
# fresh "Restart & Run All" stopped there with a NameError.
save_model_when_done = False

## Load Interpolated DFs

In [None]:
# Load the interpolated data frame for the [24-80-19-85] interval from CSV.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
india_df = pd.read_csv('/Users/madsrindal/Desktop/Intervals/24-80-19-85/InterpolatedDF-withCYGNSSQFs-[24-80-19-85].csv')


In [None]:
# Preview the first rows of the freshly loaded frame
india_df.head()

In [None]:
# (rows, columns) of the frame before any filtering
india_df.shape

In [None]:
### Preprocessing applied to the data frame before any ML is run
# Every step rebinds `india_df` to the frame returned by the corresponding `ccdf` helper,
# so this cell is order-dependent and is meant to be run once per freshly loaded frame.

# FILTER SMAP NAN VALUES
wide_rule = '-'*60
print('Filtering SMAP NaN values...')
print(wide_rule)
before = india_df.shape[0]
for smap_nan_filter in (ccdf.filter_nan_smap_sm,
                        ccdf.filter_nan_smap_vo,
                        ccdf.filter_nan_smap_sr):
    india_df = smap_nan_filter(india_df)
after = india_df.shape[0]
print('Removed ' + str(before-after) + ' rows of SMAP NaN values')
print(wide_rule)

## FILTER QUALITY FLAGS
narrow_rule = '-'*40
print('\nFiltering quality flags...')
print(narrow_rule)
before = india_df.shape[0]
india_df = ccdf.filter_quality_flags_1(india_df)
after = india_df.shape[0]
print('Removed ' + str(before-after) + ' rows due to CYGNSS QFs')
print(narrow_rule)

## REMAINING STEPS, RUN IN THIS ORDER: (progress message, helper)
remaining_steps = (
    # COMPUTE BLOCK_CODE
    ('\nComputing block codes...', ccdf.compute_block_code),
    # SCALE SURFACE REFLECTIVITY VALUES (REMOVE THE MIN VALUE FROM ALL OTHER VALUES)
    ('\nScaling surface reflectivity values...', ccdf.scale_sr_values),
    # COMPUTE DAILY HOUR (0-23)
    ('\nComputing daily hour from 0-23...', ccdf.compute_daily_hour_column),
    # COMPUTE TIME OF DAY (morning/day/afternoon/night)
    ('\nComputing time of day (morning/day/afternoon/night)...', ccdf.compute_time_of_day),
)
for progress_message, preprocessing_step in remaining_steps:
    print(progress_message)
    india_df = preprocessing_step(india_df)

print('##### PREPROCESSING DONE #####')

In [None]:
# Report the frame's (rows, columns) now that the preprocessing cell has run
print('India Data Frame Shape: ', india_df.shape)

In [None]:
# Preview the preprocessed frame, including the columns added by the preprocessing steps
india_df.head()

In [None]:
# Frames iterated by the analysis cells that follow (currently only the India frame)
df_list = [india_df]

# Incidence Angle Analysis

In [None]:
if plot_distributions_boolean:

    # One histogram of the `sp_inc_angle` column per frame in `df_list`
    for df in df_list:
        inc_angle_series = df['sp_inc_angle']

        print('Min: ', inc_angle_series.min())
        print('Max: ', inc_angle_series.max())

        plt.hist(inc_angle_series)
        plt.title('Incidence Angle Measurements 2020', fontsize=18)
        plt.xlabel('Incidence angle', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        # plt.savefig('/Users/madsrindal/Desktop/Plots/IncidenceAngleDistribution2020.png', bbox_inches='tight')
        plt.show()

In [None]:
if plot_distributions_boolean:

    # Widths of the sliding incidence-angle window, one curve per width
    incidence_angle_intervals = [5, 10, 20]

    for df in df_list:

        max_values = []
        for angle in incidence_angle_intervals:

            # Window start positions: 0, 2, ..., 70
            inc_angle = list(range(0, 72, 2))
            corr_list = []

            for i in inc_angle:
                # Rows whose incidence angle lies in the closed window [i, i+angle]
                in_window = (df['sp_inc_angle'] >= i) & (df['sp_inc_angle'] <= i+angle)
                chosen_df = df[in_window]
                corr = chosen_df['smap_sm'].corr(chosen_df['sr'])
                corr_list.append(corr)

            max_values.append(max(corr_list))
            plt.plot(inc_angle, corr_list, linewidth=4.0, label='IA Interval: ' + str(angle))

        # Title and axis labels are identical for every curve, so set them once per figure
        plt.title('SMAP SM and SR Correlation', fontsize=18)
        plt.ylabel('Correlation', fontsize=12)
        plt.xlabel('Incidence angle', fontsize=12)
        # plt.savefig('/Users/madsrindal/Desktop/Plots/IncidenceAngleCorrelation2020smap.png')

        plt.legend(fontsize='small')
        plt.show()

# Vegetation Opacity Analysis

In [None]:
if plot_distributions_boolean:

    # One histogram of the `smap_vo` column per frame in `df_list`
    for df in df_list:
        vegetation_opacity = df['smap_vo']

        print('Min: ', vegetation_opacity.min())
        print('Max: ', vegetation_opacity.max())

        plt.hist(vegetation_opacity)
        plt.title('SMAP Vegetation Opacity Measurements', fontsize=18)
        plt.xlabel('Vegetation opacity', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        # NOTE(review): the file name below is the incidence-angle one; rename before enabling
        # plt.savefig('/Users/madsrindal/Desktop/Plots/IncidenceAngleDistribution2020.png', bbox_inches='tight')
        plt.show()

# Surface Roughness Analysis

In [None]:
if plot_distributions_boolean:

    # One histogram of the `smap_surface_roughness` column per frame in `df_list`
    for df in df_list:
        surface_roughness = df['smap_surface_roughness']

        print('Min: ', surface_roughness.min())
        print('Max: ', surface_roughness.max())

        plt.hist(surface_roughness)
        plt.title('SMAP Surface Roughness Factor', fontsize=18)
        plt.xlabel('Surface Roughness Factor', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        # NOTE(review): the file name below is the incidence-angle one; rename before enabling
        # plt.savefig('/Users/madsrindal/Desktop/Plots/IncidenceAngleDistribution2020.png', bbox_inches='tight')
        plt.show()

In [None]:
# Load the SMAP frame for the same interval and keep a second copy passed through
# `ccdf.filter_smap_qfs`, so the raw and filtered value ranges can be compared.
# NOTE(review): absolute, user-specific path.
smap_df = pd.read_csv('/Users/madsrindal/Desktop/Intervals/24-80-19-85/SMAP-allYears-withQFs-[24-80-19-85].csv')
filtered_smap_df = ccdf.filter_smap_qfs(smap_df)

In [None]:
# Print the `surface_roughness` range of the raw SMAP frame, then of the filtered one
roughness_reports = (
    ('Min/Max values for original data frame:', smap_df),
    ('\nMin/Max values for filtered data frame:', filtered_smap_df),
)
for report_header, smap_frame in roughness_reports:
    roughness = smap_frame['surface_roughness']
    print(report_header)
    print('Min: ', roughness.min())
    print('Max: ', roughness.max())

# Correlation

In [None]:
# Correlation (pandas default method) between the `smap_sm` and `sr` columns of each frame
for df in df_list:
    corr = df['smap_sm'].corr(df['sr'])
    print('SM / SR - Correlation: ', corr)

# Machine Learning

In [None]:
# Machine learning

import catboost as cb
import seaborn as sns
import shap
import pickle

import h2o
from h2o.automl import H2OAutoML
from catboost import CatBoostRegressor
from sklearn.linear_model import RANSACRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance

In [None]:
# Work on a copy so the ML-specific column changes below leave `india_df` untouched
ml_df = india_df.copy()

## CatBoost

In [None]:
# Normalise dtypes before the CatBoost split with one vectorised cast instead of
# three per-element `apply(lambda ...)` passes.
# The two range columns become floats (numeric features). `hours_after_jan_2019`
# becomes an integer so that the later `X_train.dtypes != float` check picks it
# up as a categorical feature.
# Explicit 64-bit dtypes keep the result independent of the platform's default int width.
ml_df = ml_df.astype({
    'rx_to_sp_range': 'float64',
    'tx_to_sp_range': 'float64',
    'hours_after_jan_2019': 'int64',
})

In [None]:
# Columns excluded from the model inputs
cols_to_drop = ['unique_track_id', 'qf_ok']

# Rebind instead of using `inplace=True`, and ignore columns that are already gone:
# the original in-place drop raised KeyError when this cell was run a second time.
ml_df = ml_df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Split `ml_df` into predictors (every column but the target) and a one-column target frame
target_variable = 'smap_sm'
is_target_column = ml_df.columns == target_variable
X = ml_df.loc[:, ~is_target_column]
y = ml_df.loc[:, is_target_column]

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Preview the training predictors
X_train.head()

In [None]:
# Every column whose dtype is not float is handed to CatBoost as a categorical feature
is_categorical = X_train.dtypes != float
cat_features_indices = np.where(is_categorical)[0]
print('Categorical features on indices: ', cat_features_indices)

# CatBoost pools for the train and test partitions, sharing the same categorical layout
train_dataset = cb.Pool(data=X_train, label=y_train, cat_features=cat_features_indices)
test_dataset = cb.Pool(data=X_test, label=y_test, cat_features=cat_features_indices)

In [None]:
# List all predictor names, then the subset treated as categorical
all_column_names = list(X_train.columns)
cat_column_names = list(X_train.columns[cat_features_indices])

print('All Columns: ', all_column_names)
print('--------------'*8)
print('Cat Columns: ', cat_column_names)

In [None]:
# Untrained CatBoost regressor optimising RMSE; it is fitted by the grid-search cell below.
model = cb.CatBoostRegressor(loss_function='RMSE')

# Disabled: load a previously saved model instead of training.
# NOTE(review): `load_pretrained_model` is not defined anywhere in this notebook; define it before re-enabling.
# if load_pretrained_model:
#     model.load_model('catboost_model_08052022')

In [None]:
# Author's note from an earlier run: 31 runs per iteration interval -> 94 in total,
# finished in 6.3 hours.
# NOTE(review): the grid below has 3 * 2 * 4 * 4 = 96 combinations — confirm the counts above.

start_time = time.time()

# Hyper-parameter values explored by the grid search
grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6, 8],
    l2_leaf_reg=[0.2, 0.5, 1, 3],
)
model.grid_search(grid, train_dataset)

# Wall-clock duration of the search, framed by a banner so it stands out in the training log
banner = '#'*50
print('\n\n' + banner)
print('Finished grid search in ' + str(time.time()-start_time) + ' seconds')
print(banner)

In [None]:
# Persist the fitted CatBoost model when the `save_model_when_done` flag is truthy.
# NOTE(review): the flag must be set in an earlier cell, otherwise this line raises NameError.
if save_model_when_done:
    # Placeholder file name — change it before saving so earlier models are not overwritten
    model_name = 'model_name'
    model.save_model(model_name)

In [None]:
# Score the tuned model on the held-out test partition
pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(test_mse)
r2 = r2_score(y_test, pred)

print('Testing performance')
for metric_label, metric_value in (('RMSE: ', rmse), ('R2: ', r2)):
    print(metric_label, metric_value)

In [None]:
# Horizontal bar chart of the model's feature importances, sorted ascending so the
# largest bar is drawn last
importances = model.feature_importances_
sorted_feature_importance = importances.argsort()

plt.barh(
    X_train.columns[sorted_feature_importance],
    importances[sorted_feature_importance],
    color='blue',
)
plt.title("CatBoost Feature Importance", fontsize=18)
plt.xlabel("Importance Percentage", fontsize=12)
# plt.savefig('/Users/madsrindal/Desktop/Plots/CatBoostFeatureimportance', bbox_inches='tight')
plt.show()

In [None]:
# Save / Load model with pickle
# pickle.dump(model, open(model_name + '.pkl', 'wb')) # Save
# pickled_model = pickle.load(open(model_name + '.pkl', 'rb')) # Load

## AutoML

In [None]:
## CODE FOR MAKING A SMALLER AREA USED TO TEST AUTO_ML FUNCTIONALITY AND BEST MODELS ##

def filter_location(df, location):
    """Return the rows of `df` whose `sp_lat` / `sp_lon` lie strictly inside a box.

    Parameters
    ----------
    df : DataFrame with `sp_lat` and `sp_lon` columns.
    location : sequence indexed as
        [upper latitude, lower longitude, lower latitude, upper longitude].
        All four bounds are exclusive.

    Returns
    -------
    A new DataFrame holding only the rows inside the box; `df` is not modified.
    """
    lat_upper, lon_lower, lat_lower, lon_upper = (location[k] for k in (0, 1, 2, 3))

    # Build one combined mask instead of four successive filtered copies
    inside_box = (
        (df.sp_lat < lat_upper)
        & (df.sp_lat > lat_lower)
        & (df.sp_lon < lon_upper)
        & (df.sp_lon > lon_lower)
    )
    return df[inside_box]

print('Original ml_df shape: ', ml_df.shape)

# Bounding box passed to `filter_location`: [upper lat, lower lon, lower lat, upper lon]
india_small_area = [22.5, 81, 20.5, 83]

# `filter_location` returns a new frame; `ml_df` itself is left as it was
ml_df_small = filter_location(ml_df, india_small_area)

print('Smaller area ml_df shape: ', ml_df_small.shape)
# NOTE(review): `ml_df` is not reassigned in this cell, so this prints the same shape as the first line
print('New ml_df shape: ', ml_df.shape)

In [None]:
# Machine learning
import h2o
from h2o.automl import H2OAutoML
from catboost import CatBoostRegressor
from sklearn.linear_model import RANSACRegressor

In [None]:
# Initialise the H2O backend used by the AutoML cells below
h2o.init()

In [None]:
# Same split recipe as for CatBoost, applied to the small-area frame
target_variable = 'smap_sm'
is_target_column = ml_df_small.columns == target_variable
X_auto = ml_df_small.loc[:, ~is_target_column]
y_auto = ml_df_small.loc[:, is_target_column]

X_train_auto, X_test_auto, y_train_auto, y_test_auto = train_test_split(
    X_auto, y_auto,
    test_size=0.2,
    random_state=5,
)

# H2O expects predictors and response in one frame, so glue each partition back together
# column-wise (target column last)
train_auto = pd.concat([X_train_auto, y_train_auto], axis=1, join='inner')
test_auto = pd.concat([X_test_auto, y_test_auto], axis=1, join='inner')


In [None]:
# Convert the pandas train/test frames into H2OFrame objects
train_h2o = h2o.H2OFrame(train_auto)
test_h2o = h2o.H2OFrame(test_auto)

In [None]:
# Preview the H2O training frame
train_h2o.head()

In [None]:
# Preview the H2O test frame
test_h2o.head()

In [None]:
# Response and predictor names for H2O AutoML.
# NOTE: `y` is rebound here from the CatBoost target frame to the response column name.
y = 'smap_sm'
x = train_h2o.columns
x.remove(y)

# Columns H2O should treat as categorical (factor) rather than numeric
categorical_columns = [
    'ddm_channel',
    'spacecraft_num',
    'day_of_year',
    'track_id',
    'prn_code',
    'quality_flags',
    'quality_flags_2',
    'year',
    'hours_after_jan_2019',
    'block_code',
    'daily_hour',
    'time_of_day',
]

# Convert each listed column from its OWN values. The previous copy-pasted
# version assigned `frame['time_of_day'].asfactor()` to every one of these
# columns, overwriting all of them with time-of-day data in both the training
# and the test frame.
for h2o_frame in (train_h2o, test_h2o):
    for column_name in categorical_columns:
        # Skip names absent from the frame (e.g. dropped earlier) instead of failing
        if column_name in h2o_frame.columns:
            h2o_frame[column_name] = h2o_frame[column_name].asfactor()

In [None]:
# Configure AutoML with at most 10 models and a fixed seed, then train it on the
# training frame using predictors `x` and response `y`.
aml = H2OAutoML(balance_classes=False, max_models = 10, seed = 1)
aml.train(x = x, y = y, training_frame = train_h2o)

In [None]:
# Show every row of the AutoML leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)

In [None]:
# Pick the best model by RMSE for three algorithm families, plus the best overall (`model4`).
# NOTE(review): confirm what `get_best_model` returns when no model of the requested
# algorithm was trained; the prediction cell below calls `.predict` on each result.
model1 = aml.get_best_model(algorithm="xgboost", criterion="rmse")
model2 = aml.get_best_model(algorithm="GBM", criterion="rmse")
model3 = aml.get_best_model(algorithm="DRF", criterion="rmse")
model4 = aml.get_best_model(criterion='rmse')

In [None]:
# Score the H2O test frame with each selected model (same order as model1..model4)
preds1 = model1.predict(test_h2o)
preds2 = model2.predict(test_h2o)
preds3 = model3.predict(test_h2o)
preds4 = model4.predict(test_h2o)

In [None]:
# Observed target values, pulled out of H2O once and reused for every model
y_true_auto = h2o.as_list(test_h2o['smap_sm'])['smap_sm']


def _h2o_rmse(predictions):
    """Return the RMSE between `y_true_auto` and the 'predict' column of an H2O prediction frame.

    The prediction column is converted with `h2o.as_list` first, the same way
    the observed values are, so scikit-learn receives two plain columns. The
    root is taken with `np.sqrt`, as in the CatBoost evaluation cell, rather
    than relying on the `squared=False` argument that newer scikit-learn
    releases no longer accept.
    """
    y_pred = h2o.as_list(predictions['predict'])['predict']
    return np.sqrt(mean_squared_error(y_true_auto, y_pred))


# The prediction cell above defines `preds1..preds4`; the previous code referenced
# the undefined names `predictions1..predictions4` and raised NameError.
rmse_xgboost = _h2o_rmse(preds1)
rmse_gbm = _h2o_rmse(preds2)
rmse_drf = _h2o_rmse(preds3)
rmse_best = _h2o_rmse(preds4)

In [None]:
# Report the test RMSE of each selected AutoML model
rmse_report = (
    ('RMSE XGBOOST: ', rmse_xgboost),
    ('RMSE GBM: ', rmse_gbm),
    ('RMSE DRF: ', rmse_drf),
    ('RMSE BEST: ', rmse_best),
)
for rmse_label, rmse_value in rmse_report:
    print(rmse_label, rmse_value)