In [1]:
# basic packages
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
import torch #had to pip install torch into venv, also scikit-learn, hydroeval, hydrotools
import torch.nn as nn

#Shared/Utility scripts
from model_scripts import Simple_Eval, dataloader, dataprocessor, mlp_model
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HOME = os.path.expanduser('~')


# Name of the model being developed; it also names the model directory.
modelname = 'MLP'
# Directory handed to the scaling, training and prediction helpers later in the notebook.
model_path = f"{HOME}/SWEMLv2.0/Model/{modelname}"
# exist_ok=True already tolerates an existing directory, so a separate
# os.path.exists() guard is redundant (and would add a check-then-create race).
os.makedirs(model_path, exist_ok=True)
print(f"{modelname} development script")

Device: cuda
Device: cuda
MLP development script


# Load data and set input columns

In [2]:
# Load the model-ready data for every modeling domain into one dataframe.
# Modeling domains to load; this list is reused by the cleaning, scaling,
# sampling and evaluation calls later in the notebook.
regionlist = ['SouthernRockies', 'Southwest', 'Northwest']
# Selectors forwarded to the loader.
# NOTE(review): presumably these choose the grid resolution, the stored
# dataframe product and the fSCA-threshold variant - confirm against
# dataloader.get_ML_Data.
output_res = '300M_Resolution'
DataFrame = 'Seasonality_PrecipVIIRSGeoObsDFs'
fSCA_thresh = '20_fSCA_Thresh'

# The recorded run reports per-region datapoint counts and an overall total
# while this call executes.
df = dataloader.get_ML_Data(regionlist, output_res, DataFrame, fSCA_thresh)
# Preview the first rows as the cell output.
df.head()

Concatenating 14 for the model dataframe development.


  0%|          | 0/14 [00:00<?, ?it/s]

There are 305566 datapoints for model training/testing in the SouthernRockies modeling domain.
Concatenating 99 for the model dataframe development.


  0%|          | 0/99 [00:00<?, ?it/s]

There are 2222453 datapoints for model training/testing in the Southwest modeling domain.
Concatenating 2 for the model dataframe development.


  0%|          | 0/2 [00:00<?, ?it/s]

There are 170370 datapoints for model training/testing in the Northwest modeling domain.
There are 2698389 datapoints for model training/testing in the overall modeling domain.


Unnamed: 0,index,cell_id,Date,cen_lat,cen_lon,Elevation_m,Slope_Deg,Aspect_Deg,ns_1,ns_2,...,ns_6_week_mean,Seasonal_ns_1_rel,Seasonal_ns_2_rel,Seasonal_ns_3_rel,Seasonal_ns_4_rel,Seasonal_ns_5_rel,Seasonal_ns_6_rel,swe_cm,region,region_class
0,5152,SouthernRockies_300M_37.322_-106.366,2015-04-06,37.322,-106.366,3000,5,191,16.5,18.5,...,3.092308,1.755319,2.339494,2.055441,1.627859,1.494303,0.0,0.0,SouthernRockies,1
1,19438,SouthernRockies_300M_37.131_-106.326,2015-04-06,37.131,-106.326,3305,4,249,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,0.157039,SouthernRockies,1
2,19437,SouthernRockies_300M_37.131_-106.329,2015-04-06,37.131,-106.329,3283,18,27,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,0.04209,SouthernRockies,1
3,19436,SouthernRockies_300M_37.131_-106.331,2015-04-06,37.131,-106.331,3233,15,249,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,9e-06,SouthernRockies,1
4,19435,SouthernRockies_300M_37.131_-106.334,2015-04-06,37.131,-106.334,3266,28,232,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,0.0,SouthernRockies,1


# Data processing



In [10]:
# Development aid: re-import model_scripts.dataprocessor so that edits made to
# the module on disk take effect without restarting the kernel.
# NOTE(review): this is unnecessary on a fresh Restart & Run All, and the
# execution count jumps from 2 to 10 here - consider dropping this cell (or
# using %autoreload in the setup cell) before sharing the notebook.
import importlib
importlib.reload(dataprocessor)

Device: cuda


<module 'model_scripts.dataprocessor' from '/home/rjohnson18/SWEMLv2.0/Modeling/model_scripts/dataprocessor.py'>

In [11]:
# Clean the data. The cleaned frame keeps the name `df` because the
# evaluation cell below reads it under that name.
df = dataprocessor.data_clean(df, regionlist)

# Temporary for the seasonality relationship.
# NOTE(review): this replaces NaN in *every* column with 1, not only in the
# Seasonal_ns_*_rel columns - confirm that this is intended.
df = df.fillna(1)

# Convert dates to datetime format (bracket indexing rather than attribute
# assignment, which pandas discourages for writing columns).
df['Date'] = pd.to_datetime(df['Date'])

# Feature columns fed to the model, in the order passed to the scaler.
input_columns = [
    'cen_lat',
    'cen_lon',
    'Elevation_m',
    'Slope_Deg',
    'Aspect_Deg',
    'ns_1',
    'ns_2',
    'ns_3',
    'ns_4',
    'ns_5',
    'ns_6',
    'VIIRS_SCA',
    'hasSnow',
    'season_precip_cm',
    'region_class',
    'DOS',
    'WY_week',
    'ns_1_week_mean',
    'ns_2_week_mean',
    'ns_3_week_mean',
    'ns_4_week_mean',
    'ns_5_week_mean',
    'ns_6_week_mean',
    'Seasonal_ns_1_rel',
    'Seasonal_ns_2_rel',
    'Seasonal_ns_3_rel',
    'Seasonal_ns_4_rel',
    'Seasonal_ns_5_rel',
    'Seasonal_ns_6_rel'
]

# Train/test split settings forwarded to the scaler helper.
# NOTE(review): presumably years=False selects a random split by `splitratio`
# and `test_years` is only used when years=True - confirm in
# dataprocessor.mlp_scaler.
years = False
splitratio = 0.33
test_years = [2019]
target = 'swe_cm'

# Fit a scaler, save it, and scale the training data.
x_train_scaled_t, y_train_scaled_t, x_test_scaled_t, x_test, y_test = dataprocessor.mlp_scaler(
    regionlist,
    df,
    years,
    splitratio,
    test_years,
    target,
    input_columns,
    model_path,
    scalertype = 'MinMax'
)

The provided data contains 2039955 data points, of which 0 locations/timesteps show no SWE and VIIRS fsca > 20%
0 locations/timesteps show SWE and VIIRS fsca < 20%
0 locations/timesteps show SWE greater than a realistic value (250 cm) in the SouthernRockies domain
0 locations/timesteps show SWE greater than a realistic value (400 cm) in the Southwest domain
0 locations/timesteps show SWE greater than a realistic value (800 cm) in the Northwest domain
removing..
There are 2039955 datapoints for model training/testing.


Unnamed: 0,index,cell_id,Date,cen_lat,cen_lon,Elevation_m,Slope_Deg,Aspect_Deg,ns_1,ns_2,...,ns_6_week_mean,Seasonal_ns_1_rel,Seasonal_ns_2_rel,Seasonal_ns_3_rel,Seasonal_ns_4_rel,Seasonal_ns_5_rel,Seasonal_ns_6_rel,swe_cm,region,region_class
0,5152,SouthernRockies_300M_37.322_-106.366,2015-04-06,37.322,-106.366,3000,5,191,16.5,18.5,...,3.092308,1.755319,2.339494,2.055441,1.627859,1.494303,0.0,0.0,SouthernRockies,1
1,19437,SouthernRockies_300M_37.131_-106.329,2015-04-06,37.131,-106.329,3283,18,27,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,0.04209,SouthernRockies,1
2,19436,SouthernRockies_300M_37.131_-106.331,2015-04-06,37.131,-106.331,3233,15,249,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,9e-06,SouthernRockies,1
3,19435,SouthernRockies_300M_37.131_-106.334,2015-04-06,37.131,-106.334,3266,28,232,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,0.0,SouthernRockies,1
4,19434,SouthernRockies_300M_37.131_-106.337,2015-04-06,37.131,-106.337,3290,16,18,46.2,0.0,...,26.230769,2.055441,0.0,1.755319,2.339494,2.372538,1.627859,0.0,SouthernRockies,1


Unnamed: 0,index,cell_id,Date,cen_lat,cen_lon,Elevation_m,Slope_Deg,Aspect_Deg,ns_1,ns_2,...,ns_6_week_mean,Seasonal_ns_1_rel,Seasonal_ns_2_rel,Seasonal_ns_3_rel,Seasonal_ns_4_rel,Seasonal_ns_5_rel,Seasonal_ns_6_rel,swe_cm,region,region_class
248246,2,Southwest_300M_38.182_-119.597,2013-04-03,38.182,-119.597,1706,7,36,113.1,124.5,...,12.512815,2.489418,2.775875,0.0,0.035153,2.156931,3.820084,66.867477,Southwest,2
248247,5516,Southwest_300M_38.015_-119.561,2013-04-03,38.015,-119.561,1835,16,169,75.5,113.1,...,21.782141,2.117127,2.489418,1.756746,1.613696,1.941113,2.345959,48.456591,Southwest,2
248248,5515,Southwest_300M_38.015_-119.564,2013-04-03,38.015,-119.564,1839,11,56,75.5,113.1,...,21.782141,2.117127,2.489418,1.756746,1.613696,1.941113,2.345959,53.630471,Southwest,2
248249,5514,Southwest_300M_38.015_-119.567,2013-04-03,38.015,-119.567,1841,5,127,75.5,113.1,...,44.850723,2.117127,2.489418,1.756746,1.613696,1.941113,2.775875,49.717703,Southwest,2
248250,5513,Southwest_300M_38.015_-119.57,2013-04-03,38.015,-119.57,1850,8,120,75.5,113.1,...,14.630779,2.117127,2.489418,1.756746,1.613696,2.775875,1.941113,48.634857,Southwest,2


Unnamed: 0,index,cell_id,Date,cen_lat,cen_lon,Elevation_m,Slope_Deg,Aspect_Deg,ns_1,ns_2,...,ns_6_week_mean,Seasonal_ns_1_rel,Seasonal_ns_2_rel,Seasonal_ns_3_rel,Seasonal_ns_4_rel,Seasonal_ns_5_rel,Seasonal_ns_6_rel,swe_cm,region,region_class
1901057,85184,Northwest_300M_47.496_-123.881,2016-02-08,47.496,-123.881,651,65,257,93.0,72.9,...,7.371429,2.990354,3.0,3.402946,3.787066,0.0,5.684109,0.0,Northwest,3
1901058,28390,Northwest_300M_47.892_-123.622,2016-02-08,47.892,-123.622,575,36,146,72.9,93.0,...,7.371429,3.0,2.990354,3.402946,2.683908,3.787066,5.684109,133.371216,Northwest,3
1901059,28391,Northwest_300M_47.892_-123.619,2016-02-08,47.892,-123.619,517,33,118,72.9,93.0,...,7.371429,3.0,2.990354,3.402946,2.683908,3.787066,5.684109,93.470695,Northwest,3
1901060,28392,Northwest_300M_47.892_-123.616,2016-02-08,47.892,-123.616,461,33,113,72.9,93.0,...,7.371429,3.0,2.990354,3.402946,2.683908,3.787066,5.684109,83.744812,Northwest,3
1901061,28393,Northwest_300M_47.892_-123.614,2016-02-08,47.892,-123.614,398,29,137,72.9,93.0,...,7.371429,3.0,2.990354,3.402946,2.683908,3.787066,5.684109,80.450775,Northwest,3


y train shape (1366768, 1)
x train shape (1366768, 29)
x test shape (673187, 29)


## Set up Testing year
* Select year(s) not used in training
* Convert to numpy array
* Load scaler and scale data
## Train the model
* randomize training data..
* add training loss https://www.geeksforgeeks.org/training-neural-networks-with-validation-using-pytorch/
## Loss functions
### Mean Absolute Error (MAE)
MAE is well suited to regression problems, especially when the distribution of the target variable has outliers, i.e. very small or very large values that lie far from the mean. It is considered more robust to outliers than MSE. The PyTorch implementation is `nn.L1Loss()`.

### Mean Squared Error (MSE)
The MSE, also called L2 loss, computes the average of the squared differences between actual values and predicted values. PyTorch's MSE loss is always non-negative, regardless of the sign of the actual and predicted values. To improve the accuracy of the model, you should try to reduce the L2 loss; a perfect value is 0.0.

The squaring means that larger mistakes produce disproportionately larger errors than smaller ones. If the model is off by 100, the squared error is 10,000. If it is off by 0.1, the squared error is 0.01. This punishes the model heavily for making big mistakes and is lenient toward small ones.

MSE is the default loss function for most PyTorch regression problems.

#### Make your own loss function
https://neptune.ai/blog/pytorch-loss-functions

In [12]:
# Seed PyTorch's random number generators so repeated runs are comparable.
# NOTE(review): the helper modules may also draw from NumPy/Python RNGs (for
# example when sampling predictions) - seed those as well if exact
# reproducibility is required.
SEED = 42
torch.manual_seed(SEED)

# --- Hyperparameters ---
epochs = 400 # - seems to converge around 80 epochs with 100 batch size
batch_size = 120
learning_rate = 0.00001  # 0.0001,0.00001 look up learning rate scheduler https://www.geeksforgeeks.org/understanding-pytorch-learning-rate-scheduling/ -  smaller learning rates doing better!
decay = 0.00005 #0.0005, 0.00005
# Hidden-layer widths.
L1 = 20 #looked at 10,20,30,100
L2 = 20 #looked at 10,20,30,100
L3 = 100 #looked at 10,20,30,100
L4 = 100 #looked at 10,20,30,100
L5 = 100 #looked at 10,20,30,100
L6 = 18 #looked at 10,20,30,100
#can we train multiple models at a time here?
# 80	70	0.00001	0.00005	20	20	100	100	100	18 way better low value

# Not used in this cell; kept as a record of the full configuration.
search_params = epochs, batch_size, learning_rate, decay, L1, L2, L3, L4, L5, L6
loss_func = nn.MSELoss()
# Layer sizes: number of input features followed by the hidden-layer widths.
layers = x_train_scaled_t.shape[1], L1, L2, L3, L4, L5, L6
params =  learning_rate, decay, epochs, batch_size
Use_fSCA_Threshold = True


# Train the model
mlp_model.mlp_train(
    x_train_scaled_t,
    y_train_scaled_t,
    layers,
    params,
    loss_func,
    model_path,
    modelname,
    shuffle = True)


#Make a prediction for each location, save as compressed pkl file, and send predictions to AWS for use in CSES
# (No empty placeholder frame is created first: the call's return value is the only assignment needed.)
PredsDF = mlp_model.mlp_predict(
    layers,
    model_path,
    modelname,
    x_test,
    x_test_scaled_t,
    y_test,
    Use_fSCA_Threshold
)

#Take a sample to determine model skill - Take 1000 from each modeling domain
n_samples = 1000
SampleDF, PredsDF = Simple_Eval.SamplePreds(regionlist, PredsDF, df, n_samples)

#Evaluate model performance of the different models
prediction_columns = [f"{modelname}_swe_cm"]
Eval_DF = Simple_Eval.Simple_Eval(
    regionlist,
    SampleDF,
    prediction_columns,
    modelname,
    plots = False,
    keystats = False
)

Epochs completed:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 1/400, Loss: 0.008925901546716998
Epoch 2/400, Loss: 0.003268781192625257
Epoch 3/400, Loss: 0.002987825038190359
