In [None]:
# basic packages
import pandas as pd
import os
import warnings
import pyarrow as pa
import pyarrow.parquet as pq
warnings.filterwarnings("ignore")

#Shared/Utility scripts
import torch 
# Use the CUDA device when torch reports one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from model_scripts import Simple_Eval, dataloader, dataprocessor, xgb_model #had to pip install xgboost
# Home directory of the current user; project paths below are built from it
HOME = os.path.expanduser('~')

# Model name and the directory passed to the model_scripts helpers as `model_path`
modelname = 'XGBoost'
model_path = f"{HOME}/SWEMLv2.0/Model/{modelname}"

# exist_ok=True already makes this a no-op when the directory is present, so a
# separate os.path.exists() guard is redundant (and racy between check and create)
os.makedirs(model_path, exist_ok=True)
print(f"{modelname} development script, {device}")

In [None]:
# Load data
# Selection settings handed to the project data loader. Several of them are
# reused further down to build the prediction output path.
# NOTE: `DataFrame` is a plain string here, not the pandas class.
fSCA_thresh = '20_fSCA_Thresh'
DataFrame = 'Sturm_Seasonality_PrecipVIIRSGeoObsDFs'
output_res = '300M_Resolution'
regionlist = ['SouthernRockies', 'Southwest', 'Northwest']

alldata = dataloader.get_ML_Data(regionlist, output_res, DataFrame, fSCA_thresh)

# Preview the first rows as the cell output
alldata.head()

## Pull out a test condition by date

In [3]:
# Hold out every record dated 2019-03-29 as an independent test condition.
# NOTE(review): the original comment says "in Southwest", but the mask below
# filters on Date only — confirm no other region has data on this date if a
# Southwest-only hold-out is intended.
holdout_mask = alldata['Date'] == '2019-03-29'
TestArea = alldata[holdout_mask].reset_index(drop=True)

# Remove the held-out rows from the training/testing dataset.
# Selecting the complement of the mask removes exactly the held-out rows.
# The previous concat + drop_duplicates(keep=False) approach also dropped every
# row that happened to be duplicated inside `alldata` itself.
df = alldata[~holdout_mask].copy()

# Data Processing

In [None]:
# Clean the data with the project helper
df = dataprocessor.data_clean(df, regionlist)
df.head()

# Temporary for the seasonality relationship: every missing value in every
# column is replaced with 1
df = df.fillna(1)

# Convert the Date column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Feature columns used as model inputs, in the order they are selected from the
# data frames (same names and order as before, grouped by family)
input_columns = [
    # location and terrain
    'cen_lat', 'cen_lon', 'Elevation_m', 'Slope_Deg', 'Aspect_Deg',
    # ns_1 .. ns_6
    'ns_1', 'ns_2', 'ns_3', 'ns_4', 'ns_5', 'ns_6',
    # snow cover, precipitation, region and timing
    'VIIRS_SCA', 'hasSnow', 'season_precip_cm', 'region_class', 'DOS', 'WY_week',
    # weekly means of ns_1 .. ns_6
    'ns_1_week_mean', 'ns_2_week_mean', 'ns_3_week_mean',
    'ns_4_week_mean', 'ns_5_week_mean', 'ns_6_week_mean',
    # seasonal relationship of ns_1 .. ns_6
    'Seasonal_ns_1_rel', 'Seasonal_ns_2_rel', 'Seasonal_ns_3_rel',
    'Seasonal_ns_4_rel', 'Seasonal_ns_5_rel', 'Seasonal_ns_6_rel',
    'sturm_value',
]

# Settings forwarded to the project train/test processor
target = 'swe_cm'
test_years = [2019]
splitratio = 0.33
years = False

# Fit a scaler, save it, and scale the training data (per the original note;
# the work itself happens inside dataprocessor.xgb_processor)
x_train, y_train, x_test, y_test = dataprocessor.xgb_processor(
    regionlist, df, years, splitratio, test_years, target,
    input_columns, model_path, scalertype='MinMax',
)

# Train Model

In [None]:
# Train model
# Candidate values for each hyperparameter handed to XGB_Train
hyperparameters = {
    'max_depth': range(5, 21, 5),
    'n_estimators': range(200, 1500, 500),
    'eta': [0.1],
}
tries = 1  # TODO: document what `tries` controls in XGB_Train
perc_data = 0.25  # fraction of training data used to identify optimal hyperparameters

# Flag forwarded to XGB_Predict when predictions are made
Use_fSCA_Threshold = True

xgb_model.XGB_Train(
    model_path, input_columns, x_train, y_train, tries, hyperparameters, perc_data
)

# Make a prediction for each location in the test split.
# NOTE(review): the original comment also says predictions are saved as a
# compressed pkl file and sent to AWS for use in CSES — that would happen inside
# XGB_Predict, which is not visible here; confirm.
# The former `PredsDF = pd.DataFrame()` placeholder was a dead store (overwritten
# immediately) and is removed so a failed prediction cannot leave an empty frame
# behind for later cells.
PredsDF = xgb_model.XGB_Predict(
    model_path, modelname, x_test, y_test, Use_fSCA_Threshold
)

In [None]:
# Development convenience: re-import Simple_Eval so edits made to the module on
# disk are picked up without restarting the kernel
import importlib
importlib.reload(Simple_Eval)

In [None]:
# Take a sample to determine model skill - 1000 from each modeling domain
n_samples = 1000
SampleDF, PredsDF = Simple_Eval.SamplePreds(regionlist, PredsDF, df, n_samples)

# Figure options handed to the evaluation helper
figname = 'Model-Testing-Split-Performance'
savfig = False

# Evaluate model performance on the sampled predictions
prediction_columns = [f"{modelname}_swe_cm"]
Eval_DF = Simple_Eval.Simple_Eval(
    regionlist, SampleDF, prediction_columns, modelname, savfig, figname,
    plots=False, keystats=False,
)

In [6]:
# Output directory for predictions of this feature set / resolution / threshold
Ppath = f"{HOME}/SWEMLv2.0/Predictions/{DataFrame}/{output_res}/{fSCA_thresh}"
# exist_ok=True makes a separate os.path.exists() guard unnecessary
os.makedirs(Ppath, exist_ok=True)

# Save the model predictions as Parquet with Brotli compression
table = pa.Table.from_pandas(PredsDF)
pq.write_table(table, f"{Ppath}/Test_preds.parquet", compression='BROTLI')

## Make a prediction on the held-out date


In [None]:

# Prep prediction data for the held-out date: observed target and model features
# NOTE(review): TestArea was sliced from `alldata` before data_clean() and
# fillna(1) were applied to the training frame — confirm the hold-out features
# do not need the same preparation.
dropcols = ['cell_id', 'Date', 'swe_cm', 'region']
y_test_Area = TestArea['swe_cm'].to_frame()
x_test_Area = TestArea.drop(columns=dropcols)[input_columns]

# Make a prediction
holdoutdate = xgb_model.XGB_Predict(
    model_path, modelname, x_test_Area, y_test_Area, Use_fSCA_Threshold
)

# Add geospatial information to the prediction DF by placing the original
# columns next to the predictions, keeping the first of any repeated column name
EvalDF = pd.concat([TestArea, holdoutdate], axis=1)
first_occurrence = ~EvalDF.columns.duplicated()
EvalDF = EvalDF.loc[:, first_occurrence].copy()

EvalDF.head(5)

In [10]:
# Output directory for predictions of this feature set / resolution / threshold
Ppath = f"{HOME}/SWEMLv2.0/Predictions/{DataFrame}/{output_res}/{fSCA_thresh}"
# exist_ok=True makes a separate os.path.exists() guard unnecessary
os.makedirs(Ppath, exist_ok=True)

# Save the held-out-date predictions (with all features) as Parquet with
# Brotli compression
table = pa.Table.from_pandas(EvalDF)
pq.write_table(table, f"{Ppath}/All_Feats_HoldOut_03-29-2019.parquet", compression='BROTLI')

In [None]:
# Display the column labels of the combined hold-out frame
EvalDF.columns

In [None]:
import matplotlib.pyplot as plt
import geopandas as gpd

from mpl_toolkits.axes_grid1 import make_axes_locatable

def SpatialAnalysis(EvalDF, column='Elevation_m', legend=False):
    """Plot the rows of ``EvalDF`` as points colored by one attribute.

    Parameters
    ----------
    EvalDF : pandas.DataFrame
        Frame containing ``cen_lon`` and ``cen_lat`` columns (used as the x and y
        point coordinates) and the column named by ``column``.
    column : str, default 'Elevation_m'
        Name of the column used to color the points.
    legend : bool, default False
        Forwarded to ``GeoDataFrame.plot``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    # Convert to a geopandas DF; no CRS is assigned to the point geometry
    points = gpd.points_from_xy(EvalDF.cen_lon, EvalDF.cen_lat)
    Pred_Geo = gpd.GeoDataFrame(EvalDF, geometry=points)

    ax = Pred_Geo.plot(column=column, legend=legend)
    # Label the figure so it can be read on its own
    ax.set_title(column)
    ax.set_xlabel('cen_lon')
    ax.set_ylabel('cen_lat')


SpatialAnalysis(EvalDF)