In [None]:
import sys
# Show the path of the Python interpreter running this kernel.
print(sys.executable)

In [None]:
import xarray as xr
import dask
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from eofs.xarray import Eof
import time

# Read data

In [None]:
# Open the island file lazily (dask chunks) and crop it to the
# lat 30..40 / lon -120..-110 box that the other datasets are cropped to.
island_dataset = xr.open_dataset(
    '/home/disk/eos12/wycheng/data/US/island/island_1deg_US.nc',
    chunks={'lat': 'auto', 'lon': 'auto'},
)
island_dataset = island_dataset.sel(lat=slice(30, 40), lon=slice(-120, -110))
display(island_dataset)

In [None]:
# Open every WWLLN file matching the glob as one lazy dataset, then crop
# to the lat 30..40 / lon -120..-110 box.
WWLLN_dataset = xr.open_mfdataset(
    '/home/disk/eos12/wycheng/data/US/WWLLN/WWLLN_*_F_cg_1deg3hr_US.nc',
    parallel=True,
    chunks={'Time': 'auto', 'lat': 'auto', 'lon': 'auto'},
)
WWLLN_dataset = WWLLN_dataset.sel(lat=slice(30, 40), lon=slice(-120, -110))

# Rescale F by a constant factor.
# NOTE(review): presumably converts a count per 1-degree cell per 3 h into a
# rate per km^2 per year (111.19 km per degree, 365.25 * 8 three-hour steps
# per year). No cos(lat) area correction is applied -- confirm the units.
f_scale = (1/((111.19492664455873)**2)) * (365.25*8)
WWLLN_dataset['F'] = f_scale * WWLLN_dataset['F']
display(WWLLN_dataset)

In [None]:
# Open every TRMM file matching the glob as one lazy dataset, then crop
# to the lat 30..40 / lon -120..-110 box.
TRMM_dataset = xr.open_mfdataset(
    '/home/disk/eos12/wycheng/data/US/TRMM/TRMM_*_pcp_cg_1deg3hr_US.nc',
    parallel=True,
    chunks={'Time': 'auto', 'lat': 'auto', 'lon': 'auto'},
)
TRMM_dataset = TRMM_dataset.sel(lat=slice(30, 40), lon=slice(-120, -110))
display(TRMM_dataset)

In [None]:
# Open the ERA5 CAPE files, crop to the study box and load the result into
# memory with .compute(). The latitude slice is written 40 -> 30,
# presumably because the ERA5 latitude axis is stored in descending order.
ERA5_cape_dataset = xr.open_mfdataset(
    '/home/disk/eos12/wycheng/data/US/ERA5/ERA5_cape_*.nc',
    parallel=True,
    chunks={'time': 'auto', 'latitude': 'auto', 'longitude': 'auto'},
).sel(latitude=slice(40, 30), longitude=slice(-120, -110)).compute()

# Use the same dimension names as the other datasets, then average onto
# 3-hourly steps.
ERA5_cape_dataset = (
    ERA5_cape_dataset
    .rename({'time': 'Time', 'latitude': 'lat', 'longitude': 'lon'})
    .resample(Time="3h")
    .mean()
)

# Target grid: ten points per axis, from -119.5 to -110.5 in longitude and
# from 30.5 to 39.5 in latitude.
lono = xr.DataArray(np.linspace(-119.5, -110.5, 10), dims='lon')
lato = xr.DataArray(np.linspace(30.5, 39.5, 10), dims='lat')

with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ERA5_cape_dataset = ERA5_cape_dataset.interp(lon=lono, lat=lato, method='linear')

display(ERA5_cape_dataset)

In [None]:
# Merge the four sources into one dataset, then keep 2010-01-01 .. 2019-12-31.
dataset = xr.merge(
    [island_dataset, WWLLN_dataset, TRMM_dataset, ERA5_cape_dataset]
).sel(Time=slice("2010-01-01", "2019-12-31"))
display(dataset)

In [None]:
#dataset.to_netcdf(path='/home/disk/eos12/wycheng/data/US/dataset/dataset_test.nc', mode='w')
#dataset = xr.open_dataset('/home/disk/eos12/wycheng/data/US/dataset/dataset_test.nc')

# Set country borders

In [None]:
import regionmask
import geopandas as gpd

In [None]:
# Shapefile read into a GeoDataFrame (country boundaries, going by the
# directory name).
PATH_TO_SHAPEFILE = '/home/disk/eos10/wycheng/LightningMachineLearning/data/WorldCountriesBoundaries/99bfd9e7-bb42-4728-87b5-07f8c8ac631c2020328-1-1vef4ev.lu5nk.shp'
countries = gpd.read_file(filename=PATH_TO_SHAPEFILE)

# ML Setup

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, average_precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import auc

In [None]:
# Binary label TO: 1.0 wherever F is positive, 0.0 otherwise.
dataset = dataset.assign(TO=lambda ds: (ds['F'] > 0) * 1.0)

In [None]:
# Keep only cells where island == 1 (everything else becomes NaN), flatten
# to a table, and drop every row that contains a NaN.
dataframe = (
    dataset
    .where(dataset['island'] == 1)
    .to_dataframe()
    .dropna(axis=0)
)
dataframe

In [None]:
# Predictor columns passed to every classifier, and the label column.
feature_name, output_name = ['pcp', 'cape'], ['TO']

In [None]:
# Predictors are every column except the label; y stays a one-column
# DataFrame so it can be indexed with `output_name` below.
X = dataframe.drop(output_name, axis=1)
y = dataframe[output_name]

# Seed the undersampler as well as the split: without random_state the
# resampled training set (and every score derived from it) changes from one
# run to the next.
undersample = RandomUnderSampler(sampling_strategy=0.33, random_state=0)

# Split first, then undersample only the training part; the test set keeps
# its original class balance.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
X_train, y_train = undersample.fit_resample(X_train_raw, y_train_raw)

# 1-D array of test labels used as ground truth by every metric below.
y_predict_truth = y_test[output_name].values.ravel()

In [None]:
# One slot per model for the area under the ROC curve and under the PR curve.
n_models = 5
AUCROC, AUCPRC = (np.zeros(n_models) for _ in range(2))

## R14

In [None]:
import scipy as sp
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.preprocessing import normalize

In [None]:
class R14:
    """Single-threshold classifier on the product ``CAPE * pcp``.

    A sample is predicted positive when ``CAPE * pcp >= thrs``. Both methods
    are static, so they are called on the class itself
    (``R14.fit(...)``, ``R14.predict(...)``).
    """

    @staticmethod
    def fit(CAPE, pcp, y, lower=0, upper=4000):
        """Find the threshold on ``CAPE * pcp`` that maximises the F1 score.

        Parameters
        ----------
        CAPE, pcp : array-like
            Predictors; only their element-wise product is used.
        y : array-like
            Binary labels. A one-column DataFrame is accepted and flattened.
        lower, upper : float, optional
            Search interval for the threshold (defaults 0 and 4000).

        Returns
        -------
        thrs : float
            Threshold found by ``scipy.optimize.fminbound``.
        fval : float
            F1 score obtained with ``CAPE * pcp >= thrs``.
        """
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.optimize` is loaded.
        from scipy import optimize

        product = CAPE * pcp
        y_true = np.asarray(y).ravel()

        def negative_f1(x):
            # Same ">=" rule as predict(), so the optimised objective matches
            # the classifier that is actually applied.
            return -f1_score(y_true, (product >= x).astype(int))

        thrs = optimize.fminbound(negative_f1, lower, upper)
        fval = f1_score(y_true, (product >= thrs).astype(int))

        return thrs, fval

    @staticmethod
    def predict(CAPE, pcp, thrs):
        """Classify samples and return a score for ranking them.

        Parameters
        ----------
        CAPE, pcp : array-like
            Predictors; only their element-wise product is used.
        thrs : float
            Decision threshold, e.g. the one returned by ``fit``.

        Returns
        -------
        y_predict : array-like of int
            1 where ``CAPE * pcp >= thrs``, else 0.
        y_predict_proba : array-like of float
            ``CAPE * pcp`` divided by its maximum when that maximum is
            positive; otherwise the product is returned unscaled.
        """
        product = CAPE * pcp
        y_predict = (product >= thrs).astype(int)

        # Guard the normalisation: an empty input has no maximum, and a
        # maximum of zero would turn every score into NaN (0 / 0).
        peak = np.max(product) if np.size(product) else 0.0
        if peak > 0:
            y_predict_proba = product / peak
        else:
            y_predict_proba = product

        return y_predict, y_predict_proba

In [None]:
# Fit the R14 threshold on the (undersampled) training set.
r14_thrs, fval = R14.fit(CAPE=X_train['cape'], pcp=X_train['pcp'], y=y_train)

In [None]:
# Apply the fitted threshold to the test set: hard labels and ranking scores.
y_predict_r14, y_score_r14 = R14.predict(
    CAPE=X_test['cape'],
    pcp=X_test['pcp'],
    thrs=r14_thrs,
)

In [None]:
# R14 on the full test set (slot 0): ROC area, then area under the PR curve.
AUCROC[0] = roc_auc_score(y_predict_truth, y_score_r14)
precision, recall, thresholds = precision_recall_curve(y_predict_truth, y_score_r14)
AUCPRC[0] = auc(recall, precision)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on the two predictors; labels flattened to 1-D.
lrclf = LogisticRegression(random_state=0).fit(
    X_train.loc[:, feature_name],
    y_train.loc[:, output_name].to_numpy().ravel(),
)

In [None]:
# Hard class predictions on the test set.
y_predict_lrclf = lrclf.predict(X_test.loc[:, feature_name])

In [None]:
# Second predict_proba column, used as the score for the ROC/PR curves.
y_score_lrclf = lrclf.predict_proba(X_test.loc[:, feature_name])[:, 1]

In [None]:
# Logistic regression on the full test set (slot 1).
AUCROC[1] = roc_auc_score(y_predict_truth, y_score_lrclf)
precision, recall, thresholds = precision_recall_curve(y_predict_truth, y_score_lrclf)
AUCPRC[1] = auc(recall, precision)

## Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-limited decision tree. random_state is fixed so the fit is
# reproducible, as for the other seeded estimators in this notebook:
# scikit-learn permutes the features at each split, so ties between equally
# good splits are otherwise broken at random.
dtclf = DecisionTreeClassifier(max_depth=5, random_state=0)

dtclf.fit(X_train[feature_name], y_train[output_name].values.ravel())

In [None]:
# Hard class predictions on the test set.
y_predict_dtclf = dtclf.predict(X_test.loc[:, feature_name])

In [None]:
# Second predict_proba column, used as the score for the ROC/PR curves.
y_score_dtclf = dtclf.predict_proba(X_test.loc[:, feature_name])[:, 1]

In [None]:
# Decision tree on the full test set (slot 2).
AUCROC[2] = roc_auc_score(y_predict_truth, y_score_dtclf)
precision, recall, thresholds = precision_recall_curve(y_predict_truth, y_score_dtclf)
AUCPRC[2] = auc(recall, precision)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Small random forest: 10 trees of depth <= 4, seeded for reproducibility.
rf_params = dict(
    n_estimators=10,
    max_depth=4,
    min_samples_split=10,
    random_state=0,
)
rfclf = RandomForestClassifier(**rf_params)

rfclf.fit(X_train[feature_name], y_train[output_name].values.ravel())

In [None]:
# Hard class predictions on the test set.
y_predict_rfclf = rfclf.predict(X_test.loc[:, feature_name])

In [None]:
# Second predict_proba column, used as the score for the ROC/PR curves.
y_score_rfclf = rfclf.predict_proba(X_test.loc[:, feature_name])[:, 1]

In [None]:
# Random forest on the full test set (slot 3).
AUCROC[3] = roc_auc_score(y_predict_truth, y_score_rfclf)
precision, recall, thresholds = precision_recall_curve(y_predict_truth, y_score_rfclf)
AUCPRC[3] = auc(recall, precision)

## Neural Network Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Small multilayer perceptron: two hidden layers (5 and 2 units), L-BFGS
# solver, seeded for reproducibility.
mlp_params = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)
mlpclf = MLPClassifier(**mlp_params)

mlpclf.fit(X_train[feature_name], y_train[output_name].values.ravel())

In [None]:
# Hard class predictions on the test set.
y_predict_mlpclf = mlpclf.predict(X_test.loc[:, feature_name])

In [None]:
# Second predict_proba column, used as the score for the ROC/PR curves.
y_score_mlpclf = mlpclf.predict_proba(X_test.loc[:, feature_name])[:, 1]

In [None]:
# Neural network on the full test set (slot 4).
AUCROC[4] = roc_auc_score(y_predict_truth, y_score_mlpclf)
precision, recall, thresholds = precision_recall_curve(y_predict_truth, y_score_mlpclf)
AUCPRC[4] = auc(recall, precision)

## Evaluating Model Performance

In [None]:
# Each row pairs a model's variable-name suffix with its legend label.
models, model_names = map(list, zip(
    ('r14', 'R14'),
    ('lrclf', 'LR'),
    ('dtclf', 'DT'),
    ('rfclf', 'RF'),
    ('mlpclf', 'NN'),
))
# Plot styling, indexed by model position; spare entries are unused.
colors = ['k', 'b', 'orange', 'g', 'r', 'purple']
markers = ['.', 'v', 's', 'p', '*', 'x', 'd']

In [None]:
fig, ax = plt.subplots()

# Explicit lookup from model id to its test-set scores. This replaces
# building variable names as strings and running them through exec().
y_score_by_model = {
    'r14': y_score_r14,
    'lrclf': y_score_lrclf,
    'dtclf': y_score_dtclf,
    'rfclf': y_score_rfclf,
    'mlpclf': y_score_mlpclf,
}

# One ROC curve per model, labelled with its area under the curve.
for imodel in range(n_models):
    fpr, tpr, threshold = roc_curve(y_predict_truth, y_score_by_model[models[imodel]])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, c=colors[imodel], label = model_names[imodel]+' (AUC = %0.2f)'% roc_auc)

plt.xlabel('False Alarm Rate')
plt.ylabel('True Positive Rate')
plt.legend(fontsize=12,loc='best')
plt.show()

In [None]:
fig, ax = plt.subplots()

# Explicit lookup from model id to its test-set scores. This replaces
# building variable names as strings and running them through exec().
y_score_by_model = {
    'r14': y_score_r14,
    'lrclf': y_score_lrclf,
    'dtclf': y_score_dtclf,
    'rfclf': y_score_rfclf,
    'mlpclf': y_score_mlpclf,
}

# One precision-recall curve per model, labelled with its area under the curve.
for imodel in range(n_models):
    precision, recall, thresholds = precision_recall_curve(y_predict_truth, y_score_by_model[models[imodel]])
    pr_auc = auc(recall, precision)
    # Recall goes on x and precision on y so the curve matches the axis
    # labels below and the auc(recall, precision) value in the legend.
    ax.plot(recall, precision, c=colors[imodel], label = model_names[imodel]+' (AUC = %0.2f)'% pr_auc)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(fontsize=12,loc='best')
plt.show()

In [None]:
fig, ax = plt.subplots()

# One marker per model: x = area under the PR curve, y = area under the ROC curve.
for imodel in range(n_models):
    ax.scatter(
        AUCPRC[imodel],
        AUCROC[imodel],
        c=colors[imodel],
        marker=markers[imodel],
        label=model_names[imodel],
    )

ax.set(title='Model skill', xlabel='Area under PR curve', ylabel='Area under ROC curve')
#ax.set_xlim([0.25,0.5])
#ax.set_ylim([0.7,0.95])
ax.legend(loc='best')

# Examine the performance for dry thunderstorms

In [None]:
# "Dry" test samples are those with pcp below this threshold.
pcp_thrs = 0.01
is_dry = X_test['pcp'] < pcp_thrs

# where() blanks the non-dry rows to NaN and dropna() removes them, so the
# predictors and the labels keep the same rows.
Xdt_test = X_test.where(is_dry).dropna()
ydt_predict_truth = y_test[output_name].where(is_dry).dropna().values.ravel()

The ratio of dry thunderstorms (precipitation below `pcp_thrs`) to all thunderstorms in the test set:

In [None]:
# Count of positive test samples that are also dry, divided by the count of
# all positive test samples (where() leaves NaN elsewhere; count() skips NaN).
thunderstorms = y_test.where(y_test['TO'] > 0)
print(thunderstorms.where(X_test['pcp'] < pcp_thrs).count() / thunderstorms.count())

In [None]:
# R14 predictions on the dry subset.
# NOTE(review): the same R14.predict call is made again two cells below,
# and ydt_score0 is not referenced anywhere else in the visible notebook.
ydt_predict_r14, ydt_score0 = R14.predict(
    CAPE=Xdt_test['cape'],
    pcp=Xdt_test['pcp'],
    thrs=r14_thrs,
)

In [None]:
# Start fresh score arrays for the dry subset. This rebinds AUCROC / AUCPRC,
# so the full-test-set values computed earlier are no longer reachable
# through these names.
AUCROC, AUCPRC = (np.zeros(n_models) for _ in range(2))

In [None]:
# Scores of every fitted model on the dry subset (no refitting).
ydt_predict_r14, ydt_score_r14 = R14.predict(
    CAPE=Xdt_test['cape'],
    pcp=Xdt_test['pcp'],
    thrs=r14_thrs,
)

# Second predict_proba column of each scikit-learn model.
Xdt_features = Xdt_test[feature_name]
ydt_score_lrclf = lrclf.predict_proba(Xdt_features)[:, 1]
ydt_score_dtclf = dtclf.predict_proba(Xdt_features)[:, 1]
ydt_score_rfclf = rfclf.predict_proba(Xdt_features)[:, 1]
ydt_score_mlpclf = mlpclf.predict_proba(Xdt_features)[:, 1]

In [None]:
# Dry-subset scores in slot order: R14, LR, DT, RF, NN.
ydt_scores = (
    ydt_score_r14,
    ydt_score_lrclf,
    ydt_score_dtclf,
    ydt_score_rfclf,
    ydt_score_mlpclf,
)

# Fill slot i with the ROC area and the area under the PR curve of model i.
for slot, ydt_score in enumerate(ydt_scores):
    precision, recall, thresholds = precision_recall_curve(ydt_predict_truth, ydt_score)
    AUCROC[slot] = roc_auc_score(ydt_predict_truth, ydt_score)
    AUCPRC[slot] = auc(recall, precision)

In [None]:
# Same plot configuration as for the full test set.
# Each row pairs a model's variable-name suffix with its legend label.
models, model_names = map(list, zip(
    ('r14', 'R14'),
    ('lrclf', 'LR'),
    ('dtclf', 'DT'),
    ('rfclf', 'RF'),
    ('mlpclf', 'NN'),
))
# Plot styling, indexed by model position; spare entries are unused.
colors = ['k', 'b', 'orange', 'g', 'r', 'purple']
markers = ['.', 'v', 's', 'p', '*', 'x', 'd']

In [None]:
fig, ax = plt.subplots()

# Explicit lookup from model id to its dry-subset scores. This replaces
# building variable names as strings and running them through exec().
ydt_score_by_model = {
    'r14': ydt_score_r14,
    'lrclf': ydt_score_lrclf,
    'dtclf': ydt_score_dtclf,
    'rfclf': ydt_score_rfclf,
    'mlpclf': ydt_score_mlpclf,
}

# One ROC curve per model, labelled with its area under the curve.
for imodel in range(n_models):
    fpr, tpr, threshold = roc_curve(ydt_predict_truth, ydt_score_by_model[models[imodel]])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, c=colors[imodel], label = model_names[imodel]+' (AUC = %0.2f)'% roc_auc)

plt.xlabel('False Alarm Rate')
plt.ylabel('True Positive Rate')
plt.legend(fontsize=12,loc='best')
plt.show()

In [None]:
fig, ax = plt.subplots()

# Explicit lookup from model id to its dry-subset scores. This replaces
# building variable names as strings and running them through exec().
ydt_score_by_model = {
    'r14': ydt_score_r14,
    'lrclf': ydt_score_lrclf,
    'dtclf': ydt_score_dtclf,
    'rfclf': ydt_score_rfclf,
    'mlpclf': ydt_score_mlpclf,
}

# One precision-recall curve per model, labelled with its area under the curve.
for imodel in range(n_models):
    precision, recall, thresholds = precision_recall_curve(ydt_predict_truth, ydt_score_by_model[models[imodel]])
    pr_auc = auc(recall, precision)
    # Recall goes on x and precision on y so the curve matches the axis
    # labels below and the auc(recall, precision) value in the legend.
    ax.plot(recall, precision, c=colors[imodel], label = model_names[imodel]+' (AUC = %0.2f)'% pr_auc)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(fontsize=12,loc='best')
plt.show()

In [None]:
fig, ax = plt.subplots()

# One marker per model: x = area under the PR curve, y = area under the ROC curve.
for imodel in range(n_models):
    ax.scatter(
        AUCPRC[imodel],
        AUCROC[imodel],
        c=colors[imodel],
        marker=markers[imodel],
        label=model_names[imodel],
    )

ax.set(title='Model skill', xlabel='Area under PR curve', ylabel='Area under ROC curve')
#ax.set_xlim([0.25,0.5])
#ax.set_ylim([0.7,0.95])
ax.legend(loc='best')

In [None]:
# Open a separate NetCDF file and display its contents.
# NOTE(review): dataset_test is not used by any other cell visible in this
# notebook -- confirm whether this cell is still needed.
dataset_test = xr.open_dataset('/home/disk/eos12/wycheng/dataset_CAL.nc')
display(dataset_test)