In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from netCDF4 import Dataset

# Data Processing

## set time and regional domain

In [None]:
#"""
# Analysis domain. The ilat/ilon values are index bounds (start inclusive,
# end exclusive) used to slice the island grid when the data are read.
region = 'CONUS'
ilon_start, ilon_end = 45, 120
ilat_start, ilat_end = 110, 150

# NOTE(review): `isl` and `pcp_thrs` are not referenced by any later cell
# visible in this notebook -- confirm whether they are still needed.
isl = 1
pcp_thrs = 0

# Years to load; one set of input files is read per entry.
YYYY_list = [2012]

## read data

In [None]:
# Input directories (absolute, site-specific paths).
WWLLN_dir       = '/home/disk/eos12/wycheng/data/US/WWLLN/'
ERA5_dir        = '/home/disk/eos12/wycheng/data/US/ERA5/'
TRMM_dir        = '/home/disk/eos12/wycheng/data/US/TRMM/'
island_dir      = '/home/disk/eos9/wycheng/LightningMachineLearning/data/cgdata/island/'

island_fname    = 'island_1deg.nc'
# Read the 'island' field for the configured index window. The `with` block
# closes the file as soon as the slice has been read.
with Dataset(island_dir+island_fname,'r') as ncin_island:
    island_in   = ncin_island.variables['island'][ilat_start:ilat_end,ilon_start:ilon_end]

nYYYY           = np.shape(YYYY_list)[0]
if nYYYY == 0:
    raise ValueError('YYYY_list is empty: at least one year is required')


def _join_years(yearly):
    """Join per-year arrays along axis 0 (the leading axis of each file's variable).

    A single year is returned untouched. For several years, masked arrays are
    joined with ``np.ma.concatenate`` because ``np.concatenate``/``np.append``
    do not preserve the input masks.
    """
    if len(yearly) == 1:
        return yearly[0]
    if any(np.ma.isMaskedArray(part) for part in yearly):
        return np.ma.concatenate(yearly, axis=0)
    return np.concatenate(yearly, axis=0)


# Collect one array per year and join once after the loop; appending inside
# the loop would re-copy everything read so far on every iteration.
F_years, cape_years, pcp_years = [], [], []

for iYYYY, YYYY in enumerate(YYYY_list):

    WWLLN_F_fname   = 'WWLLN_'+str(YYYY)+'_F_cg_1deg3hr_US.nc'
    ERA5_cape_fname = 'ERA5_'+str(YYYY)+'_cape_cg_1deg3hr_US.nc'
    TRMM_pcp_fname  = 'TRMM_'+str(YYYY)+'_pcp_cg_1deg3hr_US.nc'

    # Open the three yearly files together; all are closed when the block exits.
    with Dataset(WWLLN_dir+WWLLN_F_fname,'r') as ncin_F, \
         Dataset(ERA5_dir+ERA5_cape_fname,'r') as ncin_cape, \
         Dataset(TRMM_dir+TRMM_pcp_fname,'r') as ncin_pcp:
        F_years.append(ncin_F.variables['F'][:,:,:])
        cape_years.append(ncin_cape.variables['cape'][:,:,:])
        pcp_years.append(ncin_pcp.variables['pcp'][:,:,:])

F_in           = _join_years(F_years)
cape_in        = _join_years(cape_years)
pcp_in         = _join_years(pcp_years)

# Unit conversion to [km-2 yr-1].
# Presumably 111.19... is km per degree (so its square is the area of a
# 1deg x 1deg cell) and 365.25*8 is the number of 3-hourly steps per year,
# matching the '1deg3hr' file names -- TODO confirm.
# NOTE(review): the same cell area is applied at every latitude.
KM_PER_DEG     = 111.19492664455873
STEPS_PER_YEAR = 365.25*8
F_in           = F_in * (1/(KM_PER_DEG**2)) * STEPS_PER_YEAR

# Binary target: 1 where the flash rate is positive, 0 otherwise.
isLightning_in = np.where(F_in>0,1,0)
sqrtcape_in    = cape_in ** 0.5

# Repeat the 2-D island field along the leading axis so it matches F_in.
island_in3d    = np.broadcast_to(island_in, F_in.shape)

In [None]:
# Multiplicative mask: 1 where the island field equals 1, NaN everywhere else,
# so that multiplying a field by it blanks out all other cells.
mask_island = np.where(island_in3d == 1, 1, np.nan)
print(mask_island.shape)

In [None]:
# Apply the island mask to every field: cells where mask_island is NaN become
# NaN and are later removed by dropna() when the DataFrame is built.
F_lnd           = F_in*mask_island
isLightning_lnd = isLightning_in*mask_island
cape_lnd        = cape_in*mask_island
# NOTE(review): sqrtcape_lnd is not used by any later cell visible here.
sqrtcape_lnd    = sqrtcape_in*mask_island
pcp_lnd         = pcp_in*mask_island

In [None]:
# One row per (time, lat, lon) cell, one column per field.
# np.column_stack does not preserve the masks of masked arrays, so masked
# (missing) entries are first replaced by NaN; dropna() then removes them
# together with the cells blanked by the island mask.
_fields = {'F': F_lnd, 'IL': isLightning_lnd, 'CAPE': cape_lnd, 'pcp': pcp_lnd}
dataset = pd.DataFrame(
    data=np.column_stack([
        np.ma.filled(np.ma.asarray(field, dtype=float), np.nan).ravel()
        for field in _fields.values()
    ]),
    columns=list(_fields),
).dropna()

## check data

In [None]:
# Summarise the cleaned table (columns, non-null counts, dtypes, memory use).
dataset.info(verbose=True)

## formatting input (training/test) data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Predictors and target. `y` is kept as a one-column DataFrame because later
# cells index it by column name (y_train[output_name], y_test['IL']).
feature_names = ['CAPE','pcp']
output_name   = ['IL']
X = dataset[feature_names]
y = dataset[output_name]

# Fixed seed so that the split, and every score computed from it, is
# reproducible across kernel restarts (the original used random_state=None).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line after each report.
X_train.info()
y_train.info()

# Machine learning models

## R14: CAPE × pcp threshold baseline

In [None]:
import scipy as sp
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.preprocessing import normalize

In [None]:
class R14:
    """Single-threshold lightning classifier based on the product CAPE * pcp.

    A sample is labelled 1 (lightning) when ``CAPE * pcp >= thrs`` and 0
    otherwise. The methods are static, so both ``R14.fit(...)`` and
    ``R14().fit(...)`` work.
    """

    @staticmethod
    def fit(CAPE, pcp, y, lower=0, upper=4000):
        """Search for the threshold on CAPE*pcp that maximises the F1 score.

        Parameters
        ----------
        CAPE, pcp : array-like of equal length
            Predictor values; their element-wise product is thresholded.
        y : array-like
            True binary labels (0/1).
        lower, upper : float, optional
            Bounds of the threshold search interval (defaults 0 and 4000).

        Returns
        -------
        thrs : float
            Threshold found by the bounded scalar minimiser.
        fval : float
            F1 score obtained on (CAPE, pcp, y) with that threshold.

        Notes
        -----
        The F1 score is a step function of the threshold, so the minimiser
        may stop at a local optimum rather than the global one.
        """
        # Imported explicitly: `import scipy as sp` alone does not guarantee
        # that the `optimize` submodule is loaded on every SciPy version.
        from scipy.optimize import fminbound

        proxy = CAPE * pcp

        def negative_f1(x):
            # Same ">=" rule as the final score and predict(); the original
            # optimised with ">" but evaluated with ">=".
            return -f1_score(y, (proxy >= x).astype(int))

        thrs = fminbound(negative_f1, lower, upper)
        fval = f1_score(y, (proxy >= thrs).astype(int))

        return thrs, fval

    @staticmethod
    def predict(CAPE, pcp, thrs):
        """Classify samples and return a continuous score for ranking.

        Parameters
        ----------
        CAPE, pcp : array-like of equal length
            Predictor values.
        thrs : float
            Decision threshold on CAPE*pcp.

        Returns
        -------
        y_predict : integer array-like
            1 where ``CAPE*pcp >= thrs``, else 0.
        score : array-like
            ``CAPE*pcp`` divided by its maximum over the given samples. It is
            a relative score, not a calibrated probability. If the maximum is
            not positive (or the input is empty), the unscaled product is
            returned, which avoids NaN/inf scores from a zero division.
        """
        proxy = CAPE * pcp
        y_predict = (proxy >= thrs).astype(int)

        peak = np.max(proxy) if np.size(proxy) else 0
        score = proxy / peak if peak > 0 else proxy

        return y_predict, score

In [None]:
# Tune the R14 threshold on the training split only.
r14_thrs, fval = R14.fit(
    X_train['CAPE'],
    X_train['pcp'],
    y_train,
)

In [None]:
# Fitted threshold and the F1 score it achieves on the training split.
print(r14_thrs, fval)

In [None]:
# Apply the fitted threshold to the held-out split. The second output is the
# max-normalised CAPE*pcp product, used later as the ranking score for AUC.
y_predict_r14, y_predict_prob_r14 = R14.predict(
    X_test['CAPE'],
    X_test['pcp'],
    r14_thrs,
)

## random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Small, shallow forest with a fixed seed so that training is repeatable.
rfclf = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    min_samples_split=1000,
    random_state=0,
)

In [None]:
# scikit-learn expects a 1-D target; passing the (n, 1) DataFrame triggers a
# DataConversionWarning, so flatten the single label column first.
rfclf.fit(X_train[feature_names], y_train[output_name].values.ravel())

In [None]:
# Hard class predictions of the random forest on the held-out split.
y_predict_rfclf = rfclf.predict(X_test[feature_names])

## Model Evaluation

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2. Its replacement,
    # RocCurveDisplay.from_estimator, takes the same (estimator, X, y)
    # arguments, so the old name is kept as an alias for the plotting cell.
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# Precision on the held-out split: random forest first, then R14.
for _pred in (y_predict_rfclf, y_predict_r14):
    print(precision_score(y_test['IL'], _pred))

In [None]:
# Recall on the held-out split: random forest first, then R14.
for _pred in (y_predict_rfclf, y_predict_r14):
    print(recall_score(y_test['IL'], _pred))

In [None]:
# F1 score on the held-out split: random forest first, then R14.
for _pred in (y_predict_rfclf, y_predict_r14):
    print(f1_score(y_test['IL'], _pred))

In [None]:
# Area under the ROC curve on the held-out split.
# Random forest: scored with the predicted probability of the positive class.
auc_rfclf = metrics.roc_auc_score(
    y_test,
    rfclf.predict_proba(X_test)[:, 1],
)
# R14: scored with the max-normalised CAPE*pcp product from R14.predict.
auc_r14 = metrics.roc_auc_score(
    y_test,
    y_predict_prob_r14,
)
print(auc_rfclf, auc_r14)

In [None]:
# Trace the R14 ROC curve by hand: sweep the decision threshold and record
# the false-positive and true-positive rates at each value.
xthrs = np.linspace(0,4000,20)
fpr, tpr = [], []
for thr in xthrs:
    # Only the hard predictions are needed; the normalised score is discarded.
    yp, _ = R14.predict(X_test['CAPE'], X_test['pcp'], thr)
    tn, fp, fn, tp = confusion_matrix(y_test['IL'], yp).ravel()
    fpr.append(fp/(fp+tn))
    tpr.append(tp/(tp+fn))

In [None]:
# True-positive rate of R14 at each swept threshold.
print(tpr)

In [None]:
# ROC comparison: the random-forest curve drawn by scikit-learn, with the
# hand-computed R14 curve overlaid on the same axes in red.
plot_roc_curve(rfclf, X_test, y_test, label='RFC (AUC = %0.2f)' % auc_rfclf)
plt.plot(fpr, tpr, 'r-', label='R14 (AUC = %0.2f)' % auc_r14)
plt.legend(fontsize=16)
plt.show()

In [None]:
# Confusion matrix of the random forest (rows: truth, columns: prediction).
pd.DataFrame(
    data=confusion_matrix(y_test['IL'], y_predict_rfclf),
    index=['True No Lightning', 'True Lightning'],
    columns=['Predicted No Lightning', 'Predicted Lightning'],
)

In [None]:
# Confusion matrix of the R14 threshold rule (rows: truth, columns: prediction).
pd.DataFrame(
    data=confusion_matrix(y_test['IL'], y_predict_r14),
    index=['True No Lightning', 'True Lightning'],
    columns=['Predicted No Lightning', 'Predicted Lightning'],
)