# Fitting a distribution to radius to maximum wind observations

The historical record has only sparse observations of radius to maximum winds $R_{mw}$ in the Australian region (2002 onwards). As in Vickery and Skerlj (2000), we assume that $R_{mw}$ fits a log-normal distribution. Powell _et al._ (2005) provide a functional form for the distribution (their Eq. 7), and we will use this as a first estimate. The resulting model is intended for application in setting $R_{mw}$ values for stochastically generated storms in TCRM.

Note that this model describes the log-normal distribution of $R_{mw}$ in kilometres -- Powell _et al._ define their model in nautical miles.

In [1]:
%matplotlib inline

from __future__ import division, print_function
import os
from os.path import join as pjoin
from matplotlib import pyplot as plt
from datetime import datetime, timedelta

from Utilities.metutils import convert

import numpy as np
import scipy.stats as stats

import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

from statsmodels.tools.tools import ECDF

import seaborn as sns
sns.set_style("ticks")
sns.set_context("poster")

First a short function to convert the formatted latitude/longitude values to actual numbers.

In [2]:
def convertLatLon(strval):
    """
    Convert a formatted latitude/longitude string to a number.

    The string holds the value in tenths of a degree followed by a
    hemisphere letter, e.g. '140S' becomes -14.0. Southern and western
    values are negated, and longitudes (E/W) are then wrapped into
    [0, 360), so '1700W' becomes 190.0.

    :param str strval: string containing the latitude or longitude.

    :returns: Latitude/longitude as a float value.

    """
    suffix = strval[-1].upper()
    value = float(strval[:-1]) / 10.
    if suffix in ('S', 'W'):
        value = -value
    if suffix in ('E', 'W'):
        # Express longitudes on a 0-360 scale
        value %= 360
    return value
            

Define the data structure and a small function to load a file. This uses the JTWC data format, described [here](http://www.usno.navy.mil/NOOC/nmfc-ph/RSS/jtwc/best_tracks/shindex.php). 

In [3]:
# Field names of a best-track record, in the order they appear in the file.
COLNAMES = ['BASIN','Number', 'Datetime','TECHNUM', 'TECH','TAU', 'Latitude', 'Longitude', 'Windspeed','Pressure',
            'Status', 'RAD', 'WINDCODE','RAD1', 'RAD2','RAD3', 'RAD4','Poci', 'Roci','rMax', 'GUSTS','EYE',
            'SUBREGION','MAXSEAS', 'INITIALS','DIR', 'SPEED','STORMNAME', 'DEPTH','SEAS',
            'SEASCODE','SEAS1', 'SEAS2','SEAS3', 'SEAS4']

# numpy format for each field (same order as COLNAMES).
COLTYPES = ['|S2', 'i', datetime, 'i', '|S4', 'i', 'f', 'f', 'f', 'f',
            '|S4', 'f', '|S3', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f',
            '|S1', 'f', '|S3', 'f', 'f', '|S10', '|S1', 'f',
            '|S3', 'f', 'f', 'f', 'f']

# Units of each field as stored in the file ('' where no unit is recorded here).
COLUNITS = ['', '', '', '', '', '', '', '', 'kts', 'hPa',
            '', 'nm', '', 'nm', 'nm', 'nm', 'nm', 'hPa', 'nm', 'nm', 'kts', 'nm',
            '', '', '', 'degrees', 'kts', '', '', '',
            '', '', '', '', '']

# Format of the 'Datetime' field, e.g. 2013010112.
DATEFORMAT = "%Y%m%d%H"
dtype = np.dtype({'names':COLNAMES, 'formats':COLTYPES})

# Per-column converters, keyed by column index into COLNAMES. Every converter
# strips the padding spaces and the trailing comma that come with each
# fixed-width field. Radii (columns 11, 13-16, 18, 19) are converted from
# their file units to kilometres; an empty field is treated as 0.
converters = {
    1: lambda s: s.strip(' ,'),
    2: lambda s: datetime.strptime(s.strip(' ,'), DATEFORMAT),
    6: lambda s: float(convertLatLon(s.strip(' ,'))),
    7: lambda s: float(convertLatLon(s.strip(' ,'))),
    8: lambda s: s.strip(' ,'),
    9: lambda s: s.strip(' ,'),
    10: lambda s: s.strip(' ,'),
    11: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[11], 'km'),
    12: lambda s: s.strip(' ,'),
    13: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[13], 'km'),
    14: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[14], 'km'),
    15: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[15], 'km'),
    16: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[16], 'km'),
    # Poci: an empty field becomes 0, matching the other numeric converters,
    # rather than failing inside float().
    17: lambda s: float(s.strip(' ,') or 0),
    18: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[18], 'km'),
    19: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[19], 'km'),
}

# Fixed field widths (in characters) of the first 23 columns.
delimiter = (3,4,12,4,6,5,7,7,5,6,4,5,5,6,6,6,6,6,6,5,5,5,5)
skip_header = 0
# Only the first 23 columns are read by default.
usecols = tuple(range(23))
# Empty fields are treated as missing and filled with 0.
missing_value = ""
filling_values = 0

def loadData(filename):
    """
    Read a fixed-width best-track file into a numpy structured array.

    Three column layouts are tried in turn, moving to the next one only
    when `numpy.genfromtxt` raises an IndexError:

    1. the module-level `delimiter` widths with the module-level `usecols`;
    2. the same widths, first 18 columns only;
    3. the first nine field widths, first 9 columns only.

    An IndexError raised by the last layout is not caught.

    :param str filename: path of the file to read.

    :returns: array produced by the first layout that could be read.

    """
    def _read(widths, columns):
        # All attempts share every option except the widths and column set.
        return np.genfromtxt(filename, dtype, delimiter=widths,
                             skip_header=skip_header, converters=converters,
                             missing_values=missing_value,
                             filling_values=filling_values, usecols=columns,
                             autostrip=True, invalid_raise=False)

    fallbacks = [(delimiter, usecols), (delimiter, tuple(range(18)))]
    for widths, columns in fallbacks:
        try:
            return _read(widths, columns)
        except IndexError:
            # presumably the file has fewer columns than requested - try a
            # narrower layout
            pass
    return _read([3,4,12,4,6,5,7,7,5], tuple(range(9)))


Often the b-deck files contain multiple records with the same time stamp. This is to record information on different wind speed radii (e.g. the radius to 34-knot winds, 48-knot winds, etc.). We can quickly filter out this extra information using [`numpy.unique()`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html). Additional filtering restricts to a known domain and only those storms that are of Tropical Storm or Typhoon strength.

In [4]:
def filterData(data):
    """
    Reduce a best-track array to one usable record per time stamp.

    Only the first record for each distinct 'Datetime' value is kept. Of
    those, a record is returned when all of the following hold:

    * 'Status' is 'TS' or 'TY';
    * 'Longitude' lies in [90, 180];
    * 'rMax' is at least 0.1;
    * 'Poci' is greater than 0.1.

    :param data: structured array with (at least) the fields 'Datetime',
                 'Status', 'Longitude', 'rMax' and 'Poci'.

    :returns: structured array of the selected records, ordered by time stamp.

    """
    # A single-record file may arrive as a 0-d array; make it indexable.
    data = np.atleast_1d(data)

    # `idx` holds the position of the first record for each time stamp.
    _, idx = np.unique(data['Datetime'], return_index=True)
    first = data[idx]

    status = first['Status']
    if status.dtype.kind == 'S':
        # Byte strings never compare equal to text literals on Python 3.
        status = status.astype('U')

    keep = (status == 'TS') | (status == 'TY')
    keep &= (first['Longitude'] >= 90.) & (first['Longitude'] <= 180.)
    keep &= (first['rMax'] >= 0.1)
    keep &= (first['Poci'] > 0.1)

    # The mask is relative to the de-duplicated records, so it must index
    # `first` (not the original `data`).
    return first[keep]

Now churn through the best-track files (unmodified) and pull out $R_{mw}$ and central pressure estimates. This assumes you have the JTWC best track files somewhere locally - no download performed.

In [5]:
def processfiles(path, basin):
    """
    Collect filtered best-track values from every file under `path`.

    The directory tree is walked and only directories whose path ends with
    `basin` are used. Each file in such a directory is loaded with
    `loadData` and filtered with `filterData`. Files whose loaded array
    lacks any of the fields needed here are skipped.

    :param str path: root directory to search.
    :param str basin: suffix identifying the directories to process
                      (e.g. 'sh').

    :returns: tuple ``(rmax, prs, poci, lat)`` of 1-d float arrays holding
              the 'rMax', 'Pressure', 'Poci' and 'Latitude' values of all
              selected records. The arrays are empty if nothing matched.

    """
    # Every field touched by filterData or read below. Checked *before*
    # filtering so that files loaded with a reduced column set are skipped
    # rather than failing on a missing field.
    required = ('Datetime', 'Status', 'Longitude', 'Latitude',
                'Pressure', 'Poci', 'rMax')

    def _join(chunks):
        # Start from an empty float array so the result is a float array
        # even when no file contributed any records.
        return np.concatenate([np.array([])] + [np.ravel(c) for c in chunks])

    rmax_parts, prs_parts, poci_parts, lat_parts = [], [], [], []
    for root, _dirs, files in os.walk(path):
        if not root.endswith(basin):
            continue
        for fname in files:
            data = loadData(pjoin(root, fname))
            names = data.dtype.names or ()
            if not all(field in names for field in required):
                continue
            data = filterData(data)
            rmax_parts.append(data['rMax'])
            prs_parts.append(data['Pressure'])
            poci_parts.append(data['Poci'])
            lat_parts.append(data['Latitude'])

    rmax = _join(rmax_parts)
    prs = _join(prs_parts)
    poci = _join(poci_parts)
    lat = _join(lat_parts)
    return rmax, prs, poci, lat

In [16]:
# Root directory of the locally stored best-track files (machine-specific;
# edit to suit your own copy of the data).
inputPath = "C:\\WorkSpace\\data\\Raw\\best_tracks"
# Gather records from every directory whose path ends in 'sh'.
rmax, prs, poci, lat = processfiles(inputPath, 'sh')

Now we test the first hypothesis - that the distribution of $R_{mw}$ is represented by a log-normal distribution. Plot a histogram of the observations, together with a kernel density estimate and the probability density function of a fitted log-normal distribution.

In [17]:
# Column header for the fitted-parameter line printed below.
print("Parameter estimates:       Shape; Location (fixed);    Scale;    Mean")

# Histogram of the observations in 10 km bins, with a kernel density
# estimate clipped to the plotted range.
fig, ax = plt.subplots(1, 1)
hist_bins = np.arange(0, 201, 10)
kde_options = {'clip': (0, 200), 'label': "KDE"}
sns.distplot(rmax, bins=hist_bins, kde_kws=kde_options, ax=ax)

# Fit a log-normal with the location fixed at zero; the sample mean is
# supplied as the starting value for the scale.
rmax_mean = np.mean(rmax)
shape, loc, scale = stats.lognorm.fit(rmax, scale=rmax_mean, floc=0)
print("Southern hemisphere basin: ", shape, loc, scale, rmax_mean)

# Fitted PDF on a 1 km grid (for plotting) and fitted CDF at the sorted
# observations (compared with the empirical CDF later on).
x = np.arange(1, 201)
v = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
fcdf = stats.lognorm.cdf(np.sort(rmax), shape, loc=loc, scale=scale)

ax.plot(x, v, label="Lognormal fit")
ax.set_title("Southern hemisphere (2002-2013)")
ax.set_xlabel(r'$R_{mw}$ (km)')
ax.set_xlim((0, 200))
ax.legend(loc=0)

fig.tight_layout()
sns.despine()

Evaluate the empirical CDF of the observations, then compare it to the fitted CDF evaluated above. 

In [18]:
# Empirical CDF of the observed R_mw.
ecdf = ECDF(rmax, side='left')

# The first element of ecdf.y is dropped so that the remaining values line
# up one-to-one with the sorted observations.
rmax_sorted = np.sort(rmax)
ecdf_values = ecdf.y[1:]

plt.plot(rmax_sorted, ecdf_values)
plt.plot(rmax_sorted, fcdf, 'r')

# Goodness of fit: squared correlation between the empirical CDF and the
# fitted log-normal CDF, both evaluated at the sorted observations.
rsq = stats.pearsonr(ecdf_values, fcdf)[0]**2
plt.text(10, 0.9, r"$R^{2}$ = %f" % rsq)

## Fitting to multiple parameters

In Powell _et al._ (2005), the natural logarithm of $R_{mw}$ is modelled as follows:

$\ln R_{mw} = \alpha + \beta \Delta p + \gamma \Delta p^2 + \zeta \lambda^2 + \varepsilon$

The constants were determined by a generalised linear model, $\lambda$ is the latitude and $\varepsilon$ is a normal random variable with zero mean. 

Additional filtering is needed here to remove records where the pressure of the outermost closed isobar ($P_{oci}$) is not known.

In [19]:
def filterPoci(field, poci):
    """
    Select the elements of `field` whose matching `poci` value is known.

    A `poci` value of at least 0.1 counts as known.

    :param field: array to be filtered.
    :param poci: array of pressures of the outermost closed isobar, aligned
                 with `field`.

    :returns: the elements of `field` at the positions where `poci` >= 0.1.

    """
    valid = poci >= 0.1
    return field[np.where(valid)]


# Pressure deficit: pressure of the outermost closed isobar minus the
# storm pressure, for records with a known Poci.
poci_known = filterPoci(poci, poci)
prs_known = filterPoci(prs, poci)
dp = poci_known - prs_known
dpsq = np.square(dp)

# Latitude restricted to the same records, and its square.
lat = filterPoci(lat, poci)
latsq = np.square(lat)

Now fit a model, based on Powell _et al._ above. 

In [20]:
# Design matrix: constant term followed by dp, dp^2 and lat^2.
predictors = np.column_stack((dp, dpsq, latsq))
X = sm.add_constant(predictors)

# Response: natural log of the observed R_mw (records with known Poci).
y = np.asarray(np.log(filterPoci(rmax, poci)))

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())
for label, values in (('Parameters: ', results.params),
                      ('P-value: ', results.pvalues),
                      ('R-squared: ', results.rsquared),
                      ('T-values: ', results.tvalues)):
    print(label, values)

On first inspection, this is counter to the expected outcome. The coefficient for the $\Delta p^2$ term is positive, while the coefficient for the $\Delta p$ term is negative. This would imply an increasing $R_{mw}$ for more intense storms (i.e. with larger $\Delta p$). 

First though, we fit the data using Generalized Least Squares - this better accounts for a degree of correlation between the residuals (error terms) of successive records. 

In [21]:
from scipy.linalg import toeplitz

# Estimate the lag-1 autocorrelation of the OLS residuals by regressing each
# residual on the one before it. (A coefficient of the main regression is
# not an autocorrelation and must not be used for this.)
# NOTE(review): the records are concatenated across files, so treating them
# as a single AR(1) sequence is an approximation - confirm this is intended.
ols_resid = np.asarray(results.resid)
resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
rho = resid_fit.params[1]

# AR(1) error covariance structure: sigma[i, j] = rho ** |i - j|.
order = toeplitz(np.arange(len(ols_resid)))
sigma = rho ** order

gls_model = sm.GLS(y, X, sigma=sigma)
gls_results = gls_model.fit()
print(gls_results.summary())
print('Parameters: ', gls_results.params)
print('P-value: ', gls_results.pvalues)
print('R-squared: ', gls_results.rsquared)
print('T-values: ', gls_results.tvalues)

The results are similar. The coefficient for the $\Delta p^2$ term is positive, and the coefficient for the $\Delta p$ term is negative.

We now plot the function to visualise what's happening. We exclude the random innovation term ($\varepsilon$) to highlight the functional form of the model.

In [23]:
# Pressure deficits (hPa) and latitudes (degrees) at which to draw the model.
deltap = np.linspace(0, 100, 100)
lats = np.arange(-30, -1, 4)

f = gls_results.params

# Install the palette before creating the axes: an axes picks up its colour
# cycle when it is created, so setting the palette afterwards would not
# affect the lines drawn on this figure.
sns.set_palette(sns.color_palette("coolwarm", len(lats)))
fig, ax = plt.subplots(1,1)
for plot_lat in lats:
    # Deterministic part of the model only - no random term.
    yy = f[0] + f[1]*deltap + f[2]*deltap*deltap + f[3]*plot_lat*plot_lat
    ax.plot(deltap, np.exp(yy), label="%d"%plot_lat)

ax.set_ylabel(r"$R_{mw}$ (km)")
ax.set_xlabel(r"$\Delta p$ (hPa)")
ax.legend(loc=1)

The functional form is parabolic in $\Delta p$ (as expected), with the local minimum near $\Delta p = 60$ hPa. Above this value (i.e. more intense storms), $R_{mw}$ would tend to increase. Latitude has only a minor influence on $R_{mw}$, but remains a useful predictor nonetheless.

Now, let us examine the residuals from the GLS model:

In [24]:
# Distribution of the GLS residuals.
ax = sns.distplot(gls_results.resid, kde_kws={'label':'Residuals', 'linestyle':'--'})

# Fit a normal distribution to the residuals: fp = (mean, standard deviation).
# The normal distribution has no shape parameter, so no starting values are
# passed.
fp = stats.norm.fit(gls_results.resid)
x = np.linspace(-2, 2, 1000)
print(fp)
print(stats.mstats.normaltest(gls_results.resid))

ax.plot(x, stats.norm.pdf(x, fp[0], fp[1]), label='Normal')
# Show the labels attached to the two curves.
ax.legend(loc=0)

This provides an estimate of the magnitude of random variation around the fitted model. 

We now put the full model together, using randomly sampled data from the observed dataset. 

In [28]:
# Number of synthetic storms to generate.
n_samples = 200

# Draw (dp, latitude) pairs from the observations, with replacement.
nx = len(dp)
ind = np.random.choice(np.arange(nx), n_samples, replace=True)
dp0 = dp[ind]
l0 = lat[ind]

# Full model: fitted functional form plus zero-mean normal noise whose
# standard deviation is the one fitted to the GLS residuals.
yy = f[0] + f[1]*dp0 + f[2]*dp0*dp0 + f[3]*l0*l0 + np.random.normal(scale=fp[1], size=n_samples)
rm = np.exp(yy)

fig, ax = plt.subplots(1, 1)
# '+' is an unfilled marker: it is drawn in the colour given by `c` and any
# edge colour is ignored, so use black to keep the observations visible.
ax.scatter(dp, filterPoci(rmax, poci), c='k', s=50, marker='+', label='Observations')
ax.scatter(dp0, rm, c=np.abs(l0), cmap=sns.light_palette('blue', as_cmap=True), s=40, label='Model', alpha=0.5)
ax.set_xlim(0, 100)
ax.set_xlabel(r"$\Delta p$ (hPa)")
ax.set_ylabel(r"$R_{mw}$ (km)")
ax.set_ylim(0, 200)
ax.legend(loc=1)
ax.grid()


So the model reproduces the observations reasonably well on visual inspection. Modelled values of $R_{mw}$ are generally less than 50 km for the most intense storms, while for weak storms, $R_{mw}$ values tend to be higher, with maximum values occurring for those storms with $\Delta p < 20$ hPa. 

The overall distribution of $R_{mw}$ is also well reproduced. Here, we present the distribution from the $R_{mw}$ model with the fitted log-normal distribution for the observations. There's a slight over-representation of smaller storms ($R_{mw} < 25$ km), but above this the distributions match well.

In [52]:
# Histogram (5 km bins) and kernel density estimate of the modelled R_mw.
fig, ax = plt.subplots(1, 1)
model_bins = np.arange(0, 101, 5)
model_kde = {'clip': (0, 100), 'label': "Model data (KDE)"}
sns.distplot(rm, bins=model_bins, kde_kws=model_kde, ax=ax)

# Log-normal PDF fitted to the observations, on a 1 km grid.
x = np.arange(1, 101)
v = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
ax.plot(x, v, label="Lognormal fit from observations", color='r')

ax.set_xlabel(r'$R_{mw}$ (km)')
ax.set_xlim((0, 100))
ax.legend(loc=0)

## A reduced model for $R_{mw}$ - Wang and Rosowsky

Wang and Rosowsky (2012) used a reduced model, where the dependence is only on the square of the pressure deficit and latitude:

$\ln R_{mw} = \alpha + \gamma \Delta p^2 + \zeta |\lambda| + \varepsilon$

First fit an ordinary least squares model:

In [53]:
# Design matrix: constant term followed by dp^2 and |lat|.
predictors = np.column_stack((dpsq, np.abs(lat)))
X = sm.add_constant(predictors)

# Response: natural log of the observed R_mw (records with known Poci).
y = np.asarray(np.log(filterPoci(rmax, poci)))

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())
for label, values in (('Parameters: ', results.params),
                      ('P-value: ', results.pvalues),
                      ('R-squared: ', results.rsquared),
                      ('T-values: ', results.tvalues)):
    print(label, values)

Now use a generalised least squares model:

In [54]:
from scipy.linalg import toeplitz

# Estimate the lag-1 autocorrelation of the OLS residuals by regressing each
# residual on the one before it. (A coefficient of the main regression is
# not an autocorrelation and must not be used for this.)
# NOTE(review): the records are concatenated across files, so treating them
# as a single AR(1) sequence is an approximation - confirm this is intended.
ols_resid = np.asarray(results.resid)
resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
rho = resid_fit.params[1]

# AR(1) error covariance structure: sigma[i, j] = rho ** |i - j|.
order = toeplitz(np.arange(len(ols_resid)))
sigma = rho ** order

gls_model = sm.GLS(y, X, sigma=sigma)
gls_results = gls_model.fit()
print(gls_results.summary())
print('Parameters: ', gls_results.params)
print('P-value: ', gls_results.pvalues)
print('R-squared: ', gls_results.rsquared)
print('T-values: ', gls_results.tvalues)

In [55]:
# Pressure deficits (hPa) and latitudes (degrees) at which to draw the model.
deltap = np.linspace(0, 100, 100)
lats = np.arange(-30, -1, 4)

f = gls_results.params

# Install the palette before creating the axes: an axes picks up its colour
# cycle when it is created, so setting the palette afterwards would not
# affect the lines drawn on this figure.
sns.set_palette(sns.color_palette("coolwarm", len(lats)))
fig, ax = plt.subplots(1,1)
for plot_lat in lats:
    # Deterministic part of the reduced model only - no random term.
    yy = f[0] + f[1]*deltap*deltap + f[2]*np.abs(plot_lat)
    ax.plot(deltap, np.exp(yy), label="%d"%plot_lat)

ax.set_ylabel(r"$R_{mw}$ (km)")
ax.set_xlabel(r"$\Delta p$ (hPa)")
ax.legend(loc=1)

In [56]:
# Distribution of the GLS residuals.
ax = sns.distplot(gls_results.resid, kde_kws={'label':'Residuals', 'linestyle':'--'})

# Fit a normal distribution to the residuals: fp = (mean, standard deviation).
# The normal distribution has no shape parameter, so no starting values are
# passed.
fp = stats.norm.fit(gls_results.resid)
x = np.linspace(-2, 2, 1000)
print(fp)
print(stats.mstats.normaltest(gls_results.resid))

ax.plot(x, stats.norm.pdf(x, fp[0], fp[1]), label='Normal')
# Show the labels attached to the two curves.
ax.legend(loc=0)

In [73]:
# Number of synthetic storms to generate.
n_samples = 200

# Draw (dp, latitude) pairs from the observations, with replacement.
nx = len(dp)
ind = np.random.choice(np.arange(nx), n_samples, replace=True)
dp0 = dp[ind]
l0 = lat[ind]

# Full reduced model: fitted functional form plus zero-mean normal noise
# whose standard deviation is the one fitted to the GLS residuals.
yy = f[0] + f[1]*dp0*dp0 + f[2]*np.abs(l0) + np.random.normal(scale=fp[1], size=n_samples)
rm = np.exp(yy)

fig, ax = plt.subplots(1, 1)
# '+' is an unfilled marker: it is drawn in the colour given by `c` and any
# edge colour is ignored, so use black to keep the observations visible.
ax.scatter(dp, filterPoci(rmax, poci), c='k', s=50, marker='+', label='Observations')
ax.scatter(dp0, rm, c=np.abs(l0), s=40, label='Model')
ax.set_xlim(0, 100)
ax.set_xlabel(r"$\Delta p$ (hPa)")
ax.set_ylabel(r"$R_{mw}$ (km)")
ax.set_ylim(0, 200)
ax.legend(loc=1)
ax.grid()

In [62]:
# Histogram (5 km bins) and kernel density estimate of the modelled R_mw.
fig, ax = plt.subplots(1, 1)
model_bins = np.arange(0, 101, 5)
model_kde = {'clip': (0, 100), 'label': "Model data (KDE)"}
sns.distplot(rm, bins=model_bins, kde_kws=model_kde, ax=ax)

# Log-normal PDF fitted to the observations, on a 1 km grid.
x = np.arange(1, 101)
v = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
ax.plot(x, v, label="Lognormal fit from observations", color='r')

ax.set_xlabel(r'$R_{mw}$ (km)')
ax.set_xlim((0, 100))
ax.legend(loc=0)

## A simple model

We use a simple model, using $\Delta p$ and the absolute latitude $|\lambda|$ only as predictors - i.e.:

$\ln R_{mw} = \alpha + \beta \Delta p + \gamma |\lambda| + \varepsilon$


In [75]:
# Design matrix: constant term followed by dp and |lat|.
predictors = np.column_stack((dp, np.abs(lat)))
X = sm.add_constant(predictors)

# Response: natural log of the observed R_mw (records with known Poci).
y = np.asarray(np.log(filterPoci(rmax, poci)))

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())
for label, values in (('Parameters: ', results.params),
                      ('P-value: ', results.pvalues),
                      ('R-squared: ', results.rsquared),
                      ('T-values: ', results.tvalues)):
    print(label, values)

In [76]:
from scipy.linalg import toeplitz

# Estimate the lag-1 autocorrelation of the OLS residuals by regressing each
# residual on the one before it. (A coefficient of the main regression is
# not an autocorrelation and must not be used for this.)
# NOTE(review): the records are concatenated across files, so treating them
# as a single AR(1) sequence is an approximation - confirm this is intended.
ols_resid = np.asarray(results.resid)
resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
rho = resid_fit.params[1]

# AR(1) error covariance structure: sigma[i, j] = rho ** |i - j|.
order = toeplitz(np.arange(len(ols_resid)))
sigma = rho ** order

gls_model = sm.GLS(y, X, sigma=sigma)
gls_results = gls_model.fit()
print(gls_results.summary())
print('Parameters: ', gls_results.params)
print('P-value: ', gls_results.pvalues)
print('R-squared: ', gls_results.rsquared)
print('T-values: ', gls_results.tvalues)

In [77]:
# Pressure deficits (hPa) and latitudes (degrees) at which to draw the model.
deltap = np.linspace(0, 100, 100)
lats = np.arange(-30, -1, 4)

f = gls_results.params

# Install the palette before creating the axes: an axes picks up its colour
# cycle when it is created, so setting the palette afterwards would not
# affect the lines drawn on this figure.
sns.set_palette(sns.color_palette("coolwarm", len(lats)))
fig, ax = plt.subplots(1,1)
for plot_lat in lats:
    # Deterministic part of the simple model only - no random term.
    yy = f[0] + f[1]*deltap + f[2]*np.abs(plot_lat)
    ax.plot(deltap, np.exp(yy), label="%d"%plot_lat)

ax.set_ylabel(r"$R_{mw}$ (km)")
ax.set_xlabel(r"$\Delta p$ (hPa)")
ax.legend(loc=1)

In [78]:
# Distribution of the GLS residuals.
ax = sns.distplot(gls_results.resid, kde_kws={'label':'Residuals', 'linestyle':'--'})

# Fit a normal distribution to the residuals: fp = (mean, standard deviation).
# The normal distribution has no shape parameter, so no starting values are
# passed.
fp = stats.norm.fit(gls_results.resid)
x = np.linspace(-2, 2, 1000)
print(fp)
print(stats.mstats.normaltest(gls_results.resid))

ax.plot(x, stats.norm.pdf(x, fp[0], fp[1]), label='Normal')
# Show the labels attached to the two curves.
ax.legend(loc=0)

In [80]:
# Number of synthetic storms to generate.
n_samples = 200

# Draw (dp, latitude) pairs from the observations, with replacement.
nx = len(dp)
ind = np.random.choice(np.arange(nx), n_samples, replace=True)
dp0 = dp[ind]
l0 = lat[ind]

# Full simple model: fitted functional form plus zero-mean normal noise
# whose standard deviation is the one fitted to the GLS residuals.
yy = f[0] + f[1]*dp0 + f[2]*np.abs(l0) + np.random.normal(scale=fp[1], size=n_samples)
rm = np.exp(yy)

fig, ax = plt.subplots(1, 1)
# '+' is an unfilled marker: it is drawn in the colour given by `c` and any
# edge colour is ignored, so use black to keep the observations visible.
ax.scatter(dp, filterPoci(rmax, poci), c='k', s=50, marker='+', label='Observations')
ax.scatter(dp0, rm, c=np.abs(l0), s=40, label='Model')
ax.set_xlim(0, 100)
ax.set_xlabel(r"$\Delta p$ (hPa)")
ax.set_ylabel(r"$R_{mw}$ (km)")
ax.set_ylim(0, 200)
ax.legend(loc=1)
ax.grid()

In [81]:
# Histogram (5 km bins) and kernel density estimate of the modelled R_mw.
fig, ax = plt.subplots(1, 1)
model_bins = np.arange(0, 101, 5)
model_kde = {'clip': (0, 100), 'label': "Model data (KDE)"}
sns.distplot(rm, bins=model_bins, kde_kws=model_kde, ax=ax)

# Log-normal PDF fitted to the observations, on a 1 km grid.
x = np.arange(1, 101)
v = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
ax.plot(x, v, label="Lognormal fit from observations", color='r')

ax.set_xlabel(r'$R_{mw}$ (km)')
ax.set_xlim((0, 100))
ax.legend(loc=0)

<a id='references'></a>
## References