# Fitting a distribution to radius to maximum wind observations

The historical record has only sparse observations of radius to maximum winds $\textrm{R}_{mw}$ in the Australian region (2002 onwards). As a first step, we fit a log-normal distribution to the observations and use that as a basis for setting the $\textrm{R}_{mw}$ values for stochastically generated storms in TCRM.

In [1]:
%matplotlib inline

import os
from os.path import join as pjoin
from matplotlib import pyplot as plt
from datetime import datetime, timedelta

from Utilities.metutils import convert

import numpy as np
import scipy.stats as stats

import pandas as pd
import statsmodels.formula.api as smf

import seaborn as sns
# Global seaborn styling, applied to every figure drawn below.
sns.set_style("ticks")
sns.set_context("poster")

First, a short function to convert the formatted latitude/longitude values to actual numbers.

In [2]:
def convertLatLon(strval):
    """
    Turn a formatted latitude/longitude string into a signed number.

    The last character is the hemisphere letter (N, S, E or W, in either
    case) and the preceding characters give the value in tenths of a
    degree, e.g. '140S' becomes -14.0.  Southern and western values are
    negated; longitudes (E or W) are then wrapped onto the 0-360 degree
    range, so '100W' becomes 350.0.

    :param str strval: string containing the latitude or longitude.

    :returns: Latitude/longitude as a float value.

    """
    hemisphere = strval[-1].upper()
    degrees = float(strval[:-1]) / 10.
    # South and west are the negative hemispheres.
    if hemisphere in ('S', 'W'):
        degrees = -degrees
    # Only longitudes are wrapped; latitudes keep their sign.
    if hemisphere in ('E', 'W'):
        degrees = degrees % 360
    return degrees
            

Define the data structure and a small function to load a file. This uses the JTWC data format, described [here](http://www.usno.navy.mil/NOOC/nmfc-ph/RSS/jtwc/best_tracks/shindex.php). 

In [3]:
# Field names for one record, in file order. Positions in this list line up
# with COLTYPES, COLUNITS and the integer keys of `converters` below.
COLNAMES = ['BASIN','Number', 'Datetime','TECHNUM', 'TECH','TAU', 'Latitude', 'Longitude', 'Windspeed','Pressure',
            'Status', 'RAD', 'WINDCODE','RAD1', 'RAD2','RAD3', 'RAD4','Poci', 'Roci','rMax', 'GUSTS','EYE',
            'SUBREGION','MAXSEAS', 'INITIALS','DIR', 'SPEED','STORMNAME', 'DEPTH','SEAS',
            'SEASCODE','SEAS1', 'SEAS2','SEAS3', 'SEAS4'] 

# numpy format of each field in COLNAMES (same order).
COLTYPES = ['|S2', 'i', datetime, 'i', '|S4', 'i', 'f', 'f', 'f', 'f', 
            '|S4', 'f', '|S3', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f',
            '|S1', 'f', '|S3', 'f', 'f', '|S10', '|S1', 'f', 
            '|S3', 'f', 'f', 'f', 'f']
# Unit label of each field in COLNAMES (same order); '' where none is given.
# The converters below pass these labels to convert() as the source unit.
COLUNITS = ['', '', '', '', '', '', '', '', 'kts', 'hPa', 
            '', 'nm', '', 'nm', 'nm', 'nm', 'nm', 'hPa', 'nm', 'nm', 'kts', 'nm',
            '', '', '', 'degrees', 'kts', '', '', '',
            '', '', '', '', '']
# strptime pattern for the Datetime field: year, month, day, hour.
DATEFORMAT = "%Y%m%d%H"
# Structured dtype pairing every field name with its format.
dtype = np.dtype({'names':COLNAMES, 'formats':COLTYPES})
# Per-column converters for np.genfromtxt, keyed by column index. Each one
# strips surrounding spaces and commas from the raw field. Columns 11, 13-16,
# 18 and 19 are additionally passed through convert() from their COLUNITS
# entry to 'km', with an empty field treated as 0.
# NOTE(review): column 17 strips only commas and has no empty-field fallback,
# unlike the neighbouring numeric columns - confirm this is intended.
converters = {
    1: lambda s: s.strip(' ,'),
    2: lambda s: datetime.strptime(s.strip(' ,'), DATEFORMAT),
    6: lambda s: float(convertLatLon(s.strip(' ,'))),
    7: lambda s: float(convertLatLon(s.strip(' ,'))),
    8: lambda s: s.strip(' ,'),
    9: lambda s: s.strip(' ,'),
    10: lambda s: s.strip(' ,'),
    11: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[11], 'km'),
    12: lambda s: s.strip(' ,'),
    13: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[13], 'km'),
    14: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[14], 'km'),
    15: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[15], 'km'),
    16: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[16], 'km'),
    17: lambda s: float(s.strip(',')),
    18: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[18], 'km'),
    19: lambda s: convert(float(s.strip(' ,') or 0), COLUNITS[19], 'km'),
}
# Fixed field widths (in characters) of the first 23 columns of a record.
delimiter = (3,4,12,4,6,5,7,7,5,6,4,5,5,6,6,6,6,6,6,5,5,5,5)
# No header lines are skipped.
skip_header = 0
# Only the first 23 columns are read by default (one per width above).
usecols = tuple(range(23))
# An empty field is treated as missing and replaced by `filling_values`.
missing_value = ""
filling_values = 0

def loadData(filename):
    """
    Read a fixed-width best-track file into a structured array.

    Three column layouts are tried in turn, moving on to the next one
    whenever `np.genfromtxt` raises an `IndexError`: the default widths
    and columns, then the default widths with only the first 18 columns,
    and finally a shorter list of 9 widths with the first 9 columns.  An
    `IndexError` raised by the last layout is propagated to the caller.

    :param filename: file to read (handed straight to `np.genfromtxt`).

    :returns: the array produced by `np.genfromtxt`.

    """
    # (field widths, columns to read), from most to least complete.
    layouts = (
        (delimiter, usecols),
        (delimiter, tuple(range(18))),
        ([3,4,12,4,6,5,7,7,5], tuple(range(9))),
    )
    final = len(layouts) - 1
    for attempt, (widths, columns) in enumerate(layouts):
        try:
            return np.genfromtxt(filename, dtype,
                                 delimiter=widths,
                                 skip_header=skip_header,
                                 converters=converters,
                                 missing_values=missing_value,
                                 filling_values=filling_values,
                                 usecols=columns,
                                 autostrip=True,
                                 invalid_raise=False)
        except IndexError:
            # Nothing left to fall back on: let the error surface.
            if attempt == final:
                raise


Often the b-deck files contain multiple records with the same time stamp. This is to record information on different wind speed radii (e.g. the radius to 34-knot winds, 48-knot winds, etc.). We can quickly filter out this extra information using [`numpy.unique()`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html). Additional filtering restricts the records to a known longitude domain and to only those storms that are of Tropical Storm or Typhoon strength.

In [4]:
def filterData(data):
    """
    Keep one record per time stamp that meets the selection criteria.

    The first record for each distinct `Datetime` is retained; of those,
    only the records that

    * have a `Status` of 'TS' or 'TY',
    * lie between 90 and 180 degrees longitude (inclusive), and
    * report an `rMax` of at least 0.1

    are returned.

    :param data: structured array with (at least) `Datetime`, `Status`,
                 `Longitude` and `rMax` fields.

    :returns: 1-d structured array of the selected records, in order of
              increasing `Datetime`.

    """
    # Guard against a 0-d array (e.g. a single-record input), which
    # cannot be indexed by position.
    records = np.atleast_1d(data)

    # Position of the first record carrying each distinct time stamp.
    _, idx = np.unique(records['Datetime'], return_index=True)
    first = records[idx]

    filter1 = (first['Status'] == 'TS') | (first['Status'] == 'TY')
    filter2 = (first['Longitude'] >= 90.) & (first['Longitude'] <= 180.)
    filter3 = (first['rMax'] >= 0.1)

    # The mask is relative to the de-duplicated records, so it has to be
    # applied to them; applying its positions to the full input array
    # would select the wrong rows whenever time stamps are repeated.
    return first[filter1 & filter2 & filter3]

Now churn through the best-track files (unmodified) and pull out $\textrm{R}_{mw}$ and central pressure estimates. This assumes you have the JTWC best track files somewhere locally - no download is performed.

In [5]:
def processfiles(path, basin):
    """
    Collect radius to maximum wind and pressure values for one basin.

    Walks `path` and, for every directory whose path ends with `basin`,
    loads each file with `loadData`, filters it with `filterData` and
    gathers the `rMax` and `Pressure` fields of the remaining records.
    Files that were loaded without a `Status` or `rMax` field are
    skipped, since they cannot be filtered.

    :param str path: top-level directory to search.
    :param str basin: suffix identifying the basin directories (e.g. 'sh').

    :returns: tuple `(rmax, prs)` of 1-d float arrays; both are empty if
              no matching records are found.

    """
    # Seeding with an empty float array keeps the result a float64 array
    # even when nothing is found, and lets us concatenate once at the end
    # instead of re-allocating on every file.
    rmax_parts = [np.array([])]
    prs_parts = [np.array([])]

    for root, dirs, files in os.walk(path):
        if not root.endswith(basin):
            continue
        for fname in files:
            data = loadData(pjoin(root, fname))
            names = data.dtype.names or ()
            # filterData reads both fields, so check for them *before*
            # filtering rather than after.
            if 'Status' not in names or 'rMax' not in names:
                continue
            data = filterData(data)
            rmax_parts.append(np.ravel(data['rMax']))
            prs_parts.append(np.ravel(data['Pressure']))

    return np.concatenate(rmax_parts), np.concatenate(prs_parts)

In [6]:
# Root directory of the locally stored best-track files (machine-specific).
inputPath = "C:\\WorkSpace\\data\\Raw\\best_tracks"
# Radius to maximum wind and pressure samples from directories ending in
# 'sh' (labelled "Southern hemisphere" in the plots below) ...
srmax, sprs = processfiles(inputPath, 'sh')
# ... and from directories ending in 'wp' (labelled "Western Pacific").
nrmax, nprs = processfiles(inputPath, 'wp')

Plot up the distribution, with a kernel density estimate and a fitted lognormal distribution (with the location parameter fixed at 0).

In [17]:
# Two stacked panels: southern hemisphere on top, western Pacific below.
fig, (ax1, ax2) = plt.subplots(2,1)
# Histogram in 10-wide bins from 0 to 200, with a KDE clipped to that range.
sns.distplot(srmax, bins=np.arange(0, 201, 10),
             kde_kws={'clip':(0, 200), 'label':"KDE"}, ax=ax1)

# Lognormal fit with the location fixed at 0; `scale` is only the starting
# guess for the fit (here the sample mean).
shape, loc, scale = stats.lognorm.fit(srmax, scale=np.mean(srmax), floc=0)
print "Southern hemisphere basin: ", shape, loc, scale, np.mean(srmax)
# Evaluate the fitted pdf at 1, 2, ..., 200 and overlay it.
x = np.arange(1, 201)
v = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
ax1.plot(x, v, label="Lognormal fit")
ax1.legend(loc=0)
ax1.set_xlabel(r'$R_{mw}$ (km)')
ax1.set_xlim((0,200))
ax1.set_title("Southern hemisphere (2002-2013)")
#################################
# Same treatment for the western Pacific samples.
sns.distplot(nrmax, bins=np.arange(0, 201, 10),
             kde_kws={'clip':(0, 200), 'label':"KDE"}, ax=ax2)

# NOTE(review): the starting guess for scale is the sample standard
# deviation here but the sample mean in the panel above - confirm intended.
# shape, loc and scale are rebound, so later cells see this basin's values.
shape, loc, scale = stats.lognorm.fit(nrmax, scale=np.std(nrmax),floc=0)
print "Western Pacific basin:     ", shape, loc, scale, np.mean(nrmax)
x = np.arange(1, 201)
v = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
ax2.plot(x, v, label="Lognormal fit")
ax2.legend(loc=0)
ax2.set_xlabel(r'$R_{mw}$ (km)')
ax2.set_xlim((0,200))
ax2.set_title("Western Pacific (2002-2013)")


fig.tight_layout()
sns.despine()

In [10]:
# Single-panel version of the southern hemisphere plot: histogram in
# 10-wide bins from 0 to 200 with a KDE clipped to the same range.
ax = sns.distplot(srmax, bins=np.arange(0, 201, 10),
             kde_kws={'clip':(0, 200), 'label':"KDE"})

# Lognormal fit with the location fixed at 0; `scale` is the starting guess.
# shape, loc and scale are rebound here and reused by later cells.
shape, loc, scale = stats.lognorm.fit(srmax, scale=np.mean(srmax), floc=0)
print "Southern hemisphere basin: ", shape, scale
# Evaluate the fitted pdf at 1, 2, ..., 200 and overlay it.
x = np.arange(1, 201)
v = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
ax.plot(x, v, label="Lognormal fit")
ax.legend(loc=0)
ax.set_xlabel(r'$R_{mw}$ (km)')
ax.set_xlim((0,200))
ax.set_title("Southern hemisphere (2002-2013)")
sns.despine()

In [11]:
# Hexbin joint distribution of pressure (x) against radius to maximum wind
# (y) for the 'sh' samples, limited to 900-1020 and 0-200 respectively.
sns.jointplot(sprs, srmax,kind='hex', ylim=(0,200),xlim=(900,1020))

In [12]:
# Mean, variance, skewness and kurtosis of the lognormal distribution
# described by the currently bound shape, loc and scale.
# Note that this rebinds `v`, which earlier cells use for the pdf values.
m,v,s,k = stats.lognorm.stats(shape, loc, scale, moments='mvsk')
print m, v, s, k

In [13]:
# Load a CSV table into a DataFrame (machine-specific path). Later cells
# read its 'Pressure', 'RMW', 'Wind' and 'Latitude' columns.
rmfile = "C:\\Workspace\\data\\Derived\\jtwc\\rmw\\rmw.sh.csv"
df = pd.read_csv(rmfile)


In [14]:
# Pairwise scatter plots of the columns of df, coloured by the 'Wind' column.
sns.pairplot(df, kind='scatter',hue='Wind', palette='cubehelix')

In [15]:
# Joint kernel density estimate of log(Pressure) against RMW.
jp = sns.jointplot(np.log(df['Pressure']), df['RMW'], kind='kde')
jp.set_axis_labels('log(pressure)', 'RMW')

In [16]:
# Histogram and KDE of RMW scaled by 1.852 (presumably nautical miles to
# km, matching the axis label below - verify against the CSV's units).
ax = sns.distplot(df['RMW']*1.852, bins=np.arange(0, 201, 10), kde_kws={'clip':(0, 200), 'label':"KDE"})
# Lognormal fit with all three parameters free; `loc` and `scale` are only
# starting guesses for the fit.
# NOTE(review): the guesses are computed from the unscaled RMW while the
# data being fitted are scaled by 1.852 - confirm intended.
shape, loc, scale = stats.lognorm.fit(df['RMW']*1.852, loc=np.std(df['RMW']), scale=np.mean(df['RMW']))
print shape, loc, scale
# Evaluate the fitted pdf at 1, 2, ..., 200 and overlay it.
x = np.arange(1, 201)
v = stats.lognorm.pdf(x, shape, loc, scale)
ax.plot(x, v, label="Lognormal fit")
ax.legend(loc=0)
ax.set_xlim((0, 200))
ax.set_xlabel(r'$R_{MW}$ (km)')
sns.despine()

In [148]:
# Mean, variance, skewness and kurtosis of the lognormal distribution
# described by the currently bound shape, loc and scale.
# Note that this rebinds `v`, which earlier cells use for the pdf values.
m,v,s,k = stats.lognorm.stats(shape, loc, scale, moments='mvsk')
print m, v, s, k

In [196]:
# Interaction plot of 'RMW' against 'Pressure' and 'Latitude'.
# NOTE(review): interactplot appears to exist only in older seaborn
# releases - confirm the installed version still provides it.
sns.interactplot('Pressure', 'Latitude', 'RMW', df)

In [191]:
# Reference pdf and cdf from scipy for the currently bound shape, loc and
# scale, evaluated at 1, 2, ..., 119.
x = np.arange(1, 120., 1.)
p = stats.lognorm.pdf(x, shape, loc=loc, scale=scale)
c = stats.lognorm.cdf(x, shape, loc=loc, scale=scale)

In [194]:
# Values from the project's own rMaxDist helper, plotted below against
# scipy's pdf and cdf (xx, pp, cc are used as x, pdf and cdf there).
# NOTE(review): only scale and shape are passed, not loc - verify that
# rMaxDist assumes a zero location parameter.
from Utilities.stats import rMaxDist
xx, pp, cc = rMaxDist(scale, shape, 120.)

In [195]:
# Compare scipy's lognormal with rMaxDist: pdfs on the top panel, cdfs on
# the bottom panel.
fig, (ax1, ax2) = plt.subplots(2, 1)
ax1.plot(x, p, label="stats.lognorm.pdf")
ax1.plot(xx, pp, label="Utilities.stats.rMaxDist pdf")
ax1.legend(loc=0)

ax2.plot(x, c, label="stats.lognorm.cdf")
ax2.plot(xx, cc, label="Utilities.stats.rMaxDist cdf")
ax2.legend(loc=0)
sns.despine()

In [165]:
# Regression joint plot of RMW against Pressure, with the robust option
# switched on for the regression.
jp = sns.jointplot(df['Pressure'], df['RMW'], kind='reg',joint_kws={'robust':1})