In [86]:
%env WORKDIR=~/weatherpy-work
from stormevents.io import load_tornadoes

import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

env: WORKDIR=~/weatherpy-work


In [203]:
sample_data = load_tornadoes('1985-01-01 12:00', '1995-12-31 23:59', tz='GMT')

In [204]:
sample_data.columns

Index(['begin_yearmonth', 'begin_day', 'begin_time', 'end_yearmonth',
       'end_day', 'end_time', 'episode_id', 'event_id', 'state', 'state_fips',
       'year', 'month_name', 'event_type', 'cz_type', 'cz_fips', 'cz_name',
       'wfo', 'begin_date_time', 'cz_timezone', 'end_date_time',
       'injuries_direct', 'injuries_indirect', 'deaths_direct',
       'deaths_indirect', 'damage_property', 'damage_crops', 'source',
       'magnitude', 'magnitude_type', 'flood_cause', 'category', 'tor_f_scale',
       'tor_length', 'tor_width', 'tor_other_wfo', 'tor_other_cz_state',
       'tor_other_cz_fips', 'tor_other_cz_name', 'begin_range',
       'begin_azimuth', 'begin_location', 'end_range', 'end_azimuth',
       'end_location', 'begin_lat', 'begin_lon', 'end_lat', 'end_lon',
       'episode_narrative', 'event_narrative', 'data_source'],
      dtype='object')

In [205]:
sample_data.head()

Unnamed: 0,begin_yearmonth,begin_day,begin_time,end_yearmonth,end_day,end_time,episode_id,event_id,state,state_fips,...,end_range,end_azimuth,end_location,begin_lat,begin_lon,end_lat,end_lon,episode_narrative,event_narrative,data_source
0,198504,5,2310,198504,5,2310,,9976894,ALABAMA,1,...,0,,,34.13,-86.18,34.2,-86.05,,,PUB
1,198504,5,2320,198504,5,2320,,9976901,ALABAMA,1,...,0,,,33.48,-86.3,33.47,-86.3,,,PUB
2,198504,5,2322,198504,5,2322,,9976902,ALABAMA,1,...,0,,,33.47,-86.3,33.45,-86.3,,,PUB
3,198510,28,1700,198510,28,1700,,9977640,ALABAMA,1,...,0,,,31.0,-86.33,,,,,PUB
4,198510,28,1715,198510,28,1715,,9977641,ALABAMA,1,...,0,,,31.0,-87.27,,,,,PUB


In [206]:
def dist_heading(lat1, lon1, lat2, lon2):
    if any(map(np.isnan, (lat1, lon1, lat2, lon2))):
        return np.nan, np.nan
    from geographiclib.geodesic import Geodesic
    found = Geodesic.WGS84.Inverse(lat1, lon1, lat2, lon2)
    dist_m, heading = found['s12'], found['azi1']
    dist_mi = dist_m * 0.000621371
    return dist_mi, heading

sample_data[['calc_length', 'calc_heading']] = sample_data.apply(
    lambda r: pd.Series(list(dist_heading(r.begin_lat, r.begin_lon, r.end_lat, r.end_lon))), axis=1)

In [207]:
sample_data['length_perc_diff'] = (sample_data.tor_length - sample_data.calc_length) / sample_data.tor_length
sample_data['julian_day'] = sample_data.begin_date_time.dt.dayofyear

In [223]:
def nanfilt(df):
    return (df.begin_lat.notnull()) & (df.begin_lon.notnull()) & (df.end_lat.notnull()) & (df.end_lon.notnull()) & (df.length_perc_diff.notnull())

def end_start_eq(df):
    return (df.begin_lat == df.end_lat) & (df.begin_lon == df.end_lon)

data_non_nan = sample_data[nanfilt(sample_data) & ~end_start_eq(sample_data) & ~np.isinf(sample_data.length_perc_diff)]

In [217]:
sample_data.head()

Unnamed: 0,begin_yearmonth,begin_day,begin_time,end_yearmonth,end_day,end_time,episode_id,event_id,state,state_fips,...,begin_lon,end_lat,end_lon,episode_narrative,event_narrative,data_source,calc_length,calc_heading,length_perc_diff,julian_day
0,198504,5,2310,198504,5,2310,,9976894,ALABAMA,1,...,-86.18,34.2,-86.05,,,PUB,8.874385,57.029157,-0.109298,95
1,198504,5,2320,198504,5,2320,,9976901,ALABAMA,1,...,-86.3,33.47,-86.3,,,PUB,0.689181,180.0,0.540546,95
2,198504,5,2322,198504,5,2322,,9976902,ALABAMA,1,...,-86.3,33.45,-86.3,,,PUB,1.378358,180.0,0.081094,95
3,198510,28,1700,198510,28,1700,,9977640,ALABAMA,1,...,-86.33,,,,,PUB,,,,301
4,198510,28,1715,198510,28,1715,,9977641,ALABAMA,1,...,-87.27,,,,,PUB,,,,301


In [252]:
features = data_non_nan[['length_perc_diff']]

In [253]:
if_model = IsolationForest(contamination=0.15, behaviour='new', verbose=0)

In [254]:
lof_model = LocalOutlierFactor(contamination=0.15)

In [255]:
data_non_nan['if_results'] = if_model.fit_predict(features)
data_non_nan['lof_results'] = lof_model.fit_predict(features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [256]:
relevant_cols =  ['begin_date_time', 'end_date_time', 'begin_lat', 'begin_lon', 
                  'end_lat', 'end_lon', 'tor_length', 'calc_length', 'calc_heading', 'length_perc_diff',
                  'tor_f_scale', 'state', 'if_results', 'lof_results']

In [257]:
outliers = (data_non_nan.if_results == -1) & (data_non_nan.lof_results == -1)
data_non_nan.loc[outliers, relevant_cols]

Unnamed: 0,begin_date_time,end_date_time,begin_lat,begin_lon,end_lat,end_lon,tor_length,calc_length,calc_heading,length_perc_diff,tor_f_scale,state,if_results,lof_results
64,1985-05-31 21:55:00,1985-05-31 21:55:00,41.52,-79.93,40.73,-78.83,12.5,79.160013,133.161677,-5.332801,F4,PENNSYLVANIA,-1,-1
65,1985-05-31 22:11:00,1985-05-31 22:11:00,40.73,-78.83,41.43,-79.52,11.0,60.260817,-36.491930,-4.478256,F4,PENNSYLVANIA,-1,-1
378,1985-09-07 22:20:00,1985-09-07 22:20:00,45.05,-85.07,45.02,-85.00,2.0,4.004866,121.125166,-1.002433,F1,MICHIGAN,-1,-1
516,1985-06-01 01:53:00,1985-06-01 01:53:00,40.63,-79.00,40.60,-78.98,6.0,2.321850,153.061436,0.613025,F0,PENNSYLVANIA,-1,-1
666,1985-04-28 21:30:00,1985-04-28 21:30:00,32.52,-101.68,32.53,-101.68,0.2,0.689076,0.000000,-2.445379,F0,TEXAS,-1,-1
667,1985-04-28 21:31:00,1985-04-28 21:31:00,32.53,-101.68,32.53,-101.67,0.3,0.583750,89.997311,-0.945835,F0,TEXAS,-1,-1
779,1986-05-10 00:36:00,1986-05-10 00:36:00,41.58,-93.80,41.58,-93.78,0.1,1.036364,89.993363,-9.363640,F0,IOWA,-1,-1
854,1986-05-12 04:31:00,1986-05-12 04:31:00,47.63,-97.45,47.62,-97.45,0.1,0.690859,180.000000,-5.908591,F1,NORTH DAKOTA,-1,-1
866,1986-05-12 04:30:00,1986-05-12 04:30:00,47.63,-97.50,47.63,-97.48,0.1,0.934012,89.992612,-8.340124,F1,NORTH DAKOTA,-1,-1
871,1986-05-14 21:09:00,1986-05-14 21:09:00,34.63,-98.97,34.65,-98.95,0.1,1.788543,39.567811,-16.885431,F1,OKLAHOMA,-1,-1


In [262]:
data_non_nan.shape

(2612, 57)

In [263]:
data_non_nan.loc[(data_non_nan.length_perc_diff < -0.2) | (data_non_nan.length_perc_diff > 0.4), relevant_cols]

Unnamed: 0,begin_date_time,end_date_time,begin_lat,begin_lon,end_lat,end_lon,tor_length,calc_length,calc_heading,length_perc_diff,tor_f_scale,state,if_results,lof_results
1,1985-04-05 23:20:00,1985-04-05 23:20:00,33.48,-86.30,33.47,-86.30,1.5,0.689181,180.000000,0.540546,F1,ALABAMA,-1,1
8,1985-04-05 23:52:00,1985-04-05 23:52:00,33.58,-85.85,33.60,-85.83,1.0,1.797421,39.920887,-0.797421,F2,ALABAMA,-1,1
16,1985-05-27 23:04:00,1985-05-27 23:04:00,40.77,-83.12,40.77,-83.10,0.5,1.049209,89.993470,-1.098419,F1,OHIO,-1,1
29,1985-10-05 14:45:00,1985-10-05 14:45:00,40.52,-74.40,40.62,-74.30,6.0,8.677364,37.295399,-0.446227,F1,NEW JERSEY,1,1
38,1985-05-31 23:43:00,1985-05-31 23:43:00,40.23,-82.18,40.23,-82.13,2.0,2.644141,89.983854,-0.322070,F3,OHIO,1,1
40,1985-06-01 00:02:00,1985-06-01 00:02:00,40.15,-82.02,40.20,-81.93,4.5,5.881333,54.057285,-0.306963,F1,OHIO,1,1
46,1985-06-22 21:30:00,1985-06-22 21:30:00,41.68,-81.05,41.72,-81.02,2.4,3.166761,29.329602,-0.319484,F2,OHIO,1,1
64,1985-05-31 21:55:00,1985-05-31 21:55:00,41.52,-79.93,40.73,-78.83,12.5,79.160013,133.161677,-5.332801,F4,PENNSYLVANIA,-1,-1
65,1985-05-31 22:11:00,1985-05-31 22:11:00,40.73,-78.83,41.43,-79.52,11.0,60.260817,-36.491930,-4.478256,F4,PENNSYLVANIA,-1,-1
69,1985-08-17 18:55:00,1985-08-17 18:55:00,34.88,-81.27,34.90,-81.25,3.0,1.786404,39.481816,0.404532,F1,SOUTH CAROLINA,-1,1
