In [39]:
%env WORKDIR=~/weatherpy-work
from stormevents.io import load_tornadoes

import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest

env: WORKDIR=~/weatherpy-work


In [50]:
sample_data = load_tornadoes('1990-01-01 12:00', '1996-12-31 23:59', tz='GMT')

  return list(map(*args))


In [51]:
sample_data.columns

Index(['begin_yearmonth', 'begin_day', 'begin_time', 'end_yearmonth',
       'end_day', 'end_time', 'episode_id', 'event_id', 'state', 'state_fips',
       'year', 'month_name', 'event_type', 'cz_type', 'cz_fips', 'cz_name',
       'wfo', 'begin_date_time', 'cz_timezone', 'end_date_time',
       'injuries_direct', 'injuries_indirect', 'deaths_direct',
       'deaths_indirect', 'damage_property', 'damage_crops', 'source',
       'magnitude', 'magnitude_type', 'flood_cause', 'category', 'tor_f_scale',
       'tor_length', 'tor_width', 'tor_other_wfo', 'tor_other_cz_state',
       'tor_other_cz_fips', 'tor_other_cz_name', 'begin_range',
       'begin_azimuth', 'begin_location', 'end_range', 'end_azimuth',
       'end_location', 'begin_lat', 'begin_lon', 'end_lat', 'end_lon',
       'episode_narrative', 'event_narrative', 'data_source'],
      dtype='object')

In [52]:
sample_data.head()

Unnamed: 0,begin_yearmonth,begin_day,begin_time,end_yearmonth,end_day,end_time,episode_id,event_id,state,state_fips,...,end_range,end_azimuth,end_location,begin_lat,begin_lon,end_lat,end_lon,episode_narrative,event_narrative,data_source
0,199006,2,1750,199006,2,1750,,10049091,MINNESOTA,27,...,0.0,,,44.37,-92.63,44.42,-92.58,,,PUB
1,199005,18,1710,199005,18,1710,,10041359,MASSACHUSETTS,25,...,0.0,,,42.08,-71.4,,,,,PUB
2,199006,13,345,199006,13,345,,10079985,NEBRASKA,31,...,0.0,,,41.1,-98.47,,,,,PUB
3,199006,13,420,199006,13,420,,10079988,NEBRASKA,31,...,0.0,,,41.52,-97.73,,,,,PUB
4,199006,16,45,199006,16,45,,10080010,NEBRASKA,31,...,0.0,,,40.25,-100.75,40.27,-100.53,,,PUB


In [53]:
def dist_heading(lat1, lon1, lat2, lon2):
    if any(map(np.isnan, (lat1, lon1, lat2, lon2))):
        return np.nan, np.nan
    from geographiclib.geodesic import Geodesic
    found = Geodesic.WGS84.Inverse(lat1, lon1, lat2, lon2)
    dist_m, heading = found['s12'], found['azi1']
    dist_mi = dist_m * 0.000621371
    return dist_mi, heading

sample_data[['calc_length', 'calc_heading']] = sample_data.apply(
    lambda r: pd.Series(list(dist_heading(r.begin_lat, r.begin_lon, r.end_lat, r.end_lon))), axis=1)

In [54]:
sample_data['length_perc_diff'] = (sample_data.tor_length - sample_data.calc_length) / sample_data.tor_length

In [55]:
relevant_cols =  ['begin_date_time', 'end_date_time', 'begin_lat', 'begin_lon', 
                  'end_lat', 'end_lon', 'tor_length', 'calc_length', 'calc_heading', 'length_perc_diff',
                  'tor_f_scale', 'state']

sample_data[relevant_cols]

Unnamed: 0,begin_date_time,end_date_time,begin_lat,begin_lon,end_lat,end_lon,tor_length,calc_length,calc_heading,length_perc_diff,tor_f_scale,state
0,1990-06-02 17:50:00,1990-06-02 17:50:00,44.37,-92.63,44.42,-92.58,4.0,4.248026,35.622747,-0.062006,F1,MINNESOTA
1,1990-05-18 17:10:00,1990-05-18 17:10:00,42.08,-71.40,,,0.2,,,,F0,MASSACHUSETTS
2,1990-06-13 03:45:00,1990-06-13 03:45:00,41.10,-98.47,,,1.0,,,,F0,NEBRASKA
3,1990-06-13 04:20:00,1990-06-13 04:20:00,41.52,-97.73,,,0.1,,,,F0,NEBRASKA
4,1990-06-16 00:45:00,1990-06-16 00:45:00,40.25,-100.75,40.27,-100.53,8.0,11.710668,83.161704,-0.463833,F4,NEBRASKA
5,1990-06-16 02:04:00,1990-06-16 02:04:00,40.62,-100.07,,,0.7,,,,F0,NEBRASKA
6,1990-06-16 02:04:00,1990-06-16 02:04:00,40.62,-100.07,,,0.7,,,,F0,NEBRASKA
7,1990-06-16 02:05:00,1990-06-16 02:05:00,40.05,-97.50,40.08,-97.42,15.0,4.719028,63.958740,0.685398,F1,NEBRASKA
8,1990-06-16 02:55:00,1990-06-16 02:55:00,40.98,-99.12,,,1.0,,,,F1,NEBRASKA
9,1990-06-16 03:00:00,1990-06-16 03:00:00,41.20,-98.85,,,0.5,,,,F0,NEBRASKA


In [61]:
def nanfilt(df):
    return (df.begin_lat.notnull()) & (df.begin_lon.notnull()) & (df.end_lat.notnull()) & (df.end_lon.notnull() & (df.length_perc_diff.notnull()))

data_non_nan = sample_data[nanfilt(sample_data)]

In [62]:
data_non_nan[relevant_cols]

Unnamed: 0,begin_date_time,end_date_time,begin_lat,begin_lon,end_lat,end_lon,tor_length,calc_length,calc_heading,length_perc_diff,tor_f_scale,state
0,1990-06-02 17:50:00,1990-06-02 17:50:00,44.37,-92.63,44.42,-92.58,4.0,4.248026,35.622747,-0.062006,F1,MINNESOTA
4,1990-06-16 00:45:00,1990-06-16 00:45:00,40.25,-100.75,40.27,-100.53,8.0,11.710668,83.161704,-0.463833,F4,NEBRASKA
7,1990-06-16 02:05:00,1990-06-16 02:05:00,40.05,-97.50,40.08,-97.42,15.0,4.719028,63.958740,0.685398,F1,NEBRASKA
13,1990-06-16 22:15:00,1990-06-16 22:15:00,40.03,-99.43,40.03,-99.13,15.0,15.911404,89.903522,-0.060760,F1,NEBRASKA
15,1990-06-16 22:30:00,1990-06-16 22:30:00,41.37,-98.85,41.42,-98.78,4.0,5.013809,46.488786,-0.253452,F1,NEBRASKA
20,1990-06-16 00:00:00,1990-06-16 00:00:00,40.15,-101.23,40.25,-100.75,20.0,26.315505,74.645271,-0.315775,F4,NEBRASKA
23,1990-03-13 23:55:00,1990-03-13 23:55:00,42.57,-91.00,42.60,-90.98,3.0,2.308389,26.220065,0.230537,F1,IOWA
28,1990-04-10 00:12:00,1990-04-10 00:12:00,37.98,-97.65,37.98,-97.17,19.0,26.203825,89.852307,-0.379149,F0,KANSAS
29,1990-04-10 01:05:00,1990-04-10 01:05:00,37.98,-97.17,37.98,-97.15,1.0,1.091827,89.993846,-0.091827,F0,KANSAS
30,1990-04-10 01:18:00,1990-04-10 01:18:00,37.40,-98.23,37.45,-98.10,7.0,7.938019,64.214297,-0.134003,F1,KANSAS


In [58]:
model = IsolationForest(contamination='auto', behaviour='new')

In [63]:
data_non_nan['outlier'] = model.fit_predict(data_non_nan[['calc_heading', 'length_perc_diff']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [64]:
relevant_cols.append('outlier')
data_non_nan.loc[data_non_nan.outlier == -1, relevant_cols]

Unnamed: 0,begin_date_time,end_date_time,begin_lat,begin_lon,end_lat,end_lon,tor_length,calc_length,calc_heading,length_perc_diff,tor_f_scale,state,outlier
128,1990-09-14 13:11:00,1990-09-14 13:11:00,43.28,-85.57,43.23,-85.58,4.0,3.488330,-171.679815,0.127918,F1,MICHIGAN,-1
148,1990-06-08 03:50:00,1990-06-08 03:50:00,37.78,-97.95,37.87,-98.05,7.0,8.273795,-41.360470,-0.181971,F2,KANSAS,-1
181,1990-10-23 07:15:00,1990-10-23 07:15:00,35.50,-78.82,35.55,-78.77,1.4,4.452316,39.251309,-2.180226,F1,NORTH CAROLINA,-1
182,1990-10-23 07:18:00,1990-10-23 07:18:00,35.55,-78.77,35.57,-78.78,0.1,1.489476,-22.220430,-13.894764,F1,NORTH CAROLINA,-1
251,1990-07-11 23:09:00,1990-07-11 23:09:00,43.92,-94.55,43.82,-94.53,4.0,6.975927,171.760200,-0.743982,F0,MINNESOTA,-1
321,1990-04-27 19:00:00,1990-04-27 19:00:00,31.87,-89.70,31.97,-89.78,5.0,8.341201,-34.285620,-0.668240,F2,MISSISSIPPI,-1
335,1990-07-09 20:30:00,1990-07-09 20:30:00,40.08,-75.63,40.07,-75.62,5.0,0.870034,142.464517,0.825993,F1,PENNSYLVANIA,-1
346,1990-08-06 23:00:00,1990-08-06 23:00:00,34.70,-79.58,34.68,-79.57,1.0,1.491581,157.556776,-0.491581,F0,SOUTH CAROLINA,-1
490,1990-07-20 20:05:00,1990-07-20 20:05:00,38.93,-104.42,39.03,-104.47,4.0,7.404866,-21.303277,-0.851216,F0,COLORADO,-1
561,1990-03-13 22:45:00,1990-03-13 22:45:00,41.60,-90.28,41.62,-90.33,0.5,2.934573,-61.926738,-4.869146,F3,IOWA,-1
