# This file was used to augment a former version of the fires_merged_weather.csv file that is no longer present in our repo

In [1]:
import pandas as pd
import mpu
import neo4j
from neo4j import GraphDatabase as G_DB
import py2neo
from scipy.spatial import KDTree as KD
import json
import os
import math

In [None]:
dataLoc = '../data/'

STN---  1-6       Int.   Station number (WMO/DATSAV3 number)
                         for the location.

WBAN    8-12      Int.   WBAN number where applicable--this is the
                         historical "Weather Bureau Air Force Navy"
                         number - with WBAN being the acronym.

YEAR    15-18     Int.   The year.

MODA    19-22     Int.   The month and day.

TEMP    25-30     Real   Mean temperature for the day in degrees
                         Fahrenheit to tenths.  Missing = 9999.9
Count   32-33     Int.   Number of observations used in 
                         calculating mean temperature.

DEWP    36-41     Real   Mean dew point for the day in degrees
                         Fahrenheit to tenths.  Missing = 9999.9
Count   43-44     Int.   Number of observations used in 
                         calculating mean dew point.  

SLP     47-52     Real   Mean sea level pressure for the day
                         in millibars to tenths.  Missing =       
                         9999.9
Count   54-55     Int.   Number of observations used in 
                         calculating mean sea level pressure.

STP     58-63     Real   Mean station pressure for the day
                         in millibars to tenths.  Missing =       
                         9999.9
Count   65-66     Int.   Number of observations used in 
                         calculating mean station pressure.  

VISIB   69-73     Real   Mean visibility for the day in miles
                         to tenths.  Missing = 999.9
Count   75-76     Int.   Number of observations used in 
                         calculating mean visibility.      

WDSP    79-83     Real   Mean wind speed for the day in knots
                         to tenths.  Missing = 999.9 
Count   85-86     Int.   Number of observations used in 
                         calculating mean wind speed.

MXSPD   89-93     Real   Maximum sustained wind speed reported 
                         for the day in knots to tenths.
                         Missing = 999.9

GUST    96-100    Real   Maximum wind gust reported for the day
                         in knots to tenths.  Missing = 999.9

MAX     103-108   Real   Maximum temperature reported during the 
                         day in Fahrenheit to tenths--time of max 
                         temp report varies by country and        
                         region, so this will sometimes not be    
                         the max for the calendar day.  Missing = 
                         9999.9     
Flag    109-109   Char   Blank indicates max temp was taken from the
                         explicit max temp report and not from the              
                         'hourly' data.  * indicates max temp was 
                         derived from the hourly data (i.e., highest
                         hourly or synoptic-reported temperature).

MIN     111-116   Real   Minimum temperature reported during the 
                         day in Fahrenheit to tenths--time of min 
                         temp report varies by country and        
                         region, so this will sometimes not be  
                         the min for the calendar day.  Missing = 
                         9999.9
Flag    117-117   Char   Blank indicates min temp was taken from the
                         explicit min temp report and not from the              
                         'hourly' data.  * indicates min temp was 
                         derived from the hourly data (i.e., lowest
                         hourly or synoptic-reported temperature).

PRCP    119-123   Real   Total precipitation (rain and/or melted
                         snow) reported during the day in inches
                         and hundredths; will usually not end 
                         with the midnight observation--i.e., 
                         may include latter part of previous day.
                         .00 indicates no measurable              
                         precipitation (includes a trace).        
                         Missing = 99.99
                         Note:  Many stations do not report '0' on
                         days with no precipitation--therefore,  
                         '99.99' will often appear on these days.
                         Also, for example, a station may only
                         report a 6-hour amount for the period 
                         during which rain fell.
                         See Flag field for source of data.
Flag    124-124   Char   A = 1 report of 6-hour precipitation 
                             amount.
                         B = Summation of 2 reports of 6-hour 
                             precipitation amount.
                         C = Summation of 3 reports of 6-hour 
                             precipitation amount.
                         D = Summation of 4 reports of 6-hour 
                             precipitation amount.
                         E = 1 report of 12-hour precipitation
                             amount.
                         F = Summation of 2 reports of 12-hour
                             precipitation amount.
                         G = 1 report of 24-hour precipitation
                             amount.
                         H = Station reported '0' as the amount
                             for the day (eg, from 6-hour reports),
                             but also reported at least one
                             occurrence of precipitation in hourly
                             observations--this could indicate a
                             trace occurred, but should be considered
                             as incomplete data for the day.
                         I = Station did not report any precip data
                             for the day and did not report any
                             occurrences of precipitation in its hourly
                             observations--it's still possible that
                             precip occurred but was not reported.

SNDP    126-130   Real   Snow depth in inches to tenths--last     
                         report for the day if reported more than
                         once.  Missing = 999.9
                         Note:  Most stations do not report '0' on
                         days with no snow on the ground--therefore,
                         '999.9' will often appear on these days.

FRSHTT  133-138   Int.   Indicators (1 = yes, 0 = no/not          
                         reported) for the occurrence during the 
                         day of:
                         Fog ('F' - 1st digit).
                         Rain or Drizzle ('R' - 2nd digit).
                         Snow or Ice Pellets ('S' - 3rd digit).
                         Hail ('H' - 4th digit).
                         Thunder ('T' - 5th digit).
                         Tornado or Funnel Cloud ('T' - 6th       
                         digit).

In [2]:
colspecs = [[0,6],[7,12],[14,18],[18,20],[20,22],[24,30],[35,41],[46,52],[57,63],[68,73],[78,83],[88,93],[95,100],[102,108],[110,116],[118,123],[125,130],[132,133],[133,134],[134,135],[135,136],[136,137],[137,138]]
stations_US = [str(x) for x in list(range(70,80))]

Read and append thousands of fixed width files and save them to a single csv for each year

In [3]:
os.chdir(dataLoc+'noaa-global-surface-summary-of-the-day/gsod_all_years')
for year_num in range(1992,2016):
    global_data = pd.DataFrame()
    yf = 'gsod_'+str(year_num)
    os.chdir(yf)
    station_folders = os.listdir()
    for sf in station_folders:
        if '.gz' not in sf:
            os.chdir(sf)
            station_files = os.listdir()
            for f in station_files:
                if f[0:2] in stations_US:
                    station_df = pd.read_fwf(f,colspecs=colspecs)
                    global_data = global_data.append(station_df, ignore_index = True)
            os.chdir('..')
            break
    os.chdir('..')
    global_data.to_csv(dataLoc+'global_weather_data_'+str(year_num)+'.csv')
    del global_data
os.chdir('../../..')

Merge lat long onto the dfs using the weather station information

In [4]:
weather_files = [dataLoc+'global_weather_data_' + str(x) + '.csv' for x in list(range(1992,2016))]

In [5]:
weather_stations = pd.read_csv(dataLoc+'noaa-global-surface-summary-of-the-day/isd-history.csv')
weather_stations_US = weather_stations[weather_stations.USAF.str[:2].isin(stations_US)]
weather_stations_US.USAF=weather_stations_US.USAF.apply(lambda x: int(x))

 USAF = Air Force station ID. May contain a letter in the first position.

 WBAN = NCDC WBAN number

 CTRY = FIPS country ID
 
   ST = State for US stations

 ICAO = ICAO ID

  LAT = Latitude in thousandths of decimal degrees

  LON = Longitude in thousandths of decimal degrees

 ELEV = Elevation in meters

BEGIN = Beginning Period Of Record (YYYYMMDD). There may be reporting gaps within the P.O.R.

  END = Ending Period Of Record (YYYYMMDD). There may be reporting gaps within the P.O.R.

In [6]:
weather_stations_US.columns

Index(['USAF', 'WBAN', 'STATION NAME', 'CTRY', 'STATE', 'ICAO', 'LAT', 'LON',
       'ELEV(M)', 'BEGIN', 'END'],
      dtype='object')

In [7]:
name_dict = {'STN---':'Station','YEAR':'Year','MO':'Month','DA':'Day','TEMP':'Temp','DEWP':'DewPoint','SLP':'SeaLevelPressure','STP':'StationPressure','VISIB':'Visibility','WDSP':'Windspeed','MXSPD':'MaxWindspeed','GUST':'Gust','MAX':'MaxTemp','MIN':'MinTemp','PRCP':'Precip','SNDP':'SnowDepth','F':'Fog','R':'Rain','S':'Snow','H':'Hail','T':'Thunder','T.1':'FunnelCloud','CTRY':'Country','STATE':'State','LAT':'Latitude','LON':'Longitude','ELEV(M)':'Elevation','BEGIN':'Begin','END':'End','WBAN_x':'WBAN'}


In [8]:
for wf in weather_files:
    wf_df = pd.read_csv(dataLoc+'weather/'+wf)
    wf_df_merged = wf_df.merge(weather_stations_US,left_on='STN---',right_on='USAF')
    wf_df_merged = wf_df_merged.rename(columns=name_dict)
    wf_df_merged = wf_df_merged.drop(columns='WBAN_y')  
    wf_df_merged.to_csv(dataLoc+'weather/'+wf)

### merge with fires

In [42]:
# https://stackoverflow.com/questions/43020919/scipy-how-to-convert-kd-tree-distance-from-query-to-kilometers-python-pandas
# https://en.wikipedia.org/wiki/Geographic_coordinate_conversion
def to_Cartesian(series):
    # convert to radians
    lat = series[0]*math.pi/180
    lng = series[1]*math.pi/180
    R = 6367 # radius of the Earth in kilometers

    x = R * math.cos(lat) * math.cos(lng)
    y = R * math.cos(lat) * math.sin(lng)
    z = R * math.sin(lat)
    return x, y, z
day_cartesian_scale_factor = 2*6367/365 #2*earth radius/days per year, scales 1 day to 1 kilometer

In [43]:
fires = pd.read_csv('./data/fires_merged.csv')
fires_1992 = fires[fires.FIRE_YEAR==1992]
fires_1992['DISCOVERY_DOY_SCALED'] = fires_1992.DISCOVERY_DOY*day_cartesian_scale_factor

In [44]:
weather_1992 = pd.read_csv('./data/weather/global_weather_data_1992.csv')
days_per_month = [31,28,31,30,31,30,31,31,30,31,30,31]
weather_1992['doy'] = (weather_1992.Month.apply(lambda x:sum(days_per_month[0:x-1]))+weather_1992.Day)*day_cartesian_scale_factor

In [45]:
fires_1992.DISCOVERY_DOY_SCALED

41875       4884.273973
41876       7605.512329
41877       9733.660274
41878      10396.526027
41879       5547.139726
               ...     
1861638     5407.589041
1861639     6419.331507
1864564     7640.400000
1864565     7710.175342
1864566     7675.287671
Name: DISCOVERY_DOY_SCALED, Length: 67975, dtype: float64

convert lat,long to cartesian coordinates

In [46]:
weather_cartesian = weather_1992[['Latitude','Longitude']].apply(to_Cartesian,axis=1)
new_cols = ['x','y','z']
for i in range(len(new_cols)):
    weather_1992[new_cols[i]]=[r[i] for r in weather_cartesian]

In [47]:
fires_cartesian = fires_1992[['LATITUDE','LONGITUDE']].apply(to_Cartesian,axis=1)
new_cols = ['x','y','z']
for i in range(len(new_cols)):
    fires_1992[new_cols[i]]=[r[i] for r in fires_cartesian]

Make KD trees

In [48]:
weather_1992_kd = KD(weather_1992[['x','y','z','doy']])

In [49]:
weather_1992_kd_loc = KD(weather_1992[['x','y','z']])

Test KD trees

In [85]:
day_sep = []
time_sep = []
for i,row in fires_1992.head(10).iterrows():
    row_predictors = row[['x','y','z','DISCOVERY_DOY_SCALED']]
    closest_weather_time = weather_1992_kd.query(row_predictors,1)
    closest_weather = weather_1992_kd_loc.query(row[['x','y','z']],1)
    predicted_weather_point = weather_1992_kd.data[closest_weather[1]]
    closest_weather_station = weather_1992_kd_loc.data[closest_weather[1]]
    print(weather_1992_kd.data[closest_weather_time[1]])
    #print(weather_1992.iloc[closest_weather_time[1]])
    print(closest_weather_station)
    print(row[['LATITUDE','LONGITUDE','x','y','z','DISCOVERY_DOY_SCALED']])
    time_sep.append((predicted_weather_point[3]-row.DISCOVERY_DOY_SCALED)/day_cartesian_scale_factor)
    day_sep.append(math.sqrt(pow(predicted_weather_point[0]-closest_weather_station[0],2)+\
    pow(predicted_weather_point[1]-closest_weather_station[1],2)+\
    pow(predicted_weather_point[2]-closest_weather_station[2],2)))
    print("______________________________")
print(max(day_sep))
print(max(time_sep))

[-1719.02395065 -4139.05302654  4522.37611224  4919.16164384]
[-1719.02395065 -4139.05302654  4522.37611224]
LATITUDE                  45.36
LONGITUDE              -113.078
x                      -1753.67
y                      -4115.74
z                       4530.35
DISCOVERY_DOY_SCALED    4884.27
Name: 41875, dtype: object
______________________________
[-1819.0603937  -4108.23351129  4511.33302925  7605.51232877]
[-1722.47496116 -4196.28107868  4467.99665587]
LATITUDE                  44.54
LONGITUDE              -112.683
x                      -1750.08
y                      -4187.12
z                       4465.86
DISCOVERY_DOY_SCALED    7605.51
Name: 41876, dtype: object
______________________________
[-1819.0603937  -4108.23351129  4511.33302925  9733.66027397]
[-1722.47496116 -4196.28107868  4467.99665587]
LATITUDE                44.5167
LONGITUDE              -112.983
x                      -1772.69
y                      -4179.58
z                       4464.01
DISCOVERY_DOY

In [None]:
fires_1992

In [87]:
day_sep = []
time_sep = []
predicted_weather_points = pd.DataFrame()
for i,row in fires_1992.head(10).iterrows():
    row_predictors = row[['x','y','z','DISCOVERY_DOY_SCALED']]
    closest_weather_time = weather_1992_kd.query(row_predictors,1)
    predicted_weather_point = weather_1992.iloc[closest_weather_time[1]]
    predicted_weather_points = predicted_weather_points.append(predicted_weather_point)

In [98]:
predicted_weather_points

Unnamed: 0.2,Begin,Country,Day,DewPoint,Elevation,End,Fog,FunnelCloud,Gust,Hail,...,Unnamed: 0,Unnamed: 0.1,Visibility,WBAN,Windspeed,Year,doy,x,y,z
674342,19730101.0,US,21.0,40.7,1591.7,20100801.0,0.0,0.0,999.9,0.0,...,674342.0,494048.0,43.5,24138.0,8.5,1992.0,4919.161644,-1719.023951,-4139.053027,4522.376112
678120,20060101.0,US,6.0,36.5,1232.6,20190416.0,0.0,0.0,19.8,0.0,...,678120.0,496688.0,10.0,99999.0,4.1,1992.0,7605.512329,-1819.060394,-4108.233511,4511.333029
678242,20060101.0,US,6.0,33.8,1232.6,20190416.0,0.0,0.0,22.0,0.0,...,678242.0,496749.0,8.9,99999.0,5.7,1992.0,9733.660274,-1819.060394,-4108.233511,4511.333029
678280,20060101.0,US,25.0,26.2,1232.6,20190416.0,0.0,0.0,999.9,0.0,...,678280.0,496768.0,4.5,99999.0,0.7,1992.0,10396.526027,-1819.060394,-4108.233511,4511.333029
674043,19770301.0,US,8.0,33.5,1686.5,20100801.0,0.0,0.0,22.9,0.0,...,674043.0,493749.0,25.7,24135.0,5.6,1992.0,5547.139726,-1693.70838,-4088.771896,4577.333865
674051,19770301.0,US,16.0,44.5,1686.5,20100801.0,1.0,0.0,22.9,0.0,...,674051.0,493757.0,9.6,24135.0,11.1,1992.0,5826.241096,-1693.70838,-4088.771896,4577.333865
674061,19770301.0,US,26.0,48.3,1686.5,20100801.0,0.0,0.0,19.8,0.0,...,674061.0,493767.0,23.3,24135.0,5.6,1992.0,6175.117808,-1693.70838,-4088.771896,4577.333865
674061,19770301.0,US,26.0,48.3,1686.5,20100801.0,0.0,0.0,19.8,0.0,...,674061.0,493767.0,23.3,24135.0,5.6,1992.0,6175.117808,-1693.70838,-4088.771896,4577.333865
674101,19770301.0,US,5.0,33.6,1686.5,20100801.0,0.0,0.0,999.9,0.0,...,674101.0,493807.0,23.1,24135.0,9.1,1992.0,7570.624658,-1693.70838,-4088.771896,4577.333865
674117,19770301.0,US,21.0,50.0,1686.5,20100801.0,0.0,0.0,21.0,0.0,...,674117.0,493823.0,21.8,24135.0,10.2,1992.0,8128.827397,-1693.70838,-4088.771896,4577.333865


In [99]:
fires_1992.head(10)

Unnamed: 0,FOD_ID,FIRE_NAME,STAT_CAUSE_DESCR,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,CONTAINED,DISCOVERY_MONTH,DISTANCE_CITY_1000000,DISTANCE_CITY_100000,DISTANCE_CITY_10000,DISTANCE_CITY_1000,APPROX_ELEVATION,DISCOVERY_DOY_SCALED,x,y,z
41875,42087,,Lightning,1992,140,1.0,45.36,-113.078333,MT,True,5,10.781239,4.750138,0.843216,0.464428,1820.0,4884.273973,-1753.670688,-4115.735442,4530.347657
41876,42088,,Lightning,1992,218,0.6,44.54,-112.683333,MT,True,8,11.108428,3.861481,1.144407,0.677997,2068.0,7605.512329,-1750.081193,-4187.124525,4465.858599
41877,42089,,Campfire,1992,279,0.1,44.516667,-112.983333,MT,True,10,11.105679,3.911656,1.379697,0.78089,1820.0,9733.660274,-1772.690821,-4179.577742,4464.010098
41878,42090,,Campfire,1992,298,0.1,44.69,-112.73,MT,True,10,11.26083,4.017896,1.277495,0.534599,2068.0,10396.526027,-1748.967464,-4174.899812,4477.724128
41879,42091,,Campfire,1992,159,0.1,45.763333,-112.82,MT,True,6,10.886766,4.979875,0.3738,0.3738,1686.5,5547.139726,-1722.684484,-4094.103393,4561.728244
41880,42092,,Lightning,1992,167,0.1,45.75,-112.825,MT,True,6,10.886309,4.980347,0.386271,0.386271,1686.5,5826.241096,-1723.453508,-4094.93137,4560.694474
41881,42093,,Lightning,1992,177,0.5,45.693333,-112.716667,MT,True,6,11.007126,5.001247,0.36035,0.36035,1686.5,6175.117808,-1717.44889,-4102.341413,4556.298195
41882,42094,,Lightning,1992,177,0.5,45.705,-112.676667,MT,True,6,11.041236,5.006315,0.33124,0.33124,1686.5,6175.117808,-1714.226782,-4102.683293,4557.203676
41883,42095,,Campfire,1992,217,0.1,45.941667,-113.166667,MT,True,8,10.501405,4.591542,0.63583,0.292743,1686.5,7570.624658,-1741.829865,-4070.5322,4575.531153
41884,42096,,Lightning,1992,233,0.3,45.545,-113.11,MT,True,8,10.68617,4.804972,0.736534,0.57622,1820.0,8128.827397,-1750.194594,-4101.290142,4544.769197


In [97]:
pd.concat([fires_1992.head(10),predicted_weather_points],axis=1,sort=False)

Unnamed: 0.2,index,FOD_ID,FIRE_NAME,STAT_CAUSE_DESCR,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,...,Unnamed: 0,Unnamed: 0.1,Visibility,WBAN,Windspeed,Year,doy,x,y,z
0,41875,42087,,Lightning,1992,140,1.0,45.36,-113.078333,MT,...,674342.0,494048.0,43.5,24138.0,8.5,1992.0,4919.161644,-1719.023951,-4139.053027,4522.376112
1,41876,42088,,Lightning,1992,218,0.6,44.54,-112.683333,MT,...,678120.0,496688.0,10.0,99999.0,4.1,1992.0,7605.512329,-1819.060394,-4108.233511,4511.333029
2,41877,42089,,Campfire,1992,279,0.1,44.516667,-112.983333,MT,...,678242.0,496749.0,8.9,99999.0,5.7,1992.0,9733.660274,-1819.060394,-4108.233511,4511.333029
3,41878,42090,,Campfire,1992,298,0.1,44.69,-112.73,MT,...,678280.0,496768.0,4.5,99999.0,0.7,1992.0,10396.526027,-1819.060394,-4108.233511,4511.333029
4,41879,42091,,Campfire,1992,159,0.1,45.763333,-112.82,MT,...,674043.0,493749.0,25.7,24135.0,5.6,1992.0,5547.139726,-1693.70838,-4088.771896,4577.333865
5,41880,42092,,Lightning,1992,167,0.1,45.75,-112.825,MT,...,674051.0,493757.0,9.6,24135.0,11.1,1992.0,5826.241096,-1693.70838,-4088.771896,4577.333865
6,41881,42093,,Lightning,1992,177,0.5,45.693333,-112.716667,MT,...,674061.0,493767.0,23.3,24135.0,5.6,1992.0,6175.117808,-1693.70838,-4088.771896,4577.333865
7,41882,42094,,Lightning,1992,177,0.5,45.705,-112.676667,MT,...,674061.0,493767.0,23.3,24135.0,5.6,1992.0,6175.117808,-1693.70838,-4088.771896,4577.333865
8,41883,42095,,Campfire,1992,217,0.1,45.941667,-113.166667,MT,...,674101.0,493807.0,23.1,24135.0,9.1,1992.0,7570.624658,-1693.70838,-4088.771896,4577.333865
9,41884,42096,,Lightning,1992,233,0.3,45.545,-113.11,MT,...,674117.0,493823.0,21.8,24135.0,10.2,1992.0,8128.827397,-1693.70838,-4088.771896,4577.333865


Run KD Tree and merge

In [100]:
day_sep = []
time_sep = []
predicted_weather_points = pd.DataFrame()
for i,row in fires_1992.iterrows():
    row_predictors = row[['x','y','z','DISCOVERY_DOY_SCALED']]
    closest_weather_time = weather_1992_kd.query(row_predictors,1)
    predicted_weather_point = weather_1992.iloc[closest_weather_time[1]]
    predicted_weather_points = predicted_weather_points.append(predicted_weather_point)

In [105]:
fires_1992.shape

(67975, 20)

In [106]:
predicted_weather_points.shape

(67975, 39)

In [130]:
fires_1992=fires_1992.rename(columns={'x':'x_fire','y':'y_fire','z':'z_fire'})

In [153]:
fires_1992_merged = fires_1992.reset_index().merge(predicted_weather_points.reset_index(),left_index=True,right_index=True)

In [154]:
fires_1992_merged

Unnamed: 0.2,index_x,FOD_ID,FIRE_NAME,STAT_CAUSE_DESCR,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,...,Unnamed: 0,Unnamed: 0.1,Visibility,WBAN,Windspeed,Year,doy,x,y,z
0,41875,42087,,Lightning,1992,140,1.0,45.360000,-113.078333,MT,...,674342.0,494048.0,43.5,24138.0,8.5,1992.0,4919.161644,-1719.023951,-4139.053027,4522.376112
1,41876,42088,,Lightning,1992,218,0.6,44.540000,-112.683333,MT,...,678120.0,496688.0,10.0,99999.0,4.1,1992.0,7605.512329,-1819.060394,-4108.233511,4511.333029
2,41877,42089,,Campfire,1992,279,0.1,44.516667,-112.983333,MT,...,678242.0,496749.0,8.9,99999.0,5.7,1992.0,9733.660274,-1819.060394,-4108.233511,4511.333029
3,41878,42090,,Campfire,1992,298,0.1,44.690000,-112.730000,MT,...,678280.0,496768.0,4.5,99999.0,0.7,1992.0,10396.526027,-1819.060394,-4108.233511,4511.333029
4,41879,42091,,Campfire,1992,159,0.1,45.763333,-112.820000,MT,...,674043.0,493749.0,25.7,24135.0,5.6,1992.0,5547.139726,-1693.708380,-4088.771896,4577.333865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67970,1861638,300296712,,Missing/Undefined,1992,155,10.0,21.366400,-157.839188,HI,...,52537.0,32925.0,999.9,25626.0,21.6,1992.0,5407.589041,-3763.781338,-741.836070,5081.566519
67971,1861639,300296713,,Missing/Undefined,1992,184,10.0,21.323521,-157.819824,HI,...,52552.0,32940.0,999.9,25626.0,12.7,1992.0,5930.904110,-3763.781338,-741.836070,5081.566519
67972,1864564,300299713,,Missing/Undefined,1992,219,0.1,21.497620,-158.187042,HI,...,53061.0,33230.0,14.8,99999.0,6.3,1992.0,7640.400000,-3648.854178,-873.116633,5144.144199
67973,1864565,300299714,,Missing/Undefined,1992,221,0.1,21.497620,-158.187042,HI,...,53065.0,33232.0,8.7,99999.0,5.7,1992.0,7710.175342,-3648.854178,-873.116633,5144.144199


In [155]:
fires_1992_merged.head()

Unnamed: 0.2,index_x,FOD_ID,FIRE_NAME,STAT_CAUSE_DESCR,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,...,Unnamed: 0,Unnamed: 0.1,Visibility,WBAN,Windspeed,Year,doy,x,y,z
0,41875,42087,,Lightning,1992,140,1.0,45.36,-113.078333,MT,...,674342.0,494048.0,43.5,24138.0,8.5,1992.0,4919.161644,-1719.023951,-4139.053027,4522.376112
1,41876,42088,,Lightning,1992,218,0.6,44.54,-112.683333,MT,...,678120.0,496688.0,10.0,99999.0,4.1,1992.0,7605.512329,-1819.060394,-4108.233511,4511.333029
2,41877,42089,,Campfire,1992,279,0.1,44.516667,-112.983333,MT,...,678242.0,496749.0,8.9,99999.0,5.7,1992.0,9733.660274,-1819.060394,-4108.233511,4511.333029
3,41878,42090,,Campfire,1992,298,0.1,44.69,-112.73,MT,...,678280.0,496768.0,4.5,99999.0,0.7,1992.0,10396.526027,-1819.060394,-4108.233511,4511.333029
4,41879,42091,,Campfire,1992,159,0.1,45.763333,-112.82,MT,...,674043.0,493749.0,25.7,24135.0,5.6,1992.0,5547.139726,-1693.70838,-4088.771896,4577.333865


In [156]:
fires_1992_merged.to_csv(dataLoc+'/fires_weather/fires_merged_weather.csv')

### Functionalize the above code so it can be run for every year

In [158]:
new_cols = ['x','y','z']
for yr in range(1993,2016):
    fires_yr = fires[fires.FIRE_YEAR==yr]

    fires_yr['DISCOVERY_DOY_SCALED'] = fires_yr.DISCOVERY_DOY*day_cartesian_scale_factor

    weather_yr = pd.read_csv('./data/weather/global_weather_data_' + str(yr) + '.csv')

    days_per_month = [31,28,31,30,31,30,31,31,30,31,30,31]
    weather_yr['doy'] = (weather_yr.Month.apply(lambda x:sum(days_per_month[0:x-1]))+weather_yr.Day)*day_cartesian_scale_factor

    weather_cartesian = weather_yr[['Latitude','Longitude']].apply(to_Cartesian,axis=1)
    for i in range(len(new_cols)):
        weather_yr[new_cols[i]]=[r[i] for r in weather_cartesian]

    fires_cartesian = fires_yr[['LATITUDE','LONGITUDE']].apply(to_Cartesian,axis=1)
    new_cols = ['x','y','z']
    for i in range(len(new_cols)):
        fires_yr[new_cols[i]]=[r[i] for r in fires_cartesian]

    weather_yr_kd = KD(weather_yr[['x','y','z','doy']])

    predicted_weather_points = pd.DataFrame()
    for i,row in fires_yr.iterrows():
        row_predictors = row[['x','y','z','DISCOVERY_DOY_SCALED']]
        closest_weather_time = weather_yr_kd.query(row_predictors,1)
        predicted_weather_point = weather_yr.iloc[closest_weather_time[1]]
        predicted_weather_points = predicted_weather_points.append(predicted_weather_point)

    fires_yr=fires_yr.rename(columns={'x':'x_fire','y':'y_fire','z':'z_fire'})
    fires_yr_merged = fires_yr.reset_index().merge(predicted_weather_points.reset_index(),left_index=True,right_index=True)
    fires_yr_merged.to_csv(dataLoc+'fires-weather-merged/fires_merged_weather_' + str(yr) + '.csv')

In [160]:
merged_fires_weather = pd.DataFrame()
for yr in range(1992,2016):
    fire_weather_df = pd.read_csv(dataLoc+'fires-weather-merged/fires_merged_weather_'+str(yr)+'.csv')
    merged_fires_weather = merged_fires_weather.append(fire_weather_df)


In [162]:
merged_fires_weather.to_csv(dataLoc+'fires_merged_weather.csv')