In [1]:
from pyspark.sql import SparkSession as ps
import datetime
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import pyspark as spark
import pyspark.sql.functions as f
import sqlite3



# EXTRACT

The sqlite database is connected, and a cursor is created to select tables from the database. 

In [2]:
conn = sqlite3.connect('FPA_FOD_20170508.sqlite') #Creation of the connector
cursor = conn.cursor() #Creation of cursor to directly communicate with sqlite data

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") #query to select the tables.

<sqlite3.Cursor at 0x19f64220570>

Next, we loop through, and print the name of each table. I checked each one manually, and through the use of the source where the data was received from, to determine which tables would be most relevant to work with.

In [3]:
table_names = cursor.fetchall() #Fetches each of the tables and places them into a list

for name in table_names: #loops throught the list and prints each table name
    print(name[0])

spatial_ref_sys
spatialite_history
sqlite_sequence
geometry_columns
spatial_ref_sys_aux
views_geometry_columns
virts_geometry_columns
geometry_columns_statistics
views_geometry_columns_statistics
virts_geometry_columns_statistics
geometry_columns_field_infos
views_geometry_columns_field_infos
virts_geometry_columns_field_infos
geometry_columns_time
geometry_columns_auth
views_geometry_columns_auth
virts_geometry_columns_auth
sql_statements_log
SpatialIndex
ElementaryGeometries
KNN
Fires
idx_Fires_Shape
idx_Fires_Shape_node
idx_Fires_Shape_rowid
idx_Fires_Shape_parent
NWCG_UnitIDActive_20170109


Selected the only relevant Table from the list, and converted it to dataframe.

In [4]:
query = "SELECT * FROM Fires;" #Selects the table named Fires from the sqlite file
dfpd = pd.read_sql_query(query, conn) #Creates Pandas dataframe based off the Fires table

dfpd #prints the table

Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,...,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,OWNER_CODE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,Shape
0,1,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,0511,Plumas National Forest,...,A,40.036944,-121.005833,5.0,USFS,CA,63,063,Plumas,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...
1,2,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.933056,-120.404444,5.0,USFS,CA,61,061,Placer,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...
2,3,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.984167,-120.735556,13.0,STATE OR PRIVATE,CA,17,017,El Dorado,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...
3,4,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.559167,-119.913333,5.0,USFS,CA,3,003,Alpine,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...
4,5,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.559167,-119.933056,5.0,USFS,CA,3,003,Alpine,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880460,1880461,300348363,2015CAIRS29019636,NONFED,ST-CACDF,ST/C&L,USCASHU,Shasta-Trinity Unit,CASHU,Shasta-Trinity Unit,...,A,40.481637,-122.389375,13.0,STATE OR PRIVATE,CA,,,,b'\x00\x01\xad\x10\x00\x00P\xb8\x1e\x85\xeb\x9...
1880461,1880462,300348373,2015CAIRS29217935,NONFED,ST-CACDF,ST/C&L,USCATCU,Tuolumne-Calaveras Unit,CATCU,Tuolumne-Calaveras Unit,...,A,37.617619,-120.938570,12.0,MUNICIPAL/LOCAL,CA,,,,b'\x00\x01\xad\x10\x00\x00\x00\x80\xbe\x88\x11...
1880462,1880463,300348375,2015CAIRS28364460,NONFED,ST-CACDF,ST/C&L,USCATCU,Tuolumne-Calaveras Unit,CATCU,Tuolumne-Calaveras Unit,...,A,37.617619,-120.938570,12.0,MUNICIPAL/LOCAL,CA,,,,b'\x00\x01\xad\x10\x00\x00\x00\x80\xbe\x88\x11...
1880463,1880464,300348377,2015CAIRS29218079,NONFED,ST-CACDF,ST/C&L,USCATCU,Tuolumne-Calaveras Unit,CATCU,Tuolumne-Calaveras Unit,...,B,37.672235,-120.898356,12.0,MUNICIPAL/LOCAL,CA,,,,b'\x00\x01\xad\x10\x00\x00x\xba_\xaa~9^\xc0\xb...


# TRANSFORM

Converted and merged the time information into datetime format.

DISCOVERY_DATETIME: The exact date that the fire was discovered.

CONT_DATETIME: The exact date that the fire was contained.

In [5]:
#Creates new collumn named DISCOVERY_DATETIME by taking the FIRE_YEAR converted into datetime, then adding the DISCOVERY_DOY as time delta in days.
dfpd['DISCOVERY_DATETIME'] = pd.to_datetime(dfpd['FIRE_YEAR'], format= '%Y') + pd.to_timedelta(dfpd['DISCOVERY_DOY'], unit = 'D')
#Creates new collumn named CONT_DATETIME by taking the FIRE_YEAR converted into datetime, then adding the CONT_DOY as time delta in days.
dfpd['CONT_DATETIME'] = pd.to_datetime(dfpd['FIRE_YEAR'], format= '%Y') + pd.to_timedelta(dfpd['CONT_DOY'], unit = 'D')


Creates an elapsed time based on when the fire was discovered and contained.

FIRE_ELAPSED_TIME: The amount of days between when the fire was discovered and contained.

Note: If the fire was discovered and contaiened within the same day, it appears as 0.

In [6]:
# Finds the difference between CONT_DATETIME and DISCOVERY_DATETIME
dfpd['FIRE_ELAPSED_TIME'] = (dfpd['CONT_DATETIME'] - dfpd['DISCOVERY_DATETIME']).dt.days
dfpd[dfpd["FIRE_ELAPSED_TIME"] < 0]

Solves the issue where some of the dates where entered incorrectly, and result in negative datetimes. 

In [8]:
#Creation of mask for finding the conditional location where negative fires are negative and not null.
mask = (dfpd['FIRE_ELAPSED_TIME'] < 0) & (dfpd['FIRE_ELAPSED_TIME'].isnull() == False)
temp_datetime = dfpd.loc[mask, 'DISCOVERY_DATETIME'] #Temporarily stores the discovery date

#Replaces each instance where the discovery time happens before the contained time.
dfpd.loc[mask, 'DISCOVERY_DATETIME'] = dfpd.loc[mask, 'CONT_DATETIME']
dfpd.loc[mask, 'CONT_DATETIME'] = temp_datetime # Replaces the Contained datetime with initial values stored in the temporary datetime

dfpd[dfpd["FIRE_ELAPSED_TIME"] < 0] #Checks to see if the problem is solved.

Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,...,OWNER_CODE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,Shape,DISCOVERY_DATETIME,CONT_DATETIME,FIRE_ELAPSED_TIME
6473,6474,6503,FS-1431059,FED,FS-FIRESTAT,FS,USNMGNF,Gila National Forest,0306,Gila National Forest,...,5.0,USFS,NM,3,003,Catron,b'\x00\x01\xad\x10\x00\x00\xec>\xae\xaa\xaa.[\...,2005-01-04,2005-12-31,-361.0
6712,6713,6745,FS-1431668,FED,FS-FIRESTAT,FS,USMSMNF,National Forests in Mississippi,0807,National Forests in Mississippi,...,5.0,USFS,MS,111,111,Perry,b'\x00\x01\xad\x10\x00\x00\xbc\x1cU\xd1HHV\xc0...,2005-01-02,2005-12-31,-363.0
6713,6714,6746,FS-1431670,FED,FS-FIRESTAT,FS,USMSMNF,National Forests in Mississippi,0807,National Forests in Mississippi,...,5.0,USFS,MS,111,111,Perry,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_DV\xc0...,2005-01-02,2006-01-01,-364.0
25280,25281,25401,FS-1456768,FED,FS-FIRESTAT,FS,USWAOWF,Okanogan/Wenatchee National Forest,0617,Okanogan-Wenatchee National Forest,...,5.0,USFS,WA,7,007,Chelan,b'\x00\x01\xad\x10\x00\x00hd6L]&^\xc0H\x08\xa7...,2007-01-08,2007-08-06,-210.0
26509,26510,26638,FS-1458256,FED,FS-FIRESTAT,FS,USCASHF,Shasta-Trinity National Forest,0514,Shasta-Trinity National Forest,...,5.0,USFS,CA,105,105,Trinity,b'\x00\x01\xad\x10\x00\x00T\xa3\xe4\xf6\x07\xc...,2007-01-04,2007-12-31,-361.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782784,1782785,300138828,SFO-2014WADNR894,NONFED,ST-NASF,ST/C&L,USWAWAS,Washington State Headquarters,WADNR,Washington Department of Natural Resources,...,8.0,PRIVATE,WA,045,045,Mason,b'\x00\x01\xad\x10\x00\x00\x90\xed|?5\xde^\xc0...,2014-01-24,2014-11-17,-297.0
1782785,1782786,300138829,SFO-2014WADNR344,NONFED,ST-NASF,ST/C&L,USWAWAS,Washington State Headquarters,WADNR,Washington Department of Natural Resources,...,8.0,PRIVATE,WA,047,047,Okanogan,b'\x00\x01\xad\x10\x00\x00\xec8EGr\xf9]\xc0\x0...,2014-02-11,2014-07-17,-156.0
1782786,1782787,300138830,SFO-2014WADNR460,NONFED,ST-NASF,ST/C&L,USWAWAS,Washington State Headquarters,WADNR,Washington Department of Natural Resources,...,8.0,PRIVATE,WA,047,047,Okanogan,b'\x00\x01\xad\x10\x00\x00\xd0Mb\x10X\t^\xc0`B...,2014-02-11,2014-08-02,-172.0
1837358,1837359,300259981,SFO-2015NJDEPB122102,NONFED,ST-NASF,ST/C&L,USNJNJS,New Jersey Forest Fire Service,NJDEP,New Jersey Department of Environmental Protection,...,7.0,STATE,NJ,Ocean,029,Ocean,b'\x00\x01\xad\x10\x00\x00\x04\xce\x19Q\xda\x8...,2015-01-05,2015-12-22,-351.0


Some instances still exist despite the fix, this is because the elapsed time crosses over into previous year, or overlaps in some instances.
To solve this issue, we simply add a year to the elapsed time to fix this issue. 

In [32]:
#Mask created for every remaining instance where the time elapsed is negative. 
mask2 = (dfpd['FIRE_ELAPSED_TIME'] < 0) & (dfpd['FIRE_ELAPSED_TIME'].isnull() == False) 
dfpd.loc[mask2, 'FIRE_ELAPSED_TIME'] = dfpd["FIRE_ELAPSED_TIME"] + 365 #Adds 365 to the value to make corrections. 


Even though this solves the negative value issue, this doesnt properly portray the elapsed time. So we find every instance that is larger than the max reported elapsed fire containment time found online. And subtract that by 365

In [33]:
mask3 = (dfpd['FIRE_ELAPSED_TIME'] > 175) & (dfpd['FIRE_ELAPSED_TIME'] > 175) & (dfpd['FIRE_ELAPSED_TIME'].isnull() == False)
dfpd.loc[mask3, 'FIRE_ELAPSED_TIME'] = 365 - dfpd["FIRE_ELAPSED_TIME"] 

This appears to solve the issue as the time elapsed matches the actual time elapsed by portrayed by the dates. 

In [34]:
dfpd[dfpd["FIRE_ELAPSED_TIME"] > 175].sort_values('FIRE_ELAPSED_TIME')

Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,...,OWNER_CODE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,Shape,DISCOVERY_DATETIME,CONT_DATETIME,FIRE_ELAPSED_TIME
1781163,1781164,300136386,SFO-2014NY5724NY5724-2014-192,NONFED,ST-NASF,ST/C&L,USNYNYX,Fire Department of New York,NY5724,Fire Department of New York,...,14.0,MISSING/NOT SPECIFIED,NY,WARREN,113,Warren,b'\x00\x01\xad\x10\x00\x00h\x9aw\x9c\xa2kR\xc0...,2014-04-11,2014-10-04,176.0
1213642,1213643,1470823,W-618074,FED,DOI-WFMI,BIA,USWASPA,Spokane Agency,WASPA,Spokane Agency,...,2.0,BIA,WA,,,,b'\x00\x01\xad\x10\x00\x00\x8c\xc2\xf5(\\s]\xc...,2010-03-21,2010-09-13,176.0
1771550,1771551,300117039,SFO-2014NY2401NY2401-2014-436965,NONFED,ST-NASF,ST/C&L,USNYNYX,Fire Department of New York,NY2401,Fire Department of New York,...,14.0,MISSING/NOT SPECIFIED,NY,QUEENS,081,Queens,b'\x00\x01\xad\x10\x00\x00\xd4\xf0\xf4JYnR\xc0...,2014-05-20,2014-11-12,176.0
1771471,1771472,300116938,SFO-2014NY6019NY6019-2014-279,NONFED,ST-NASF,ST/C&L,USNYNYX,Fire Department of New York,NY6019,Fire Department of New York,...,14.0,MISSING/NOT SPECIFIED,NY,WESTCHESTER,119,Westchester,b'\x00\x01\xad\x10\x00\x00`;\xdfO\x8doR\xc0\x0...,2014-05-14,2014-11-06,176.0
1782503,1782504,300138395,SFO-2014NY3043NY3043-2014-1016,NONFED,ST-NASF,ST/C&L,USNYNYX,Fire Department of New York,NY3043,Fire Department of New York,...,14.0,MISSING/NOT SPECIFIED,NY,NASSAU,059,Nassau,b'\x00\x01\xad\x10\x00\x00\x0cO\xaf\x94elR\xc0...,2014-06-15,2014-12-08,176.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1781522,1781523,300136923,SFO-2014NY4211NY4211-2014-5170,NONFED,ST-NASF,ST/C&L,USNYNYX,Fire Department of New York,NY4211,Fire Department of New York,...,14.0,MISSING/NOT SPECIFIED,NY,RENSSELAER,083,Rensselaer,b'\x00\x01\xad\x10\x00\x004\x89A`\xe5lR\xc0\xa...,2014-04-22,2014-10-28,189.0
1737896,1737897,300075057,SFO-2014NY2347NY2347-2014-1493,NONFED,ST-NASF,ST/C&L,USNYNYX,Fire Department of New York,NY2347,Fire Department of New York,...,14.0,MISSING/NOT SPECIFIED,NY,JEFFERSON,045,Jefferson,b'\x00\x01\xad\x10\x00\x00\xfce\xf7\xe4a\xf9R\...,2014-01-04,2014-07-12,189.0
1213640,1213641,1470821,W-618070,FED,DOI-WFMI,BIA,USWASPA,Spokane Agency,WASPA,Spokane Agency,...,2.0,BIA,WA,,,,b'\x00\x01\xad\x10\x00\x00L\x1e\x16jM\x83]\xc0...,2010-03-11,2010-09-16,189.0
1745410,1745411,300085360,SFO-2014NY1434NY1434-2014-1400189,NONFED,ST-NASF,ST/C&L,USNYNYX,Fire Department of New York,NY1434,Fire Department of New York,...,14.0,MISSING/NOT SPECIFIED,NY,DUTCHESS,027,Dutchess,b'\x00\x01\xad\x10\x00\x00\xc8\xe5?\xa4\xdfzR\...,2014-04-21,2014-10-27,189.0


To make the NWCG reporting agency more usable, I replaced the acronyms with the actual terms. 

Below, I replaced the the nwcg reporting agency values, with their full names.

In [38]:
dfpd['NWCG_REPORTING_AGENCY'] = dfpd['NWCG_REPORTING_AGENCY'].str.replace("BIA","Bureau of Indian Affairs") \
                            .str.replace("BLM","Bureau of Land Management") \
                            .str.replace("BOR"," Bureau of Reclamation") \
                            .str.replace("DOD","Department of Defense") \
                            .str.replace("DOE","Department of Energy") \
                            .str.replace("FS","Forest Service") \
                            .str.replace("FWS","Fish and Wildlife Service") \
                            .str.replace("IA"," Interagency Organization") \
                            .str.replace("NPS","National Park Service") \
                            .str.replace("ST/C&L"," State, County, or Local Organization") \
                            .str.replace("TRIBE","Tribal Organization")


Next, I found each collumn that is was never or is no longer useful for analysis.

In [39]:
dfpd = dfpd.drop(['NWCG_REPORTING_UNIT_ID'], axis=1)
dfpd = dfpd.drop(['OBJECTID'], axis=1)
dfpd = dfpd.drop(['FOD_ID'], axis=1)
dfpd = dfpd.drop(['FPA_ID'], axis=1)
dfpd = dfpd.drop(['SOURCE_REPORTING_UNIT'], axis=1)
dfpd = dfpd.drop(['DISCOVERY_DOY'], axis=1)
dfpd = dfpd.drop(['CONT_DOY'], axis=1)
dfpd = dfpd.drop(['Shape'], axis=1)
dfpd = dfpd.drop(['DISCOVERY_DATE'], axis=1)
dfpd = dfpd.drop(['CONT_DATE'], axis=1)
dfpd = dfpd.drop(['FIPS_NAME'], axis=1)
dfpd = dfpd.drop(['FIPS_CODE'], axis=1)

Below, we prin the information showing each of the remaining collumns within the dataframe.

In [40]:
dfpd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880465 entries, 0 to 1880464
Data columns (total 30 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   SOURCE_SYSTEM_TYPE          object        
 1   SOURCE_SYSTEM               object        
 2   NWCG_REPORTING_AGENCY       object        
 3   NWCG_REPORTING_UNIT_NAME    object        
 4   SOURCE_REPORTING_UNIT_NAME  object        
 5   LOCAL_FIRE_REPORT_ID        object        
 6   LOCAL_INCIDENT_ID           object        
 7   FIRE_CODE                   object        
 8   FIRE_NAME                   object        
 9   ICS_209_INCIDENT_NUMBER     object        
 10  ICS_209_NAME                object        
 11  MTBS_ID                     object        
 12  MTBS_FIRE_NAME              object        
 13  COMPLEX_NAME                object        
 14  FIRE_YEAR                   int64         
 15  DISCOVERY_TIME              object        
 16  STAT_CAUSE_CODE   

Now that the data has been cleaned and/replaced.

Why dont we take a look at the data, perform some queries. 

Top 10 states with the highest average fire size.

In [16]:
dfpd.groupby('STATE')['FIRE_SIZE'].describe().sort_values('mean').tail(10)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
STATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NE,7973.0,121.334348,1848.199585,0.0001,0.3,1.0,8.8,74500.0
UT,30725.0,133.927628,2648.867747,0.001,0.1,0.1,1.0,357185.0
OR,61088.0,137.716104,4225.904392,0.01,0.1,0.1,0.3,558198.3
WA,33513.0,142.69741,3120.713833,0.01,0.1,0.2,1.0,255575.0
MT,40767.0,154.034541,2864.760995,0.01,0.1,0.2,1.0,249562.0
WY,14166.0,169.523555,2195.531021,0.0092,0.1,0.5,3.0,137069.0
NM,37478.0,170.234318,2858.989052,0.009,0.1,0.3,2.5,297845.0
ID,36698.0,372.890482,5643.33194,0.001,0.1,0.2,2.0,367785.0
NV,16956.0,531.72065,5888.665139,0.01,0.1,0.1,3.0,238462.6
AK,12843.0,2509.779198,19600.650646,0.01,0.1,0.2,5.0,606945.0


Top 10 highest average fire size for complexes in US Territory. 

In [41]:
dfpd.groupby('COMPLEX_NAME')['FIRE_SIZE'].describe().sort_values('mean').tail(20)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
COMPLEX_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PONIL COMPLEX,1.0,92194.0,,92194.0,92194.0,92194.0,92194.0,92194.0
EUREKA COMPLEX,1.0,95792.8,,95792.8,95792.8,95792.8,95792.8,95792.8
CENTRAL COMPLEX,8.0,98697.2125,102307.675765,1544.0,24796.5,44709.5,205137.675,243254.0
MURPHY COMPLEX,7.0,103298.3,126814.185868,9.0,30725.55,68079.0,112882.0,367785.0
COLUMBIA COMPLEX,1.0,109259.0,,109259.0,109259.0,109259.0,109259.0,109259.0
BEAVER CREEK COMPLEX,1.0,111497.0,,111497.0,111497.0,111497.0,111497.0,111497.0
CRAZY MOUNTAIN COMPLEX,4.0,111787.425,159980.576638,2054.6,31830.5,47822.6,127779.525,349449.9
BASIN COMPLEX,2.0,122098.0,57586.77626,81378.0,101738.0,122098.0,142458.0,162818.0
BLACKJACK BAY COMPLEX,1.0,124110.0,,124110.0,124110.0,124110.0,124110.0,124110.0
BIG BAR COMPLEX,1.0,124898.0,,124898.0,124898.0,124898.0,124898.0,124898.0


Top 10 highest average fire size for complexes in Texas.

In [42]:
dfpd[dfpd['STATE'] == 'TX'].groupby('COMPLEX_NAME')['FIRE_SIZE'].describe().sort_values('mean').tail(10)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
COMPLEX_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
WILLOW CREEK SOUTH COMPLEX,1.0,24310.0,,24310.0,24310.0,24310.0,24310.0,24310.0
LEMONS GAP COMPLEX,1.0,25000.0,,25000.0,25000.0,25000.0,25000.0,25000.0
SCURRY COUNTY COMPLEX,1.0,28625.0,,28625.0,28625.0,28625.0,28625.0,28625.0
MONTAGUE COUNTY COMPLEX,1.0,31419.0,,31419.0,31419.0,31419.0,31419.0,31419.0
BASTROP COUNTY COMPLEX,1.0,34068.0,,34068.0,34068.0,34068.0,34068.0,34068.0
MATADOR WEST COMPLEX,1.0,41000.0,,41000.0,41000.0,41000.0,41000.0,41000.0
CANNON COMPLEX,1.0,63427.0,,63427.0,63427.0,63427.0,63427.0,63427.0
DICKENS COMPLEX,1.0,89200.0,,89200.0,89200.0,89200.0,89200.0,89200.0
PK COMPLEX,1.0,126734.0,,126734.0,126734.0,126734.0,126734.0,126734.0
EAST AMARILLO COMPLEX,2.0,453622.5,36665.607925,427696.0,440659.25,453622.5,466585.75,479549.0


Top 10 longest average fire time per state.

In [43]:
dfpd.groupby('STATE')['FIRE_ELAPSED_TIME'].describe().sort_values('mean').tail(10)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
STATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
UT,26586.0,1.536373,5.966061,0.0,0.0,0.0,1.0,182.0
NM,28045.0,1.848921,7.632154,0.0,0.0,0.0,1.0,186.0
WY,12406.0,2.004192,9.113463,0.0,0.0,0.0,1.0,161.0
MT,36952.0,2.019268,9.288723,0.0,0.0,0.0,1.0,163.0
NY,68949.0,2.46269,16.647317,0.0,0.0,0.0,0.0,189.0
ID,30662.0,4.60831,15.416729,0.0,0.0,1.0,1.0,186.0
WA,21363.0,5.044048,14.821223,0.0,0.0,0.0,3.0,189.0
NJ,3821.0,5.093169,6.632753,0.0,1.0,3.0,7.0,71.0
HI,166.0,10.849398,30.142212,0.0,0.0,0.0,2.0,185.0
AK,8697.0,10.852478,23.279584,0.0,0.0,1.0,8.0,174.0


Top 10 States with the most reported fires, compared with fire size stats. 

In [44]:
dfpd.groupby('STATE')['FIRE_SIZE'].describe().sort_values('count').tail(10)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
STATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AL,66570.0,13.82823,56.185035,0.01,1.0,3.0,10.0,4394.0
AZ,71586.0,77.901837,2885.760463,0.01,0.1,0.1,1.0,538049.0
MS,79230.0,14.331488,69.175085,0.01,1.0,3.0,10.0,5717.0
NY,80870.0,1.15954,24.707558,0.01,0.1,0.1,1.0,5050.0
SC,81315.0,6.661331,76.383975,0.01,0.5,2.0,5.0,19130.0
FL,90261.0,49.154653,1031.743889,0.01,0.3,1.7,6.0,158000.0
NC,111277.0,6.564892,235.843558,0.01,0.1,0.5,2.0,45294.0
TX,142021.0,68.90684,2376.391666,0.01,1.0,2.0,6.0,479549.0
GA,168867.0,9.415452,955.817799,0.01,0.2,0.8,2.7,309200.0
CA,189550.0,67.242725,2029.960435,0.001,0.1,0.25,1.0,315578.8


Top 10 NWCG most reported to agencies by state.

In [47]:
dfpd.groupby('NWCG_REPORTING_AGENCY')['STATE'].describe().sort_values('count').tail(10)

Unnamed: 0_level_0,count,unique,top,freq
NWCG_REPORTING_AGENCY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bureau of Reclamation,14,2,CA,13
Department of Defense,81,7,FL,23
Tribal Organization,3739,11,OK,2516
Fish and Wildlife Service,19331,50,CA,4027
National Park Service,20893,48,CA,3819
Interagency Organization,21841,6,PR,21802
Bureau of Land Management,97034,14,CA,20029
Bureau of Indian Affairs,119943,26,AZ,23047
Forest Service,220497,42,CA,43002
"State, County, or Local Organization",1377090,51,GA,167123


# NOTES

This database was very easy to workwith, especially since the majority of the table contained information that wasn't relevant or pertaining to the data in meaningful way, or were empty altogether.

Some the data regarding datetime may be inaccurate, and that would be due to errors in data entry when the information was collected and entered into the datalake.