In [3]:
import pandas as pd
import sqlite3
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn import preprocessing
%matplotlib inline

In [4]:
# Load the wildfire records from the FPA FOD SQLite database.
# NOTE(review): the connection is not closed in any of the cells visible here.
data = sqlite3.connect("./data/FPA_FOD_20170508.sqlite")
# Full copy of the 'fires' table (every column).
# NOTE(review): `test` is not referenced again in the cells visible here - confirm it is still needed.
test = pd.read_sql_query("SELECT * FROM 'fires'",data)
# Working frame: only the eight columns selected below.
wf = pd.read_sql_query("SELECT FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE,FIPS_NAME FROM 'Fires'", data)

In [5]:
# DISCOVERY_DATE holds Julian day numbers; origin='julian' makes pandas subtract
# the Julian date of the Unix epoch before interpreting the values as days.
wf['DISCOVERY_DATE'] = pd.to_datetime(wf['DISCOVERY_DATE'], unit='D', origin='julian')
wf['DISCOVERY_DATE']

0         2005-02-02
1         2004-05-12
2         2004-05-31
3         2004-06-28
4         2004-06-28
             ...    
1880460   2015-09-26
1880461   2015-10-05
1880462   2015-05-02
1880463   2015-10-14
1880464   2015-03-14
Name: DISCOVERY_DATE, Length: 1880465, dtype: datetime64[ns]

In [6]:
# Add the calendar month of each discovery date as a MONTH column.
wf = wf.assign(MONTH=wf['DISCOVERY_DATE'].dt.month)
wf.head()

Unnamed: 0,FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE,FIPS_NAME,MONTH
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6


In [7]:
# Add the day of the month of each discovery date as a DAY column.
wf = wf.assign(DAY=wf['DISCOVERY_DATE'].dt.day)
wf.head()

Unnamed: 0,FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE,FIPS_NAME,MONTH,DAY
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [8]:
# Number of records loaded, before any row filtering.
len(wf)

1880465

In [9]:
# wf = wf[wf['STATE'] == 'CA']
# wf['STATE'].unique()

In [10]:
# Normalise every column name to lower case.
wf.columns = [name.lower() for name in wf.columns]
wf.head()

Unnamed: 0,fire_year,stat_cause_descr,latitude,longitude,state,discovery_date,fire_size,fips_name,month,day
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [11]:
# Shorten the three verbose source column names; all other columns keep their names.
wf = wf.rename(
    columns={
        "stat_cause_descr": "cause",
        "fire_year": "year",
        "fips_name": "county",
    }
)
wf.head()

Unnamed: 0,year,cause,latitude,longitude,state,discovery_date,fire_size,county,month,day
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [12]:
# Persist the cleaned frame to CSV without the index column; the next code cell reads this file back.
wf.to_csv('./data/wildfre_cleaned.csv',index=False)

In [55]:
# Reload the cleaned frame from the CSV written by the to_csv cell.
# NOTE(review): read without parse_dates, so discovery_date presumably comes back as text rather than datetime64 - confirm.
# NOTE(review): this cell's execution count (55) is out of sequence with its neighbours (12 and 16).
wf = pd.read_csv('./data/wildfre_cleaned.csv')

In [56]:
# Summary statistics for the numeric columns.
wf.describe()

Unnamed: 0,year,latitude,longitude,fire_size,month,day
count,1880465.0,1880465.0,1880465.0,1880465.0,1880465.0,1880465.0
mean,2003.71,36.78121,-95.70494,74.52016,5.930107,15.51281
std,6.663099,6.139031,16.71694,2497.598,2.9549,8.79852
min,1992.0,17.93972,-178.8026,1e-05,1.0,1.0
25%,1998.0,32.8186,-110.3635,0.1,3.0,8.0
50%,2004.0,35.4525,-92.04304,1.0,6.0,15.0
75%,2009.0,40.8272,-82.2976,3.3,8.0,23.0
max,2015.0,70.3306,-65.25694,606945.0,12.0,31.0


In [58]:
# Count of missing values in each column.
wf.isna().sum()

year                   0
cause                  0
latitude               0
longitude              0
state                  0
discovery_date         0
fire_size              0
county            678148
month                  0
day                    0
dtype: int64

In [16]:
# Keep only the rows that have no missing value in any column.
wf = wf[wf.notna().all(axis=1)].copy()

In [17]:
# Two-letter postal codes kept for the analysis (51 entries, DC included).
states = (
    "AL AK AZ AR CA CO CT DC DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

In [18]:
# Sanity check on the number of codes in the list.
len(states)

51

In [19]:
# Discard records whose state code is not in the `states` list.
wf = wf.loc[wf['state'].isin(states)]

In [20]:
# State code of every remaining record, as a plain Python list.
slist = wf['state'].tolist()

In [21]:
# Codes listed in `states` that have no record in the filtered frame.
set(states) - set(slist)

{'DC'}

In [22]:
# Load the state shapefile (one geometry per state) into a GeoDataFrame.
fire_map = gpd.read_file('./data/tl_2019_us_state.shp')

In [23]:
# Keep only the shapes whose postal code (STUSPS) is in the `states` list.
fire_map = fire_map.loc[fire_map['STUSPS'].isin(states)]

In [24]:
# Preview the first rows of the filtered state shapes.
fire_map.head()

Unnamed: 0,REGION,DIVISION,STATEFP,STATENS,GEOID,STUSPS,NAME,LSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,3,5,54,1779805,54,WV,West Virginia,0,G4000,A,62266231560,489271086,38.6472854,-80.6183274,"POLYGON ((-81.74725 39.09538, -81.74635 39.096..."
1,3,5,12,294478,12,FL,Florida,0,G4000,A,138947364717,31362872853,28.4574302,-82.4091477,"MULTIPOLYGON (((-86.38865 30.99418, -86.38385 ..."
2,2,3,17,1779784,17,IL,Illinois,0,G4000,A,143779863817,6215723896,40.1028754,-89.1526108,"POLYGON ((-91.18529 40.63780, -91.17510 40.643..."
3,2,4,27,662849,27,MN,Minnesota,0,G4000,A,206230065476,18942261495,46.3159573,-94.1996043,"POLYGON ((-96.78438 46.63050, -96.78434 46.630..."
4,3,5,24,1714934,24,MD,Maryland,0,G4000,A,25151726296,6979340970,38.9466584,-76.6744939,"POLYGON ((-77.45881 39.22027, -77.45866 39.220..."


In [25]:
# wf=wf[wf['year'] == 2015]

In [26]:
# geometry = [Point(xy) for xy in zip(wf['longitude'],wf['latitude'])]
# geometry[:3]

In [27]:
# crs = {'init':'epsg:4326'}
# geo_df = gpd.GeoDataFrame(wf,crs=crs,geometry=geometry)
# geo_df.head()

In [28]:
# fig,ax = plt.subplots(figsize=(40,40))
# fire_map.plot(ax=ax,color='grey')
# geo_df.plot(ax=ax,markersize = 0.1, color='red',marker='o')

In [29]:
# geo_df.state.unique()

In [30]:
# Preview the cleaned, state-filtered records.
wf.head()

Unnamed: 0,year,cause,latitude,longitude,state,discovery_date,fire_size,county,month,day
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [31]:
# Independent copy of wf, used as the starting point for the per-cause aggregation.
wf_causes = wf.copy()

In [32]:
# Count fires per (county, cause, state) combination.
# The counts are built from `wf` rather than from `wf_causes` itself, so re-running
# this cell always starts from the record-level data instead of its own previous output.
wf_causes = (
    wf[['county', 'cause', 'state']]
    .groupby(['county', 'cause', 'state'])
    .size()
    .reset_index(name='count')  # name the size column directly; no in-place rename needed
)
wf_causes

Unnamed: 0,county,cause,state,count
0,Abbeville,Arson,SC,180
1,Abbeville,Campfire,SC,9
2,Abbeville,Children,SC,16
3,Abbeville,Debris Burning,SC,181
4,Abbeville,Equipment Use,SC,64
...,...,...,...,...
22905,Ziebach,Arson,SD,1
22906,Ziebach,Campfire,SD,1
22907,Ziebach,Equipment Use,SD,2
22908,Ziebach,Miscellaneous,SD,17


In [33]:
# Number of distinct county names (state is not taken into account).
wf_causes['county'].nunique()

1694

In [34]:
# Reshape to one row per (county, state) and one column per cause;
# combinations with no recorded fire are filled with 0.
wf_causes = wf_causes.pivot_table(
    index=['county', 'state'],
    columns='cause',
    values='count',
    fill_value=0,
)

In [35]:
# Inspect the (county, state) x cause count table.
wf_causes

Unnamed: 0_level_0,cause,Arson,Campfire,Children,Debris Burning,Equipment Use,Fireworks,Lightning,Miscellaneous,Missing/Undefined,Powerline,Railroad,Smoking,Structure
county,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Abbeville,SC,180,9,16,181,64,2,17,58,311,8,4,26,3
Acadia,LA,4,0,0,0,0,0,0,0,4,0,0,0,0
Accomack,VA,68,0,2,30,11,0,6,6,3,0,1,2,3
Ada,ID,384,29,4,8,327,26,292,159,76,22,12,4,0
Adair,IA,0,0,1,4,0,0,0,0,40,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yuma,AZ,6,63,1,19,19,2,18,144,46,1,1,2,0
Yuma,CO,0,0,0,1,3,1,6,36,9,0,0,3,0
Zapata,TX,1,1,8,20,29,0,1,126,0,5,0,1,0
Zavala,TX,12,1,13,44,11,0,1,66,0,2,0,5,0


In [36]:
#Normalize the data
total_causes = wf_causes.sum(axis=1)
wf_nor=wf_causes.div(wf_causes.sum(axis=1), axis=0)

In [37]:
# Keep only the (county, state) rows with more than 100 recorded fires in total.
wf_nor = wf_nor.loc[total_causes.gt(100)]
wf_nor

Unnamed: 0_level_0,cause,Arson,Campfire,Children,Debris Burning,Equipment Use,Fireworks,Lightning,Miscellaneous,Missing/Undefined,Powerline,Railroad,Smoking,Structure
county,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Abbeville,SC,0.204778,0.010239,0.018203,0.205916,0.072810,0.002275,0.019340,0.065984,0.353811,0.009101,0.004551,0.029579,0.003413
Accomack,VA,0.515152,0.000000,0.015152,0.227273,0.083333,0.000000,0.045455,0.045455,0.022727,0.000000,0.007576,0.015152,0.022727
Ada,ID,0.285927,0.021593,0.002978,0.005957,0.243485,0.019360,0.217424,0.118392,0.056590,0.016381,0.008935,0.002978,0.000000
Adair,OK,0.297753,0.001124,0.001124,0.089513,0.007116,0.000000,0.000375,0.026966,0.575281,0.000000,0.000749,0.000000,0.000000
Adams,CO,0.031873,0.007968,0.001992,0.025896,0.033865,0.023904,0.027888,0.814741,0.005976,0.000000,0.000000,0.025896,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young,TX,0.018801,0.004700,0.008226,0.143361,0.294947,0.001175,0.045828,0.378378,0.001175,0.084606,0.000000,0.018801,0.000000
Yuba,CA,0.088725,0.012939,0.022181,0.136784,0.155268,0.000000,0.035120,0.240296,0.288355,0.005545,0.000000,0.014787,0.000000
Yuma,AZ,0.018634,0.195652,0.003106,0.059006,0.059006,0.006211,0.055901,0.447205,0.142857,0.003106,0.003106,0.006211,0.000000
Zapata,TX,0.005208,0.005208,0.041667,0.104167,0.151042,0.000000,0.005208,0.656250,0.000000,0.026042,0.000000,0.005208,0.000000


In [38]:
# Standardise every cause column; `scaler` keeps the fitted parameters for reuse.
scaler = preprocessing.StandardScaler()
data_nor = scaler.fit_transform(wf_nor)
data_nor

array([[ 0.52146148, -0.524989  , -0.19778049, ..., -0.35021659,
         0.04788905,  0.11635998],
       [ 2.73148066, -0.68732542, -0.2867107 , ..., -0.27944899,
        -0.41797   ,  2.98449048],
       [ 1.09928351, -0.34496439, -0.64153257, ..., -0.24764678,
        -0.8110341 , -0.39045826],
       ...,
       [-0.80398467,  2.41471193, -0.63782542, ..., -0.38402064,
        -0.70664932, -0.39045826],
       [-0.8995791 , -0.60474804,  0.48615314, ..., -0.45667042,
        -0.73903079, -0.39045826],
       [-0.3853992 , -0.58503602,  1.71632444, ..., -0.45667042,
         0.13439265, -0.39045826]])

In [39]:
# (rows, columns) of the standardised feature matrix.
data_nor.shape

(1698, 13)

In [40]:
# Index of wf_nor: one (county, state) entry per row, in the same order as the rows of data_nor.
tup_list=wf_nor.index

In [41]:
# Number of index entries; should equal the row count of data_nor.
len(tup_list)

1698

In [42]:
# Label for each row: the county name, i.e. the first element of its (county, state) index entry.
# NOTE(review): the state is dropped, so rows for same-named counties in different
# states would share one label - confirm this is intended.
actual_label = [entry[0] for entry in tup_list]

In [43]:
def ACC_out(predict,actual):
    """Print and return the percentage of predictions that equal the true labels.

    Args:
        predict: Sequence of predicted labels (list, tuple, NumPy array, ...).
        actual: Sequence of true labels, the same length as ``predict``.

    Returns:
        The accuracy as a percentage, rounded to two decimals.

    Raises:
        ValueError: If the inputs are empty or their lengths differ.
    """
    total = len(predict)
    if total == 0:
        raise ValueError('cannot compute accuracy for an empty set of predictions')
    if total != len(actual):
        raise ValueError(
            'predict and actual differ in length: %d != %d' % (total, len(actual))
        )
    # Compare item by item so plain lists work as well as arrays
    # (list == list would give a single bool rather than a per-item mask).
    hits = sum(1 for guess, truth in zip(predict, actual) if guess == truth)
    a = 1.0*hits/total*100
    print('ACC = %.2f'%(a) + '%')
    return round(a,2)

In [44]:
# Feature matrix: every row and every column of the standardised data.
X = data_nor[:, :]
# Logistic-regression classifier fitted to predict the county label from the features.
log_reg = LogisticRegression(max_iter=140)
log_reg.fit(X, actual_label)
# predict_label = log_reg.predict(X)

LogisticRegression(max_iter=140)

In [45]:
# ACC_out(predict_label,actual_label)

In [46]:
# r_num = round(len(data_nor)*0.75)

In [47]:
# test_set = X[:r_num]
# actual_label2 = actual_label[:r_num]

In [48]:
# predict_label2 = log_reg.predict(test_set)

In [49]:
# ACC_out(predict_label2,actual_label2)

In [52]:
import random
def reg_model_test(model,actual,rand_min,rand_max,rand_num,X,seed=None):
    """Report a model's accuracy on several leading slices of ``X``.

    ``rand_num`` distinct sizes are drawn from ``range(rand_min, rand_max)``.
    For each size ``i`` the model predicts ``X[:i]`` and the result is scored
    against ``actual[:i]`` with ``ACC_out``; the mean of those scores is
    printed at the end. Only the slice *size* is random: every slice starts
    at row 0.

    Args:
        model: Fitted estimator exposing ``predict``.
        actual: True labels, aligned with the rows of ``X``.
        rand_min: Smallest slice size that can be drawn (inclusive).
        rand_max: Upper bound on the slice size (exclusive).
        rand_num: Number of distinct sizes to draw.
        X: Feature rows handed to ``model.predict``.
        seed: Optional seed for a reproducible draw. ``None`` (the default)
            draws from the module-level ``random`` state.

    Raises:
        ValueError: If ``rand_num`` is smaller than 1, or (raised by
            ``random.sample``) larger than the number of available sizes.
    """
    if rand_num < 1:
        raise ValueError('rand_num must be at least 1')
    # A private generator is used only when a seed is given, so unseeded calls
    # keep drawing from the shared module-level state.
    rng = random if seed is None else random.Random(seed)
    random_list = rng.sample(range(rand_min,rand_max), rand_num)
    temp_list =[]
    for i in random_list:
        # Score the model that was passed in rather than a notebook-level global.
        predict = model.predict(X[:i])
        print(('Random silce of: {}'.format(i)))
        temp_list.append(ACC_out(predict, actual[:i]))
        print()
    avg = sum(temp_list)/len(temp_list)
    print('number of iteration:' + str(rand_num) + ' ,Average:' + str(round(avg,2)) + '%')
    print()

In [54]:
# Score log_reg on 40 slice sizes drawn from range(50, 90), i.e. every size from 50 to 89 in random order.
# NOTE(review): X and actual_label are the same rows log_reg was fitted on, so these figures are training accuracy.
reg_model_test(log_reg,actual_label,50,90,40,X)

Random silce of: 50
ACC = 72.00%

Random silce of: 56
ACC = 71.43%

Random silce of: 70
ACC = 75.71%

Random silce of: 80
ACC = 78.75%

Random silce of: 52
ACC = 71.15%

Random silce of: 71
ACC = 76.06%

Random silce of: 77
ACC = 77.92%

Random silce of: 61
ACC = 73.77%

Random silce of: 58
ACC = 72.41%

Random silce of: 82
ACC = 79.27%

Random silce of: 72
ACC = 76.39%

Random silce of: 59
ACC = 72.88%

Random silce of: 85
ACC = 78.82%

Random silce of: 88
ACC = 79.55%

Random silce of: 60
ACC = 73.33%

Random silce of: 63
ACC = 74.60%

Random silce of: 84
ACC = 78.57%

Random silce of: 83
ACC = 78.31%

Random silce of: 75
ACC = 77.33%

Random silce of: 55
ACC = 70.91%

Random silce of: 62
ACC = 74.19%

Random silce of: 87
ACC = 79.31%

Random silce of: 66
ACC = 75.76%

Random silce of: 68
ACC = 76.47%

Random silce of: 89
ACC = 79.78%

Random silce of: 86
ACC = 79.07%

Random silce of: 53
ACC = 71.70%

Random silce of: 51
ACC = 70.59%

Random silce of: 81
ACC = 79.01%

Random silce o