In [1]:
import pandas as pd
import sqlite3
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn import preprocessing
%matplotlib inline

In [2]:
# reading date from sqlite file
data = sqlite3.connect("./data/FPA_FOD_20170508.sqlite")
test = pd.read_sql_query("SELECT * FROM 'fires'",data)
data = pd.read_sql_query("SELECT FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE,FIPS_NAME FROM 'Fires'", data)

In [3]:
wf = data.copy()

In [4]:
# convert dates from julian format to datetime64
wf['DISCOVERY_DATE'] = pd.to_datetime(wf['DISCOVERY_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
wf['DISCOVERY_DATE']

0         2005-02-02
1         2004-05-12
2         2004-05-31
3         2004-06-28
4         2004-06-28
             ...    
1880460   2015-09-26
1880461   2015-10-05
1880462   2015-05-02
1880463   2015-10-14
1880464   2015-03-14
Name: DISCOVERY_DATE, Length: 1880465, dtype: datetime64[ns]

In [5]:
# create month and day columns
wf['MONTH'] = wf['DISCOVERY_DATE'].dt.month
wf['DAY'] = wf['DISCOVERY_DATE'].dt.day
wf.head()

Unnamed: 0,FIRE_YEAR,STAT_CAUSE_DESCR,LATITUDE,LONGITUDE,STATE,DISCOVERY_DATE,FIRE_SIZE,FIPS_NAME,MONTH,DAY
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [6]:
len(wf)

1880465

In [7]:
# changing column names to lower case
wf.columns= wf.columns.str.lower()
wf = wf.rename(columns={"fire_year":"year","fips_name":"county","stat_cause_descr":"cause"})
wf.head()

Unnamed: 0,year,cause,latitude,longitude,state,discovery_date,fire_size,county,month,day
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [8]:
# wf.to_csv('./data/wildfre_cleaned.csv',index=False)

In [9]:
wf.describe()

Unnamed: 0,year,latitude,longitude,fire_size,month,day
count,1880465.0,1880465.0,1880465.0,1880465.0,1880465.0,1880465.0
mean,2003.71,36.78121,-95.70494,74.52016,5.930107,15.51281
std,6.663099,6.139031,16.71694,2497.598,2.9549,8.79852
min,1992.0,17.93972,-178.8026,1e-05,1.0,1.0
25%,1998.0,32.8186,-110.3635,0.1,3.0,8.0
50%,2004.0,35.4525,-92.04304,1.0,6.0,15.0
75%,2009.0,40.8272,-82.2976,3.3,8.0,23.0
max,2015.0,70.3306,-65.25694,606945.0,12.0,31.0


In [10]:
wf = wf[wf['fire_size'] < 35]

In [11]:
len(wf)

1763200

In [12]:
wf.isna().sum()

year                   0
cause                  0
latitude               0
longitude              0
state                  0
discovery_date         0
fire_size              0
county            635061
month                  0
day                    0
dtype: int64

In [13]:
wf.shape

(1763200, 10)

In [14]:
# create a copy for later use
wf_raw = wf.copy()

In [15]:
wf = wf.dropna().copy()

In [16]:
wf.shape

(1128139, 10)

In [17]:
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [18]:
len(states)

51

In [19]:
# filter the states that are not on Continental United States
wf = wf[wf.state.isin(states)]
slist = wf.state.to_list()
set(states).difference(slist)

{'DC'}

In [20]:
# geo mapping for wildfires

#fire_map = gpd.read_file('./data/tl_2019_us_state.shp')
# fire_map = fire_map[fire_map.STUSPS.isin(states)]
# wf=wf[wf['year'] == 2015]
# geometry = [Point(xy) for xy in zip(wf['longitude'],wf['latitude'])]
# geometry[:3]
# crs = {'init':'epsg:4326'}
# geo_df = gpd.GeoDataFrame(wf,crs=crs,geometry=geometry)
# geo_df.head()
# fig,ax = plt.subplots(figsize=(40,40))
# fire_map.plot(ax=ax,color='grey')
# geo_df.plot(ax=ax,markersize = 0.1, color='red',marker='o')
# geo_df.state.unique()

In [21]:
wf_c = wf.copy()
wf_c = wf_c[['latitude','longitude','county']]
wf_c.head()

Unnamed: 0,latitude,longitude,county
0,40.036944,-121.005833,Plumas
1,38.933056,-120.404444,Placer
2,38.984167,-120.735556,El Dorado
3,38.559167,-119.913333,Alpine
4,38.559167,-119.933056,Alpine


In [22]:
wf_c.county.unique()

array(['Plumas', 'Placer', 'El Dorado', ..., "O'Brien", 'Hodgeman',
       'Kauai'], dtype=object)

In [23]:
wf_causes = wf.copy()

In [24]:
wf_causes.head()

Unnamed: 0,year,cause,latitude,longitude,state,discovery_date,fire_size,county,month,day
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [25]:
wf_causes.shape

(1128070, 10)

In [26]:
wf_causes = wf_causes[['county','cause','state']].groupby(['county','cause','state']).size().reset_index()
wf_causes.rename(columns={0:'count'},inplace=True)
wf_causes

Unnamed: 0,county,cause,state,count
0,Abbeville,Arson,SC,178
1,Abbeville,Campfire,SC,9
2,Abbeville,Children,SC,16
3,Abbeville,Debris Burning,SC,181
4,Abbeville,Equipment Use,SC,64
...,...,...,...,...
22209,Ziebach,Arson,SD,1
22210,Ziebach,Campfire,SD,1
22211,Ziebach,Equipment Use,SD,2
22212,Ziebach,Miscellaneous,SD,16


In [27]:
len(wf_causes.county.unique())

1665

In [28]:
wf_causes=pd.pivot_table(wf_causes,index=['county','state'],columns='cause',values='count',fill_value=0)

In [29]:
wf_causes

Unnamed: 0_level_0,cause,Arson,Campfire,Children,Debris Burning,Equipment Use,Fireworks,Lightning,Miscellaneous,Missing/Undefined,Powerline,Railroad,Smoking,Structure
county,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Abbeville,SC,178,9,16,181,64,2,17,58,306,8,4,26,3
Acadia,LA,3,0,0,0,0,0,0,0,2,0,0,0,0
Accomack,VA,63,0,2,28,10,0,5,6,3,0,1,2,3
Ada,ID,353,23,4,7,303,20,227,127,69,12,11,3,0
Adair,IA,0,0,1,3,0,0,0,0,37,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yuma,AZ,5,63,1,15,16,2,16,133,41,0,1,2,0
Yuma,CO,0,0,0,1,3,1,6,32,3,0,0,3,0
Zapata,TX,1,1,8,19,23,0,0,107,0,5,0,1,0
Zavala,TX,12,0,13,41,7,0,0,55,0,2,0,5,0


In [30]:
#Normalize the data
total_causes = wf_causes.sum(axis=1)
wf_nor=wf_causes.div(wf_causes.sum(axis=1), axis=0)

In [31]:
wf_nor=wf_nor.loc[total_causes>100]
wf_nor

Unnamed: 0_level_0,cause,Arson,Campfire,Children,Debris Burning,Equipment Use,Fireworks,Lightning,Miscellaneous,Missing/Undefined,Powerline,Railroad,Smoking,Structure
county,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Abbeville,SC,0.204128,0.010321,0.018349,0.207569,0.073394,0.002294,0.019495,0.066514,0.350917,0.009174,0.004587,0.029817,0.00344
Accomack,VA,0.512195,0.000000,0.016260,0.227642,0.081301,0.000000,0.040650,0.048780,0.024390,0.000000,0.008130,0.016260,0.02439
Ada,ID,0.304573,0.019845,0.003451,0.006040,0.261432,0.017256,0.195858,0.109577,0.059534,0.010354,0.009491,0.002588,0.00000
Adair,OK,0.295009,0.000891,0.001337,0.099822,0.007576,0.000000,0.000000,0.027184,0.567291,0.000000,0.000891,0.000000,0.00000
Adams,CO,0.032193,0.008048,0.002012,0.026157,0.034205,0.022133,0.028169,0.814889,0.006036,0.000000,0.000000,0.026157,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young,TX,0.018106,0.005571,0.009749,0.142061,0.313370,0.000000,0.038997,0.360724,0.000000,0.090529,0.000000,0.020891,0.00000
Yuba,CA,0.089866,0.013384,0.022945,0.139579,0.152964,0.000000,0.036329,0.242830,0.282983,0.003824,0.000000,0.015296,0.00000
Yuma,AZ,0.016949,0.213559,0.003390,0.050847,0.054237,0.006780,0.054237,0.450847,0.138983,0.000000,0.003390,0.006780,0.00000
Zapata,TX,0.006061,0.006061,0.048485,0.115152,0.139394,0.000000,0.000000,0.648485,0.000000,0.030303,0.000000,0.006061,0.00000


In [32]:
scaler = preprocessing.StandardScaler().fit(wf_nor)
data_nor=scaler.transform(wf_nor)
data_nor

array([[ 0.53997936, -0.53150715, -0.22226852, ..., -0.35032516,
         0.05525437,  0.10320077],
       [ 2.79140115, -0.69006763, -0.28182992, ..., -0.2699669 ,
        -0.42733264,  3.12687242],
       [ 1.27405051, -0.38519859, -0.64713079, ..., -0.2391008 ,
        -0.91402676, -0.39334344],
       ...,
       [-0.82796971,  2.59079047, -0.64888246, ..., -0.37748209,
        -0.76482514, -0.39334344],
       [-0.9075457 , -0.59696006,  0.63719474, ..., -0.45436796,
        -0.79042248, -0.39334344],
       [-0.30221765, -0.69006763,  2.00074255, ..., -0.45436796,
         0.31229483, -0.39334344]])

In [33]:
data_nor.shape

(1641, 13)

In [34]:
tup_list=wf_nor.index

In [35]:
len(tup_list)

1641

In [36]:
actual_label=[]
for x in range(len(tup_list)):
    temp = tup_list[x][0]
    actual_label.append(temp)

In [37]:
def ACC_out(predict,actual):
    a = 1.0*sum(predict==actual)/len(predict)*100
    print('ACC = %.2f'%(a) + '%')
    return round(a,2)

In [38]:
log_reg = LogisticRegression(max_iter = 140)
X = data_nor[:,:]
log_reg.fit(X,actual_label)
# predict_label = log_reg.predict(X)

LogisticRegression(max_iter=140)

In [39]:
# ACC_out(predict_label,actual_label)

In [40]:
# r_num = round(len(data_nor)*0.75)

In [41]:
# test_set = X[:r_num]
# actual_label2 = actual_label[:r_num]

In [42]:
# predict_label2 = log_reg.predict(test_set)

In [43]:
# ACC_out(predict_label2,actual_label2)

In [44]:
import random
def reg_model_test(model,actual,rand_min,rand_max,rand_num,X):
    random_list = random.sample(range(rand_min,rand_max), rand_num)
    temp_num = 0
    temp_list =[]
    for i in random_list:
        actual_temp=[]
        test_set = X[:i]
        actual_temp = actual[:i]
        predict = log_reg.predict(test_set)
        print(predict)
        print(actual_temp)
        print(('Random silce of: {}'.format(i)))
        c = ACC_out(predict,actual_temp)
        temp_list.append(c)
        print()
    avg = sum(temp_list)/len(temp_list)
    print('number of iteration:' + str(rand_num) + ' ,Average:' + str(round(avg,2)) + '%')
    print()

In [45]:
reg_model_test(log_reg,actual_label,50,90,1,X)

['Washington' 'Accomack' 'Ada' 'Adair' 'Adams' 'Lincoln' 'Washington'
 'Lincoln' 'Lee' 'Jackson' 'Alachua' 'Cherokee' 'Alameda' 'Jefferson'
 'Lincoln' 'Albemarle' 'Lincoln' 'Scott' 'Cherokee' 'Monroe' 'Washington'
 'Franklin' 'Johnson' 'Oneida' 'Marion' 'Washington' 'Washington'
 'Garfield' 'Dallas' 'Franklin' 'Amherst' 'Wayne' 'Franklin' 'Cumberland'
 'Washington' 'Andrews' 'York' 'Clay' 'Jackson' 'Franklin' 'Washington'
 'Custer' 'Appling' 'Lee' 'Franklin' 'Arapahoe' 'Archer' 'Park' 'Monroe'
 'Armstrong' 'Washington' 'Johnson' 'Ashland' 'Polk' 'Asotin' 'Madison'
 'Coffee' 'Atlantic' 'McCurtain' 'Marion' 'Jefferson' 'Austin'
 'Washington' 'Cherokee' 'Baca' 'Polk' 'Bailey' 'Dixie' 'Newton' 'Lincoln'
 'Lawrence' 'Monroe' 'Lewis' 'Washington' 'Bannock' 'Pike' 'Taylor'
 'Barnstable' 'Franklin' 'Jackson' 'Lee' 'Calhoun' 'Washington' 'Marion'
 'Franklin']
['Abbeville', 'Accomack', 'Ada', 'Adair', 'Adams', 'Adams', 'Adams', 'Adams', 'Aiken', 'Aitkin', 'Alachua', 'Alamance', 'Alameda', 'Alban

In [46]:
wf.head()

Unnamed: 0,year,cause,latitude,longitude,state,discovery_date,fire_size,county,month,day
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [47]:
wf_lr = wf[['county','latitude','longitude']].copy()
wf_lr.head()

Unnamed: 0,county,latitude,longitude
0,Plumas,40.036944,-121.005833
1,Placer,38.933056,-120.404444
2,El Dorado,38.984167,-120.735556
3,Alpine,38.559167,-119.913333
4,Alpine,38.559167,-119.933056


In [48]:
act_la = wf_lr.county.to_list()
act_la[:10]

['Plumas',
 'Placer',
 'El Dorado',
 'Alpine',
 'Alpine',
 'Amador',
 'El Dorado',
 'Amador',
 'El Dorado',
 'Amador']

In [49]:
data_lr = wf_lr.drop('county',axis=1).copy()
data_lr.head()

Unnamed: 0,latitude,longitude
0,40.036944,-121.005833
1,38.933056,-120.404444
2,38.984167,-120.735556
3,38.559167,-119.913333
4,38.559167,-119.933056


In [50]:
scaler = preprocessing.StandardScaler().fit(data_lr)
data_lr=scaler.transform(data_lr)
data_lr

array([[ 0.63852347, -1.83067502],
       [ 0.44091755, -1.79203627],
       [ 0.45006689, -1.81330989],
       ...,
       [-0.45661485, -1.56578458],
       [ 0.11473855, -1.72254907],
       [ 0.5184052 , -1.9738236 ]])

In [None]:
log_reg = LogisticRegression()
X = data_lr[:,:]
log_reg.fit(X,act_la)
predict = log_reg.predict(X)

In [179]:
predict

array(['Lightning', 'Lightning', 'Lightning', ..., 'Lightning',
       'Lightning', 'Lightning'], dtype='<U17')

In [123]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
df = wf.copy()
df.head()

Unnamed: 0,year,cause,latitude,longitude,state,discovery_date,fire_size,county,month,day
0,2005,Miscellaneous,40.036944,-121.005833,CA,2005-02-02,0.1,Plumas,2,2
1,2004,Lightning,38.933056,-120.404444,CA,2004-05-12,0.25,Placer,5,12
2,2004,Debris Burning,38.984167,-120.735556,CA,2004-05-31,0.1,El Dorado,5,31
3,2004,Lightning,38.559167,-119.913333,CA,2004-06-28,0.1,Alpine,6,28
4,2004,Lightning,38.559167,-119.933056,CA,2004-06-28,0.1,Alpine,6,28


In [125]:
mapping = {}
df = df[['latitude','longitude','fire_size','cause']].copy()

In [126]:
Y = df[['cause']].copy()
data = df.drop('cause', axis=1)
# la_state = data['state'].unique().tolist()
# mapping = dict( zip(la_state,range(len(la_state))) )
# data.replace({'state': mapping},inplace=True)
data.head()

Unnamed: 0,latitude,longitude,fire_size
0,40.036944,-121.005833,0.1
1,38.933056,-120.404444,0.25
2,38.984167,-120.735556,0.1
3,38.559167,-119.913333,0.1
4,38.559167,-119.933056,0.1


In [127]:
la_cause = Y['cause'].unique().tolist()
mapping = dict( zip(la_cause,range(len(la_cause))) )
Y.replace({'cause': mapping},inplace=True)
Y.head()

Unnamed: 0,cause
0,0
1,1
2,2
3,1
4,1


In [128]:
scaler = preprocessing.StandardScaler().fit(data)
data=scaler.transform(data)

In [129]:
X = data
model = LinearRegression()
model.fit(X,Y)

LinearRegression()

In [130]:
actual = Y.cause.to_list()
actual

[0,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 3,
 1,
 1,
 4,
 4,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 2,
 2,
 1,
 3,
 1,
 1,
 1,
 3,
 4,
 5,
 5,
 2,
 2,
 2,
 2,
 3,
 1,
 0,
 0,
 3,
 2,
 1,
 0,
 5,
 0,
 1,
 1,
 1,
 2,
 3,
 1,
 3,
 1,
 2,
 0,
 3,
 3,
 1,
 1,
 2,
 3,
 0,
 2,
 0,
 3,
 0,
 1,
 4,
 3,
 3,
 3,
 4,
 0,
 1,
 1,
 1,
 3,
 1,
 1,
 5,
 2,
 5,
 1,
 1,
 6,
 3,
 2,
 7,
 3,
 1,
 1,
 1,
 1,
 1,
 6,
 3,
 1,
 1,
 3,
 3,
 1,
 1,
 1,
 0,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 3,
 3,
 0,
 3,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 1,
 3,
 1,
 0,
 3,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 6,
 1,
 3,
 8,
 0,
 3,
 1,
 0,
 0,
 2,
 1,
 2,
 7,
 5,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 5,
 5,
 6,
 5,
 5,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 3,
 1,
 5,
 3,
 3,
 3,
 5,
 3,
 1,
 5,
 1,
 5,
 1,
 2,
 1,
 5,
 1,
 1,
 1,
 3,
 0,
 0,
 5,
 5,
 3,
 1,
 3,
 8,
 3,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 3,


In [131]:
predict = model.predict(X)
1.0*sum(predict==actual)/len(predict)*100
# predict[:20]

array([[3.11514411],
       [3.18046816],
       [3.17702669],
       [3.20299295],
       [3.20296572],
       [3.19828337],
       [3.1951149 ],
       [3.20330027],
       [3.19491133],
       [3.20457692],
       [3.18931447],
       [3.20944546],
       [3.19566943],
       [3.20183248],
       [2.98922728],
       [3.09100092],
       [3.17976225],
       [3.52163249],
       [3.52956578],
       [3.52135593]])