In [1]:
import sqlite3
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

dat = sqlite3.connect('FPA_FOD_20170508.sqlite')
query = dat.execute("SELECT * From Fires")
cols = [column[0] for column in query.description]
df = pd.DataFrame(data = query.fetchall(), columns = cols)


In [2]:
df.size

73338135

In [3]:
import datetime

df['DISCOVERY_DATE'] = pd.to_datetime(df['DISCOVERY_DATE'], origin='julian', unit='D')
df['DAY_OF_WEEK'] = df['DISCOVERY_DATE'].dt.weekday
df['MONTH'] = df['DISCOVERY_DATE'].dt.month


In [4]:
data_num = df.select_dtypes(include=[np.number])
data_cat = df.select_dtypes(exclude=[np.number])

num_attribs = list(data_num)
cat_attribs = list(data_cat)
cat_attribs.remove("FIRE_SIZE_CLASS")

print("num", num_attribs)
print("cat", cat_attribs)

num ['OBJECTID', 'FOD_ID', 'FIRE_YEAR', 'DISCOVERY_DOY', 'STAT_CAUSE_CODE', 'CONT_DATE', 'CONT_DOY', 'FIRE_SIZE', 'LATITUDE', 'LONGITUDE', 'OWNER_CODE', 'DAY_OF_WEEK', 'MONTH']
cat ['FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID', 'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME', 'ICS_209_INCIDENT_NUMBER', 'ICS_209_NAME', 'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME', 'DISCOVERY_DATE', 'DISCOVERY_TIME', 'STAT_CAUSE_DESCR', 'CONT_TIME', 'OWNER_DESCR', 'STATE', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME', 'Shape']


In [5]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df['STATE'].values.reshape(-1,1))
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(['STATE']))

df = df.drop(columns=cat_attribs)
df = df.drop(columns=['OBJECTID', 'FOD_ID', 'FIRE_YEAR', 'CONT_DATE', 'CONT_DOY'])

df = pd.concat([df, one_hot_df], axis=1)

# corr_matrix = df.corr(method='spearman')
# sorted_corr_matrix = corr_matrix['FIRE_SIZE'].sort_values(ascending=False)
# print(sorted_corr_matrix)
# top_attributes = sorted_corr_matrix.index.tolist()

# for attribute in top_attributes:
#     df_corr.plot(kind="scatter", x=attribute, y="FIRE_SIZE", alpha=0.1)

# plt.show()


In [6]:
from sklearn.preprocessing import StandardScaler

#drops = ['FIRE_SIZE', 'FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID', 'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME', 'ICS_209_INCIDENT_NUMBER', 'ICS_209_NAME', 'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME', 'DISCOVERY_TIME', 'STAT_CAUSE_DESCR', 'CONT_TIME', 'OWNER_DESCR', 'STATE', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME', 'Shape', 'OBJECTID', 'FOD_ID', 'FIRE_YEAR', 'DISCOVERY_DATE', 'CONT_DATE', 'CONT_DOY']
drops = ['FIRE_SIZE']
testing_df = df.drop(columns=drops, axis=1)
testing_df.dropna()

cols = (testing_df.columns)
columns = []

for col in cols:
    columns.append(col)
    
columns.remove("FIRE_SIZE_CLASS")

for col in columns:
    testing_df[col] = StandardScaler().fit_transform(testing_df[[col]])

testing_df.size
testing_df.select_dtypes(include=[np.number])

Unnamed: 0,DISCOVERY_DOY,STAT_CAUSE_CODE,LATITUDE,LONGITUDE,OWNER_CODE,DAY_OF_WEEK,MONTH,STATE_AK,STATE_AL,STATE_AR,...,STATE_SD,STATE_TN,STATE_TX,STATE_UT,STATE_VA,STATE_VT,STATE_WA,STATE_WI,STATE_WV,STATE_WY
0,-1.462914,0.867131,0.530333,-1.513488,-1.270604,-0.517374,-1.330031,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
1,-0.352283,-1.429173,0.350518,-1.477513,-1.270604,-0.517374,-0.314768,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
2,-0.141263,-0.281021,0.358844,-1.497320,0.545654,-1.506342,-0.314768,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
3,0.169714,-1.429173,0.289615,-1.448135,-1.270604,-1.506342,0.023653,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
4,0.169714,-1.429173,0.289615,-1.449315,-1.270604,-1.506342,0.023653,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880460,1.158176,2.015283,0.602770,-1.596251,0.545654,0.966079,1.038916,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
1880461,1.258133,0.867131,0.136244,-1.509465,0.318622,-1.506342,1.377337,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
1880462,-0.474452,2.015283,0.136244,-1.509465,0.318622,0.966079,-0.314768,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123
1880463,1.358089,2.015283,0.145141,-1.507059,0.318622,-0.517374,1.377337,-0.082926,-0.191573,-0.130867,...,-0.129388,-0.129793,-0.285822,-0.128882,-0.108383,-0.015574,-0.134703,-0.131283,-0.108719,-0.087123


In [7]:
from sklearn.model_selection import train_test_split

X = testing_df.drop(columns=['FIRE_SIZE_CLASS']).values
y = testing_df['FIRE_SIZE_CLASS'].values


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.025, random_state=42)

In [8]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Logistic Regression Accuracy:", accuracy)

Logistic Regression Accuracy: 0.598762018208117


In [9]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Decision Tree Accuracy:", accuracy)

Decision Tree Accuracy: 0.5474134263592274


In [10]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=100, weights='distance') 

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("KNN Accuracy:", accuracy)

KNN Accuracy: 0.6321152046286055


In [11]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=20)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Random Forest Accuracy:", accuracy)

Random Forest Accuracy: 0.6387730792138178


In [10]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(random_state=42, alpha = 0.000001)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Neural Network Accuracy:", accuracy)

Neural Network Accuracy: 0.6265208882838424
