In [1]:
import pandas as pd
import numpy as np
from itertools import product

In [2]:
# Load the four fitted risk models (used below through feature_names_in_ and predict_proba).
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
high_risk_model, med_risk_model, low_risk_model, any_risk_model = map(
    pd.read_pickle,
    (
        'high_risk_logistic.pkl',
        'med_risk_logistic.pkl',
        'low_risk_logistic.pkl',
        'any_risk_logistic.pkl',
    ),
)

In [3]:
# Load the wavelet feature table; its MODZCTA column supplies the zipcode list
# and is the key used to merge these features onto the prediction grid.
wavelet = pd.read_csv('wavelet_features.csv')

In [4]:
# Build the prediction grid: one row per (zipcode, month, day of month, hour)
zipcodes = wavelet.MODZCTA.unique()
months = list(range(1, 13))
dates = list(range(1, 32))
hours = list(range(24))
grid_columns = ['MODZCTA', 'Month', 'Date', 'Hour']
df_all = pd.DataFrame(list(product(zipcodes, months, dates, hours)), columns=grid_columns)

# Remove calendar days that do not exist:
#   - 2/29, 2/30, 2/31 (Feb 29 is removed too, so leap days are never predicted)
#   - 4/31, 6/31, 9/31, 11/31
feb_overflow = (df_all.Month == 2) & (df_all.Date >= 29)
thirty_day_overflow = df_all.Month.isin([4, 6, 9, 11]) & (df_all.Date == 31)
df_all = df_all[~(feb_overflow | thirty_day_overflow)]

# Bucket the hours into 4-hour groups labelled by their first hour (0, 4, 8, 12, 16, 20).
# Rows are not de-duplicated, so every bucket is still present once per original hour.
hour_increment = 4
df_all = (
    df_all
    .assign(Hour_group=(df_all['Hour'] // hour_increment) * hour_increment)
    .drop(columns='Hour')
)

# One-hot encode the calendar features (creates Month_*, Date_* and Hour_group_* columns)
df_all = pd.get_dummies(df_all, columns=['Month', 'Date', 'Hour_group'])

# Attach the wavelet features to every grid row of the matching zipcode
df_all = df_all.merge(wavelet, on='MODZCTA')

# Keep the zipcode of each row for later use; it is not a model feature, so drop it
MODZCTA = df_all['MODZCTA']
df_all = df_all.drop(columns='MODZCTA')

# Put the columns in the order the models were fitted with
df_all = df_all[high_risk_model.feature_names_in_]

In [5]:
# Score every grid row with each already-fitted model (nothing is fitted here).
# Column 1 of predict_proba is the probability of the second class in model.classes_
# - presumably the "incident occurred" class; TODO confirm against the training notebook.
num_high_risk = high_risk_model.predict_proba(df_all)[:, 1]
num_med_risk = med_risk_model.predict_proba(df_all)[:, 1]
num_low_risk = low_risk_model.predict_proba(df_all)[:, 1]
num_any_risk = any_risk_model.predict_proba(df_all)[:, 1]

# Names of the one-hot calendar columns: Month_1..12, Date_1..31, Hour_group_0/4/../20
dummies = (
    [f'Month_{m}' for m in range(1, 13)]
    + [f'Date_{d}' for d in range(1, 32)]
    + [f'Hour_group_{h}' for h in range(0, 24, 4)]
)

# Collapse the dummies back into one column per feature. With sep='_' the column name is
# split at the first underscore, so 'Hour_group_8' ends up in column 'Hour' as 'group_8'.
date_time = pd.from_dummies(df_all[dummies], sep='_')

# Assemble the row-level result directly from the pieces that are needed (zipcode, decoded
# calendar fields, predictions) instead of copying the whole feature matrix and then
# dropping almost all of it again.
df_result = pd.concat([MODZCTA, date_time], axis=1).assign(
    Month=lambda d: d['Month'].astype('int64'),
    Date=lambda d: d['Date'].astype('int64'),
    # Strip the leftover 'group_' prefix before converting the hour label to a number
    Hour=lambda d: d['Hour'].str.replace('group_', '', regex=False).astype('int64'),
    num_high_risk=num_high_risk,
    num_med_risk=num_med_risk,
    num_low_risk=num_low_risk,
    num_any_risk=num_any_risk,
)

# Reorder columns
col_order = ['MODZCTA', 'Month', 'Date', 'Hour', 'num_high_risk', 'num_med_risk', 'num_low_risk', 'num_any_risk']
df_result = df_result[col_order]

# Aggregate to one row per zipcode and calendar day by summing the predicted probabilities.
# Only the prediction columns are summed; Hour is not part of the daily output.
# NOTE(review): the grid holds one row per original hour, so each 4-hour bucket is counted
# four times in the daily sum - confirm this matches how the models were trained.
prediction_cols = ['num_high_risk', 'num_med_risk', 'num_low_risk', 'num_any_risk']
df_result = df_result.groupby(['MODZCTA', 'Month', 'Date'], as_index=False)[prediction_cols].sum()

# Output to csv
df_result.to_csv('fire_incident_prediction_output.csv', index=False)