# __Fire Causes__
#### _Bureau of Fire Investigations_
<div>
<img src="https://upload.wikimedia.org/wikipedia/commons/2/25/Emblem_of_the_New_York_City_Fire_Department.svg" width="120"/>
</div>

<hr>

Source: _<a href="https://data.cityofnewyork.us/Public-Safety/Bureau-of-Fire-Investigations-Fire-Causes/ii3r-svjz/about_data">NYC Open Data</a>_ <br>
Last updated: _9 May 2024_ <br>
Accessed: _3 June 2024_

<hr>

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



<hr>

### Exploration

In [2]:
# Location of the CSV export downloaded from NYC Open Data
path = (
    'datasets/001. Bureau of Fire Investigations - Fire Causes/'
    'Bureau_of_Fire_Investigations_-_Fire_Causes_20240603.csv'
)

In [3]:
# First look at the raw export, still under its original column headers
df = pd.read_csv(filepath_or_buffer = path)
df.head(n = 5)

Unnamed: 0,Case_Year,Case Number,Incident_DateTime,Borough,Battalion,Community_District,Precinct,Incident_Classification,Cause_Fire_Description,Fire_Code_Category
0,2016,40001,01/01/2016 12:08:00 AM,Queens,50,8,107,MD Residential Structure,Smoking (Cigarette/Cigar),Smoking
1,2016,50001,01/01/2016 12:13:00 AM,Staten Island,21,2,122,PD Residential Structure,Incendiary - Combustible Material,Incendiary
2,2016,40002,01/01/2016 12:21:00 AM,Queens,37,5,104,MD Residential Structure,Incendiary - Combustible Material,Incendiary
3,2016,60001,01/01/2016 12:29:00 AM,Bronx,15,12,47,Burn Notification (Albany),Albany Burn Notif. Unrelated to a Fire/Explosion,Other
4,2016,40003,01/01/2016 12:32:00 AM,Queens,50,12,103,PD Residential Structure,Incendiary - Combustible Material,Incendiary


In [4]:
# Human-readable replacements for the CSV headers, listed in the same order
# as the columns appear in the file
colnames = [
    'Year',
    'ID',
    'Date',
    'Borough',
    'Fire battalion',
    'Community district',
    'Police precinct',
    'Incident type',
    'Fire cause',
    'Fire cause category',
]

# Explicit column datatypes (currently disabled).
# NOTE(review): presumably left out because the battalion / district / precinct
# columns contain missing values, which a plain 'int64' dtype cannot hold;
# confirm before re-enabling.
#coldtypes = {'Year': 'int64', 'ID': 'int64', 'Date': 'str', 'Borough': 'str', 
#             'Fire battalion': 'int64', 'Community district': 'int64', 
#             'Police precinct': 'int64', 'Incident type': 'str', 
#             'Fire cause': 'str', 'Fire cause category': 'str'}

In [5]:
# Re-read the file with the new column names; header = 0 makes pandas discard
# the original header row instead of treating it as data
df = pd.read_csv(filepath_or_buffer = path, header = 0, names = colnames)
df.head(n = 5)

Unnamed: 0,Year,ID,Date,Borough,Fire battalion,Community district,Police precinct,Incident type,Fire cause,Fire cause category
0,2016,40001,01/01/2016 12:08:00 AM,Queens,50,8,107,MD Residential Structure,Smoking (Cigarette/Cigar),Smoking
1,2016,50001,01/01/2016 12:13:00 AM,Staten Island,21,2,122,PD Residential Structure,Incendiary - Combustible Material,Incendiary
2,2016,40002,01/01/2016 12:21:00 AM,Queens,37,5,104,MD Residential Structure,Incendiary - Combustible Material,Incendiary
3,2016,60001,01/01/2016 12:29:00 AM,Bronx,15,12,47,Burn Notification (Albany),Albany Burn Notif. Unrelated to a Fire/Explosion,Other
4,2016,40003,01/01/2016 12:32:00 AM,Queens,50,12,103,PD Residential Structure,Incendiary - Combustible Material,Incendiary


In [6]:
# #, dtype = coldtypes)#, parse_dates=['Incident_DateTime'])

In [19]:
# Helper function
def dataset_check(dataset, stats_check = False, corr_check = False):
    """Print a quick health check of a DataFrame.

    Always printed: the number of columns and rows, the column dtypes, the
    columns that contain missing values (as counts and as a percentage of all
    rows) and the number of unique values per column. The number of fully
    duplicated rows is printed only when there is at least one.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The frame to inspect. It is only read, never modified.
    stats_check : bool, default False
        Also print ``dataset.describe()``.
    corr_check : bool, default False
        Also print the correlation matrix of the numeric and boolean columns.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Calculate
    rows, columns = dataset.shape
    duplicate_rows = dataset.duplicated().sum()
    dtypes = dataset.dtypes
    missing = dataset.isna().sum()
    missing = missing[missing > 0]
    # Derive the percentages from the already filtered counts: a column with
    # very few missing values then still shows up (as 0.00%) instead of
    # vanishing after rounding. With zero rows `missing` is empty, so nothing
    # is divided by zero.
    missing_percentage = (missing / rows * 100).round(2)
    unique = dataset.nunique()
    stats = dataset.describe() if stats_check else None
    # Restrict the correlation to numeric/boolean columns: since pandas 2.0
    # DataFrame.corr() raises on text columns instead of silently dropping them.
    corr = (
        dataset.select_dtypes(include = ['number', 'bool']).corr()
        if corr_check else None
    )

    # Print
    # Thousands are grouped with ',' by the format spec and then switched to spaces
    print(f'COLUMNS:\n{columns:,}\n'.replace(',', ' '))
    print(f'ROWS:\n{rows:,}\n'.replace(',', ' '))
    if duplicate_rows > 0:
        print(f'DUPLICATE ROWS:\n{duplicate_rows}\n')
    print('DATATYPES:')
    print(dtypes, '\n')
    print('MISSING VALUES:')
    print(missing, '\n')
    print('MISSING VALUES IN %:')
    print(missing_percentage.apply(lambda x: f'{x:.2f}%'), '\n')
    print('UNIQUE VALUES:')
    print(unique, '\n')

    # Conditional part
    if stats_check:
        print('BASIC STATISTICS:')
        print(stats, '\n')
    if corr_check:
        print('CORRELATION MATRIX:')
        print(corr, '\n')

In [20]:
# Basic structural check only: no summary statistics, no correlation matrix
dataset_check(df, stats_check = False, corr_check = False)

COLUMNS:
10

ROWS:
53 741

DATATYPES:
Year                    int64
ID                      int64
Date                   object
Borough                object
Fire battalion         object
Community district     object
Police precinct        object
Incident type          object
Fire cause             object
Fire cause category    object
dtype: object 

MISSING VALUES:
Borough                 260
Fire battalion         7658
Community district      338
Police precinct         344
Incident type           827
Fire cause               73
Fire cause category      61
dtype: int64 

MISSING VALUES IN %:
Borough                 0.48%
Fire battalion         14.25%
Community district      0.63%
Police precinct         0.64%
Incident type           1.54%
Fire cause              0.14%
Fire cause category     0.11%
dtype: object 

UNIQUE VALUES:
Year                       9
ID                      9543
Date                   52343
Borough                    6
Fire battalion            57
Community di

Let’s check what we have:
- <b>Duplicates.</b> There are no duplicate rows, which is good.
- <b>Datatypes.</b> Columns <code>Date</code>, <code>Fire battalion</code>, <code>Community district</code>, and <code>Police precinct</code> are read as <code>object</code> columns even though they contain dates and numbers. This should be checked.
- <b>Missing values.</b> We have relatively many missing values in column <code>Incident type</code> (1.54%) and _a lot_ of missing values in <code>Fire battalion</code> (14.25%). This is very strange — how come _so many_ incidents do not have an assigned battalion? Could this be because the fire department did not respond to every incident?
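- <b>Unique values.</b> We have 53 741 rows, but the <code>ID</code> column has only 9 543 unique values, so it does not uniquely identify an incident on its own. That means that the column is of no use as an identifier and can be dropped. Also, the <code>Borough</code> column has 6 unique values even though New York has only 5 boroughs, as can be seen in the image below.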

<div>
<img src="https://www.loumovesyou.com/wp-content/uploads/2022/11/FiveBoroughs-01.jpg" width="600"/>
</div>

In [None]:
# Incidents that have no fire battalion recorded
df.loc[df['Fire battalion'].isna()]

In [None]:
# Distinct battalion labels (the column was read as object dtype)
df.loc[:, 'Fire battalion'].unique()

In [None]:
# Incident counts per fire battalion (rows) and borough (columns),
# with a 'Total' margin row and column
pivot1 = df.pivot_table(
    values = 'ID',
    index = 'Fire battalion',
    columns = 'Borough',
    aggfunc = 'count',
    fill_value = 0,
    margins = True,
    margins_name = 'Total',
)

# Busiest battalions first
pivot1.sort_values(by = 'Total', ascending = False)

In [None]:
# Incident counts per borough. margins = True appends a grand-total row,
# labelled with pandas' default margin name because margins_name is not set.
pivot2 = df.pivot_table(values = 'ID', index = 'Borough', aggfunc = 'count', margins = True)

# Largest counts first
pivot2.sort_values('ID', ascending = False)

In [None]:
# Rows whose borough is the sixth, non-borough label 'Outside of NYC'
outsideofNYC = df.loc[df['Borough'].eq('Outside of NYC')]
outsideofNYC.head(n = 20)

In [None]:
# Sanity check of the filter: no row with any other borough label should remain,
# so this is expected to display an empty frame
outsideofNYC.loc[outsideofNYC['Borough'].ne('Outside of NYC')]

In [None]:
# Fire cause categories recorded for the 'Outside of NYC' incidents
# (this identical cell was repeated five times in a row; one copy is enough)
outsideofNYC['Fire cause category'].unique()

In [None]:
# NOTE(review): hard-coded figures. 53481 equals the 53 741 rows minus the 260 rows
# with a missing borough reported by dataset_check(df); 77 is presumably the number
# of 'Outside of NYC' incidents — TODO confirm and derive both numbers from df.
77/53481*100

In [None]:
# Heatmap of incident counts per fire battalion and borough.
# The 'Total' margin row and column are dropped first: they are sums of all other
# cells, so leaving them in stretches the colour scale and flattens every real cell.
sns.heatmap(pivot1.drop(index = 'Total', columns = 'Total', errors = 'ignore'), cmap = 'RdYlGn');

In [None]:
# Incident counts per police precinct (rows) and borough (columns),
# with a 'Total' margin row and column
pivot3 = df.pivot_table(
    values = 'ID',
    index = 'Police precinct',
    columns = 'Borough',
    aggfunc = 'count',
    fill_value = 0,
    margins = True,
    margins_name = 'Total',
)

# Precincts ordered by total incidents, then the labels of those with at most 300
pivot3b = pivot3.sort_values(by = 'Total', ascending = False)
pivot3b.loc[pivot3b['Total'].le(300)].index

In [None]:
# Totals of the precincts with at most 400 incidents (append .sum() for their combined count)
pivot3b.loc[pivot3b['Total'] <= 400, 'Total']

In [None]:
# Incident counts per community district (rows) and borough (columns),
# with a 'Total' margin row and column
pivot4 = df.pivot_table(
    values = 'ID',
    index = 'Community district',
    columns = 'Borough',
    aggfunc = 'count',
    fill_value = 0,
    margins = True,
    margins_name = 'Total',
)

# Districts with the most incidents first
pivot4.sort_values(by = 'Total', ascending = False)

In [None]:
# Number of distinct values in each column of the renamed frame
df.nunique()

In [None]:
# Incidents handled by battalion 50; compared against the string '50'
# because the column was read as object dtype
df.loc[df['Fire battalion'].eq('50')]

In [None]:
# Re-read the raw file, this time parsing the incident timestamp.
# NOTE: this rebinds `df` to a frame with the ORIGINAL column headers
# (Case_Year, Case Number, Incident_DateTime, ...), not the renamed ones.
df = pd.read_csv(filepath_or_buffer = path, parse_dates = ['Incident_DateTime'])
df.head(n = 5)

In [None]:
# pandas numbers weekdays from Monday = 0, so 1 selects incidents that happened on a Tuesday
cond = df['Incident_DateTime'].dt.dayofweek.eq(1)
df.loc[cond]

In [None]:
# Column dtypes after re-reading the CSV with parse_dates = ['Incident_DateTime']
df.dtypes

In [None]:
# (rows, columns) of the re-read frame
df.shape

In [None]:
# Summary statistics of the re-read frame
df.describe()

In [None]:
# Most recent incident timestamp in the dataset
df['Incident_DateTime'].max()

In [None]:
# NOTE(review): repeats the `df.dtypes` check made right after the re-read;
# no cell in between modifies df, so the output is the same.
df.dtypes

In [None]:
# Distinct case years in ascending order
print(np.sort(df['Case_Year'].unique()))

In [None]:
# Number of distinct values per column, now under the original column headers
df.nunique()

In [None]:
# First 20 rows when ordered by case number and then by incident time,
# to see how rows sharing a case number relate to each other
df.sort_values(['Case Number', 'Incident_DateTime']).iloc[:20]

In [None]:
# Non-null value counts per case number for every column, then the
# case numbers with the most incident timestamps first
groupby1 = df.groupby('Case Number').count()
groupby1.loc[:, 'Incident_DateTime'].sort_values(ascending = False)

In [None]:
# Every row that carries case number 20203, ordered by case year
df.loc[df['Case Number'].eq(20203)].sort_values('Case_Year')