In [1]:
import pandas as pd
import altair as alt

In [46]:
# Altair encoding-type names, spelled out once so chart code can refer to them by constant.
ORDINAL = 'ordinal'
TEMPORAL = 'temporal'

In [9]:
# Load the raw disaster records. The first two rows of the file are both read
# as header rows, so the columns come back as a two-level index.
disasters = pd.read_csv(
    'disasters/disasters.csv',
    header=[0, 1],
    encoding='latin-1',
)

In [11]:
# Keep only the first header row as the column labels.
# Selecting level 0 (rather than dropping level 1) gives the same labels for the
# two-level header, and is a no-op when the columns are already flat, so
# re-running this cell does not raise.
disasters.columns = disasters.columns.get_level_values(0)

In [12]:
# Preview the loaded table; the rendered output elides the middle rows.
disasters

Unnamed: 0,ISO3,Name,Year,Start Date,Event Name,Hazard Category,Hazard Type,New Displacements
0,AB9,Abyei Area,2018,2018-07-01,Abyie: Flood - 01/07/2018,Weather related,Flood,2.0
1,AB9,Abyei Area,2019,2019-06-01,Abyei: Flood - southern parts - 01/06/2019,Weather related,Flood,40000.0
2,AFG,Afghanistan,2008,2008-04-17,,Geophysical,Earthquake,3250.0
3,AFG,Afghanistan,2008,2008-08-01,,Weather related,Flood,180.0
4,AFG,Afghanistan,2008,2008-01-01,,Weather related,Extreme temperature,
...,...,...,...,...,...,...,...,...
10180,ZWE,Zimbabwe,2019,2019-02-13,Zimbabwe: Floods- Chiredzi (Masvingo)- 13/02/2019,Weather related,Flood,200.0
10181,ZWE,Zimbabwe,2019,2019-10-07,Zimbabwe: Hailstorm- Masvingo (Zaka)- 07/10/2019,Weather related,Storm,400.0
10182,ZWE,Zimbabwe,2020,2020-02-11,Zimbabwe: Flash flooding- Matabeleland North (...,Weather related,Flood,148.0
10183,ZWE,Zimbabwe,2020,2020-01-17,Zimbabwe: Flash flooding- Matabeleland North (...,Weather related,Flood,35.0


In [30]:
# Column labels of the disasters table, kept as constants so the analysis
# cells never repeat the raw strings.
YEAR = 'Year'
HAZARD_CATEGORY = 'Hazard Category'
HAZARD_TYPE = 'Hazard Type'
NEW_DISPLACEMENTS = 'New Displacements'

In [17]:
# Which distinct values does the hazard-category column take?
# NOTE(review): the saved output of this cell lists hazard *types* ('Flood',
# 'Earthquake', ...) rather than categories ('Weather related', 'Geophysical').
# It appears to predate the current HAZARD_CATEGORY constant; re-run to refresh.
disasters.loc[:, HAZARD_CATEGORY].unique()

array(['Flood', 'Earthquake', 'Extreme temperature', 'Wet mass movement',
       'Storm', 'Dry mass movement', 'Drought', 'Volcanic eruption',
       'Wildfire', nan, 'Mass movement', 'Volcanic activity',
       'Wet Mass movement', 'Severe winter condition',
       'Wet Mass Movement'], dtype=object)

In [14]:
# Count the events that have no hazard category, and their share of all events.
missing_mask = disasters[HAZARD_CATEGORY].isna()
m = missing_mask.sum()
n = disasters.shape[0]
(m, m / n)

(3, 0.0002945508100147275)

In [20]:
# List the rows whose hazard category is missing.
uncategorized = disasters[HAZARD_CATEGORY].isna()
disasters.loc[uncategorized]

Unnamed: 0,ISO3,Name,Year,Start Date,Event Name,Hazard Category,Hazard Type,New Displacements
776,BGD,Bangladesh,2017,2017-04-12,"Bangladesh : Riverbanck erosion - Sirajganj , ...",,,630.0
2424,COL,Colombia,2018,2018-03-02,Colombia: oil spill - Santander - March 2018,,,178.0
9120,USA,United States,2017,2017-08-11,United States : Utah Uintah Fire - Utah - 05/0...,,,309.0


In [19]:
# Displacements attributed to the uncategorized events, in absolute terms and
# as a share of all recorded displacements.
uncategorized = disasters[HAZARD_CATEGORY].isna()
m = disasters.loc[uncategorized, NEW_DISPLACEMENTS].sum()
n = disasters[NEW_DISPLACEMENTS].sum()
(m, m / n)

(1117.0, 3.5053208935342254e-06)

In [23]:
# The uncategorized rows are a negligible share of both events and
# displacements (see the two checks above), so drop them from the analysis.
has_category = ~disasters[HAZARD_CATEGORY].isna()
disasters = disasters[has_category]

In [33]:
# Number of events per year, obtained by counting the non-null category
# values within each year.
category_by_year = disasters.groupby(YEAR)[HAZARD_CATEGORY]
category_by_year.count()

Year
2008     221
2009     262
2010     355
2011     295
2012     211
2013     644
2014     696
2015     599
2016     593
2017     941
2018    1610
2019    1928
2020    1827
Name: Hazard Category, dtype: int64

In [48]:
# Total new displacements recorded in each year.
displacements_by_year = disasters.groupby(YEAR)[NEW_DISPLACEMENTS]
source = displacements_by_year.sum()
source

Year
2008    38240828.0
2009    16732507.0
2010    42350166.0
2011    15024154.0
2012    30145960.0
2013    22129890.0
2014    19122384.0
2015    19192530.0
2016    24217150.0
2017    18777435.0
2018    17181797.0
2019    24854800.0
2020    30687697.0
Name: New Displacements, dtype: float64

In [47]:
# How has reporting changed by year?
# Counting the non-null category values per year gives the number of events
# reported in that year.
source = disasters.groupby(YEAR)[HAZARD_CATEGORY].count()
alt.Chart(source.to_frame().reset_index()).mark_bar().encode(
    alt.X(YEAR, type=ORDINAL),
    # The counted column is still named 'Hazard Category', which would be a
    # misleading axis label for a count, so give the axis an explicit title.
    alt.Y(HAZARD_CATEGORY, title='Number of reported events'),
).properties(
    title='Reported disaster events per year'
)

In [49]:
# Bar chart of the total new displacements captured by the reporting each year.
source = disasters.groupby(YEAR)[NEW_DISPLACEMENTS].sum()
yearly_displacements = source.reset_index()  # named Series -> two-column frame
alt.Chart(yearly_displacements).mark_bar().encode(
    x=alt.X(YEAR, type=ORDINAL),
    y=alt.Y(NEW_DISPLACEMENTS),
)