In [1]:
from sqlalchemy import create_engine
from pathlib import Path
import pandas as pd

In [2]:
# load data from database
# the path is resolved against the current working directory, so that
# directory must contain data/DisasterResponse.db
repo = Path.cwd()
path = repo / 'data' / 'DisasterResponse.db'

# SQLite creates a new, empty database file when the target does not exist,
# which would only surface later as a confusing "no such table" error.
# Fail early with a clear message instead.
if not path.is_file():
    raise FileNotFoundError(f'SQLite database not found: {path}')

engine = create_engine(f'sqlite:///{path}')
# read the whole Message table into a DataFrame and show its (rows, columns)
df = pd.read_sql("select * from Message", con=engine)
df.shape

(25991, 38)

In [3]:
# list every column of the loaded Message table
df.columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'water', 'food', 'shelter', 'clothing', 'money', 'missing_people',
       'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport',
       'buildings', 'electricity', 'tools', 'hospitals', 'shops',
       'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
       'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report',
       'message', 'original', 'genre'],
      dtype='object')

In [7]:
# imbalance in target values: mean of each label column, in percent,
# sorted from most to least frequent
# the label columns are selected by excluding the text columns by name
# rather than by position, so the result does not depend on column order
(
    df.drop(columns=['message', 'original', 'genre'])
    .mean()
    .sort_values(ascending=False)
    * 100
)

related                   76.472625
aid_related               41.710592
weather_related           28.032781
direct_report             19.483667
request                   17.175176
other_aid                 13.239198
food                      11.223116
earthquake                 9.434035
storm                      9.387865
shelter                    8.879997
floods                     8.268247
medical_help               8.006618
infrastructure_related     6.559963
water                      6.421454
other_weather              5.294140
buildings                  5.121003
medical_products           5.044054
transport                  4.613135
death                      4.586203
other_infrastructure       4.428456
refugees                   3.362702
military                   3.304990
search_and_rescue          2.785580
money                      2.320034
electricity                2.046862
cold                       2.031472
security                   1.812166
clothing                   1

The data is severely imbalanced. Apart from 'related' (76%), 'aid_related' (42%) and 'weather_related' (28%), every category has fewer than 20 percent of observations in the positive class, and most have fewer than 10 percent.

Further, 'related' is skewed in the other direction: more than three-quarters of observations are labelled as 'related'. In fact, where 'related' is not equal to 1, every other category column is 0. 'related' is simply a flag for whether the message is disaster-related or not.

In [9]:
# mean of every label column over the rows where 'related' is not 1
# the label columns are selected by excluding the text columns by name
# rather than by position, so the result does not depend on column order
(
    df.loc[df.related != 1]
    .drop(columns=['message', 'original', 'genre'])
    .mean()
    .sort_values(ascending=False)
)

related                   0.0
other_infrastructure      0.0
buildings                 0.0
electricity               0.0
tools                     0.0
hospitals                 0.0
shops                     0.0
aid_centers               0.0
weather_related           0.0
infrastructure_related    0.0
floods                    0.0
storm                     0.0
fire                      0.0
earthquake                0.0
cold                      0.0
other_weather             0.0
transport                 0.0
other_aid                 0.0
request                   0.0
military                  0.0
offer                     0.0
aid_related               0.0
medical_help              0.0
medical_products          0.0
search_and_rescue         0.0
security                  0.0
water                     0.0
death                     0.0
food                      0.0
shelter                   0.0
clothing                  0.0
money                     0.0
missing_people            0.0
refugees  

In [8]:
# messages of the rows where 'related' is not 1; the column is selected by
# name instead of by a positional index, in a single .loc lookup
df.loc[df.related != 1, 'message']

5                   Information about the National Palace-
8          I would like to receive the messages, thank you
11       I am in Petionville. I need more information r...
17       are you going to call me or do you want me to ...
18          I don't understand how to use this thing 4636.
                               ...                        
25975    WHO is recruiting a sanitary engineer / consul...
25979    Cadmium, a metallic element widely used in bat...
25984    However while ECOWAS wanted him to lead a 12-m...
25986    The training demonstrated how to enhance micro...
25987    A suitable candidate has been selected and OCH...
Name: message, Length: 6115, dtype: object

Any classifier will have to address the severe imbalance in the data.