# ETL Pipeline Preparation

## Import packages and load datasets

In [1]:
# import libraries
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

In [2]:
# Load the `messages` dataset from the CSV file 'messages.csv'
# (a relative path, so it is resolved against the current working directory)
messages = pd.read_csv('messages.csv')
# Preview the first rows to check the columns that were read
messages.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [3]:
# Load the `categories` dataset from the CSV file 'categories.csv'
# (a relative path, so it is resolved against the current working directory)
categories = pd.read_csv('categories.csv')
# Preview the first rows to check the columns that were read
categories.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...
1,7,related-1;request-0;offer-0;aid_related-1;medi...
2,8,related-1;request-0;offer-0;aid_related-0;medi...
3,9,related-1;request-1;offer-0;aid_related-1;medi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...


## Drop duplicates

In [4]:
# Remove rows of `messages` that are exact copies of an earlier row.
# The name is rebound to an independent copy of the de-duplicated frame
# rather than mutating the loaded frame in place.
messages = messages.drop_duplicates().copy()

In [5]:
# Remove rows of `categories` that are exact copies of an earlier row.
# The name is rebound to an independent copy of the de-duplicated frame
# rather than mutating the loaded frame in place.
categories = categories.drop_duplicates().copy()

In [6]:
# Sanity check: prints True for each dataframe (`messages` first, then
# `categories`) when it no longer contains any fully duplicated row
for frame in (messages, categories):
    print(not frame.duplicated().any())

True
True


What are the shapes of these dataframes?

In [7]:
# Print the (rows, columns) shape of `messages`, then of `categories`
for frame in (messages, categories):
    print(frame.shape)

(26180, 4)
(26216, 2)


Uh oh, there are more rows in the `categories` dataframe than there are in the `messages` dataframe. I'll deal with this next.

## Deal with some data labeling issues

Ultimately, I want to merge these dataframes on their common id, given by the values in the `id` column of each dataframe.

Are these ids the same between the two dataframes?

In [8]:
# Are the 'id' columns of `categories` and `messages` identical, i.e. the
# same values in the same row order?
categories['id'].tolist() == messages['id'].tolist()

False

No, they aren't. What's going on here? Are the id values at least equal as sets? That is, is there an id value in one dataframe that isn't in the other?

In [9]:
# The ids agree as sets exactly when no id occurs in only one of the dataframes
len(set(messages['id']).symmetric_difference(categories['id'])) == 0

True

OK, so they are at least equal as sets. So let's check for duplicate ids in both dataframes:

In [10]:
# Count the rows whose id repeats the id of an earlier row, first for the
# `messages` dataframe and then for the `categories` dataframe
for frame in (messages, categories):
    print(frame['id'].duplicated().sum())

0
36


OK, so there are duplicate id values in the `categories` dataframe but not in the `messages` dataframe.

In [11]:
# keep=False marks every row whose id is repeated, so all of the duplicate
# rows are shown rather than only the later occurrences of each id
repeated_id_mask = categories['id'].duplicated(keep=False)
categories.loc[repeated_id_mask].head()

Unnamed: 0,id,categories
162,202,related-1;request-1;offer-0;aid_related-1;medi...
163,202,related-1;request-1;offer-0;aid_related-1;medi...
709,862,related-0;request-0;offer-0;aid_related-0;medi...
710,862,related-1;request-0;offer-0;aid_related-0;medi...
1407,1652,related-1;request-1;offer-0;aid_related-1;medi...


In [12]:
# Show the row(s) of `messages` whose id is 862
messages.loc[messages['id'].eq(862)]

Unnamed: 0,id,message,original,genre
709,862,What is the address of the radio station? I as...,Ki adres radyo a? Paske m bezwen al depoze dos...,direct


Looking at 'id' = 862, I can see that one row has this message categorized as 'related' and the other does not. This seems like a mistake in data labeling; the message was initially duplicated in `messages` (before I deleted duplicates) and was labeled in two different ways. Since only a few dozen messages out of the ~26,000 have this problem (i.e., having multiple corresponding rows in the `categories` dataframe), I'll just remove them from the dataset.

In [13]:
# Collect the id of every `categories` row whose id occurs more than once
# (keep=False marks all occurrences, so each such id appears repeatedly here)
ids_to_remove = categories.loc[categories['id'].duplicated(keep=False), 'id']

In [14]:
# Keep only the rows of `categories` and `messages` whose id is not listed
# in `ids_to_remove`; each name is rebound to an independent filtered copy
categories = categories.loc[~categories['id'].isin(ids_to_remove)].copy()
messages = messages.loc[~messages['id'].isin(ids_to_remove)].copy()

## Split `categories` into separate columns

In [15]:
# Show the first few rows of `categories`; at this point the 'categories'
# column still holds one semicolon-separated string per row
categories.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...
1,7,related-1;request-0;offer-0;aid_related-1;medi...
2,8,related-1;request-0;offer-0;aid_related-0;medi...
3,9,related-1;request-1;offer-0;aid_related-1;medi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...


Each row has a bunch of categories with associated 0-1 values, separated by semicolons.

In [16]:
# Break each semicolon-separated string in the 'categories' column into a
# list of its parts and store the lists back in the same column
split_tokens = categories['categories'].str.split(';')
categories['categories'] = split_tokens

In [17]:
# Take the list stored in the first row of the 'categories' column as the
# source of the category names; each element is still a 'name-value' token
# (see the output below), so the value suffix has to be removed afterwards
cat_names = categories['categories'].iloc[0]
# Peek at the first five tokens
cat_names[:5]

['related-1', 'request-0', 'offer-0', 'aid_related-0', 'medical_help-0']

In [18]:
# Keep the text before the final '-' of each element to obtain the category
# name. Splitting on the separator instead of slicing off the last two
# characters does not assume a one-character value, and it leaves names that
# are already clean untouched if this cell is executed a second time.
cat_names = [c.rsplit('-', 1)[0] for c in cat_names]
cat_names[:5]

['related', 'request', 'offer', 'aid_related', 'medical_help']

In [19]:
# Replace the 'categories' column entries with lists of the numbers alone (no text),
# being sure to convert them from strings to integers.
# The number is everything after the final '-' of each 'name-value' token, so
# a label with more than one digit is parsed whole rather than being cut down
# to its last character.
categories['categories'] = categories['categories'].apply(
    lambda x: [int(s.rsplit('-', 1)[1]) for s in x])
categories.head()

Unnamed: 0,id,categories
0,2,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,7,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,8,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,9,"[1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,12,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [20]:
# Expand the per-row label lists into one column per category, named after
# `cat_names`, reusing the row labels of `categories` so both frames line up
label_rows = categories['categories'].tolist()
df_cat = pd.DataFrame(label_rows, columns=cat_names, index=categories.index)
df_cat.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Swap the list-valued 'categories' column for the expanded category columns:
# the old column is left out and the columns of `df_cat` are attached
# side by side, all in one expression
categories = pd.concat([categories.drop(columns='categories'), df_cat], axis=1)

In [22]:
# Show the first few rows of `categories`, which now has the 'id' column
# followed by one column per category
categories.head()

Unnamed: 0,id,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Merge dataframes

Now I will go ahead and do an inner join on these dataframes using the common `id`. (I won't lose any information by doing an inner join on the `id`, since the `id` values between the dataframes are the same as sets.)

In [23]:
# Inner-join `messages` (left) with `categories` (right) on the shared 'id'
# column, then preview the combined dataframe
df = pd.merge(messages, categories, on='id', how='inner')
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Prepare for binary classification

The goal is to prepare this data for a set of binary classification tasks: Given each message (translated into English), I will build a model to classify it as having class label 0 (meaning "not in the category") or 1 (meaning "in the category") for each of the categories (columns) in the `categories` dataframe excluding the 'id' column.

Which category columns have some label that is not 0 or 1?

In [24]:
# Collect every category column (the columns named in `cat_names`) that
# contains at least one label outside {0, 1}, printing the distinct labels
# found in each such column
allowed_labels = {0, 1}
cols = []
for c in cat_names:
    unique_vals = set(df[c].unique())
    if unique_vals <= allowed_labels:
        # Only 0/1 labels here, nothing to report
        continue
    cols.append(c)
    print(f'Values in \'{c}\' column: {unique_vals}')

Values in 'related' column: {0, 1, 2}


I'll now drop the rows from `df` that have some value other than 0 or 1 in these columns (for the given data, only the 'related' column).

In [25]:
# Drop the rows with values not in {0, 1}
# A single row mask is accumulated over every column listed in `cols`, so that
#   * rows are removed for all flagged columns, not only the last one examined,
#   * a row that is invalid in several columns is counted (and dropped) once,
#   * an empty `cols` simply drops nothing instead of raising a NameError.
has_invalid_label = pd.Series(False, index=df.index)
for c in cols:
    has_invalid_label |= ~df[c].isin({0, 1})
# Row labels to remove, and a record of how many there are
to_drop = df.index[has_invalid_label.to_numpy()]
num_dropped = len(to_drop)
df = df.drop(index=to_drop)
print(f'{num_dropped} rows dropped')

187 rows dropped


Fortunately, this is still a small fraction of the total rows in the dataframe. It may be worth going back later to see whether this data can be relabeled so that its 'related' labels are 0 or 1.

## Save the dataset to a database

In [29]:
# Create SQLAlchemy engine and a SQLite database named data.db
# (the URL uses a relative path, so the file is located relative to the
# current working directory)
engine = create_engine('sqlite:///data.db')

I want to create a table named "LabeledMessages" that contains the dataframe df. If the table already exists, I will replace it.

In [27]:
# Store `df` as the table LabeledMessages in data.db, replacing any table
# that already has that name and leaving the dataframe index out
df.to_sql(name='LabeledMessages', con=engine, if_exists='replace', index=False)

In [31]:
# Import a construct that allows me to issue textual SQL commands
from sqlalchemy import text

In [32]:
# Read back the first two messages from the LabeledMessages table and print
# each result row; the connection is closed when the `with` block exits
command = "SELECT message FROM LabeledMessages LIMIT 2;"
with engine.connect() as conn:
    result = conn.execute(text(command))
    for row in result:
        print(row)

('Weather update - a cold front from Cuba that could pass over Haiti',)
('Is the Hurricane over or is it not over',)
