# ETL Pipeline Preparation

## Import packages and load datasets

In [1]:
# import libraries
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Read the `messages` dataset from its CSV file and preview the first rows
MESSAGES_CSV = 'messages_0.csv'
messages = pd.read_csv(MESSAGES_CSV)
messages.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct


In [3]:
# Read the `categories` dataset from its CSV file and preview the first rows
CATEGORIES_CSV = 'categories_0.csv'
categories = pd.read_csv(CATEGORIES_CSV)
categories.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...


## Drop duplicates

In [4]:
# Remove fully duplicated rows from `messages`, rebinding the name to the
# de-duplicated frame
messages = messages.drop_duplicates()

In [5]:
# Remove fully duplicated rows from `categories`, rebinding the name to the
# de-duplicated frame
categories = categories.drop_duplicates()

In [6]:
# Confirm that neither dataframe (first `messages`, then `categories`)
# contains any fully duplicated row
for frame in (messages, categories):
    print(frame.duplicated().sum() == 0)

True
True


## Merge dataframes

I want to merge these dataframes on their common id, given by the values in the `id` column of each dataframe.

Are these ids the same between the two dataframes?

In [7]:
# Compare the 'id' columns of `categories` and `messages` element by element,
# in row order
categories['id'].tolist() == messages['id'].tolist()

False

No, they aren't. What's going on here? Are the id values at least equal as sets? That is, is there an id value in one dataframe that isn't in the other?

In [8]:
# Compare the ids as sets, i.e. ignoring order and repetition
message_ids = set(messages['id'])
category_ids = set(categories['id'])
message_ids == category_ids

True

OK, so they are at least equal as sets. So let's check for duplicate ids in both dataframes:

In [9]:
# Count the repeated 'id' values in each dataframe:
# `messages` first, then `categories`
for frame in (messages, categories):
    print(frame['id'].duplicated().sum())

0
36


OK, so there are duplicate id values in the `categories` dataframe but not in the `messages` dataframe.

In [10]:
# keep=False marks every row whose id is repeated (not just the later
# occurrences), so all of the duplicate rows are shown
repeated_id_mask = categories['id'].duplicated(keep=False)
categories[repeated_id_mask].head()

Unnamed: 0,id,categories
162,202,related-1;request-1;offer-0;aid_related-1;medi...
163,202,related-1;request-1;offer-0;aid_related-1;medi...
709,862,related-0;request-0;offer-0;aid_related-0;medi...
710,862,related-1;request-0;offer-0;aid_related-0;medi...
1407,1652,related-1;request-1;offer-0;aid_related-1;medi...


In [11]:
# Look up the message whose id is 862
messages.loc[messages['id'] == 862]

Unnamed: 0,id,message,original,genre
709,862,What is the address of the radio station? I as...,Ki adres radyo a? Paske m bezwen al depoze dos...,direct


Looking at `id == 862`, I can see that one row has this message categorized as `related` and the other does not. This seems like a mistake in data labeling. I'm not sure right now what the best way to address this is, but I may come back to it later.

Now I will go ahead and do an inner join on these dataframes using the common `id`. (I won't lose any information by doing an inner join on the `id`, since the `id` values between the dataframes are the same as sets.)

In [6]:
# Preview only the first rows instead of rendering the whole dataframe
messages.head()

Unnamed: 0,id,message,original,genre
0,3,weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct


In [7]:
# Preview only the first rows instead of rendering the whole dataframe
categories.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...


In [5]:
# Join the dataframes on their common ids.
# `messages` has no repeated ids while `categories` does, so one message may
# match several category rows. validate='one_to_many' makes pandas raise a
# MergeError if the ids on the `messages` side ever stop being unique,
# instead of silently multiplying rows.
df = messages.merge(categories, how='inner', on='id', validate='one_to_many')
df.head()

Unnamed: 0,id,message,original,genre,categories


In [13]:
# Pull out every merged row whose id is 862
df.loc[df['id'] == 862]

Unnamed: 0,id,message,original,genre,categories
708,862,What is the address of the radio station? I as...,Ki adres radyo a? Paske m bezwen al depoze dos...,direct,related-0;request-0;offer-0;aid_related-0;medi...
709,862,What is the address of the radio station? I as...,Ki adres radyo a? Paske m bezwen al depoze dos...,direct,related-1;request-0;offer-0;aid_related-0;medi...


As expected, there are rows with duplicate ids (I just wanted to double-check that the join worked as I thought it would).

## Split 'categories' into separate columns

In [14]:
# Preview the raw 'categories' strings in df
df['categories'].head(n=5)

0    related-1;request-0;offer-0;aid_related-0;medi...
1    related-1;request-0;offer-0;aid_related-1;medi...
2    related-1;request-0;offer-0;aid_related-0;medi...
3    related-1;request-1;offer-0;aid_related-1;medi...
4    related-1;request-0;offer-0;aid_related-0;medi...
Name: categories, dtype: object

Each row has a bunch of categories with associated 0-1 values, separated by semicolons.

In [15]:
# Turn each 'categories' string into a list of 'name-value' tokens by
# splitting it at every semicolon
df['categories'] = df['categories'].str.split(pat=';')

In [16]:
# Take the first row's list of tokens as the source of the category names
cat_names = df['categories'].iat[0]
cat_names[:5]

['related-1', 'request-0', 'offer-0', 'aid_related-0', 'medical_help-0']

In [17]:
# Drop the trailing '-<value>' from each token to recover the category names.
# Splitting on the last hyphen (rather than slicing off two characters) still
# works if a value has more than one digit, and leaves the names untouched
# if this cell is run a second time.
cat_names = [c.rsplit('-', 1)[0] for c in cat_names]
cat_names[:5]

['related', 'request', 'offer', 'aid_related', 'medical_help']

In [18]:
# Replace the 'categories' column entries with lists of the numbers alone
# (no text), converted from strings to integers.
# The value is everything after the last hyphen of a token, so a multi-digit
# value is parsed in full instead of being cut down to its final character.
df['categories'] = df['categories'].apply(
    lambda tokens: [int(t.rsplit('-', 1)[1]) for t in tokens])
df.head()

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,"[1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [19]:
# Expand each list of values into one column per category, labelled with
# `cat_names` and sharing df's index
category_rows = df['categories'].tolist()
df_cat = pd.DataFrame(category_rows, index=df.index, columns=cat_names)
df_cat.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Swap the list-valued 'categories' column for the expanded per-category
# columns: drop the old column, then attach `df_cat` side by side
df = pd.concat([df.drop(columns='categories'), df_cat], axis=1)

In [21]:
# Preview the combined dataframe with one column per category
df.head(n=5)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Save the dataset to a database

In [22]:
# Create a SQLAlchemy engine for a SQLite database named data.db
DATABASE_URL = 'sqlite:///data.db'
engine = create_engine(DATABASE_URL)

I want to create a table named "LabeledMessages" that contains the dataframe df. If the table already exists, I will just append these rows to it. This can be done using `if_exists='append'` in pandas' `to_sql()` method. Note that, because of this, re-running the cell below appends the same rows to the table again.

In [26]:
# Store df in the LabeledMessages table of data.db, without writing the index.
# With if_exists='append', running this cell again adds the same rows to the
# table a second time.
df.to_sql(name='LabeledMessages', con=engine, if_exists='append', index=False)

In [27]:
# Import a construct that allows me to issue textual SQL commands
from sqlalchemy import text

In [28]:
# Read the first two messages back from the table and print each row
command = "SELECT message FROM LabeledMessages LIMIT 2;"
with engine.connect() as conn:
    for row in conn.execute(text(command)):
        print(row)

('Weather update - a cold front from Cuba that could pass over Haiti',)
('Is the Hurricane over or is it not over',)


In [25]:
import threading
import _thread

In [35]:
def exit_timeout():
    """Interrupt the main thread by calling `_thread.interrupt_main()`."""
    _thread.interrupt_main()

In [37]:
# Start a one-second timer whose callback (`exit_timeout`) interrupts the
# main thread, then keep prompting for input until that interrupt arrives.
t = threading.Timer(1.0, exit_timeout)
try:
    t.start()
    while True:
        foo = input('Enter something')
except KeyboardInterrupt:
    # Expected way out of the loop: the timer callback interrupted the main
    # thread. Any other exception is allowed to surface instead of being
    # silently discarded.
    pass
finally:
    # Cancel the timer on every exit path, including unexpected errors.
    t.cancel()

Enter something 0


In [45]:
import _thread
import time

def worker():
    """Interrupt the main thread by calling `_thread.interrupt_main()`."""
    _thread.interrupt_main()

# Run `worker` in a new thread
_thread.start_new_thread(worker, ())

try:
    time.sleep(1)
    foo = input('Enter something')
    time.sleep(5)
except KeyboardInterrupt as e:
    # Reached when the interrupt requested by `worker` is raised in the main thread
    print("the main thread has been interrupted")

the main thread has been interrupted
