# ETL Project

In [52]:
# ETL_Project_TK_ES
# data extraction, transform, and load project on births and baby names in USA
# Trevor Kulbeth
# Eric Staveley
# MWSa Cohort
#
# Prerequisite: the MySQL driver must be installed before running, e.g.   %pip install mysqlclient

In [53]:
import datetime as dt
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [54]:
#EXTRACT, TRANSFORM, LOAD
#source of the extract step: a CSV file of births data
data_file = "births.csv"

In [55]:
#read the births CSV into a DataFrame, report how many rows it has, then preview it
df_from_csv = pd.read_csv(filepath_or_buffer=data_file)
print(f"Raw data births record count: {df_from_csv.shape[0]}")
df_from_csv.head(n=5)

Raw data births record count: 3652


Unnamed: 0,year,month,date_of_month,day_of_week,births
0,1994,1,1,6,8096
1,1994,1,2,7,7772
2,1994,1,3,1,10142
3,1994,1,4,2,11248
4,1994,1,5,3,11053


In [56]:
#clean up the raw births DataFrame

#report how many populated (non-NA) cells each column holds
print(f"Non-NA values per column:\n{df_from_csv.count()}")

#keep only the rows in which every field is present
df_from_csv_clean = df_from_csv.dropna(axis=0, how="any")

print(f"Total of rows with complete data now: {df_from_csv_clean.shape[0]}")



Non-NA values per column:
year             3652
month            3652
date_of_month    3652
day_of_week      3652
births           3652
dtype: int64
Total of rows with complete data now: 3652


In [57]:
#make a full calendar date for each row from its year / month / date_of_month parts
#pd.to_datetime assembles dates from columns named year, month and day in a single
#vectorized call (no Python-level dt.date() call per row, and it also accepts
#float-typed columns, which dt.date() rejects); .dt.date then converts the
#timestamps to plain datetime.date objects, the same value type as before.
#assign() returns a new frame, so the dropna() result is not modified in place
#and re-running this cell gives the same result.
date_parts = df_from_csv_clean[['year', 'month', 'date_of_month']].rename(columns={'date_of_month': 'day'})
df_from_csv_clean = df_from_csv_clean.assign(Birthdate=pd.to_datetime(date_parts).dt.date)

In [58]:
#preview the cleaned frame, now including the Birthdate column
df_from_csv_clean.head(n=5)

Unnamed: 0,year,month,date_of_month,day_of_week,births,Birthdate
0,1994,1,1,6,8096,1994-01-01
1,1994,1,2,7,7772,1994-01-02
2,1994,1,3,1,10142,1994-01-03
3,1994,1,4,2,11248,1994-01-04
4,1994,1,5,3,11053,1994-01-05


In [59]:
#peek at the dt date ranges with data
print(f"Earliest date with data: {min(df_from_csv_clean['Birthdate'])}")
print(f"Latest date with data: {max(df_from_csv_clean['Birthdate'])}")

Earliest date with data: 1994-01-01
Latest date with data: 2003-12-31


In [60]:
#reorganize so order is Birthdate, births, year, month, date_of_month, day_of_week
#select the columns in the order wanted for loading
ordered_columns = [
    'Birthdate',
    'births',
    'year',
    'month',
    'date_of_month',
    'day_of_week',
]
temp_births_df = df_from_csv_clean[ordered_columns]


In [61]:
#preview the reordered frame
temp_births_df.head(n=5)

Unnamed: 0,Birthdate,births,year,month,date_of_month,day_of_week
0,1994-01-01,8096,1994,1,1,6
1,1994-01-02,7772,1994,1,2,7
2,1994-01-03,10142,1994,1,3,1
3,1994-01-04,11248,1994,1,4,2
4,1994-01-05,11053,1994,1,5,3


In [62]:
#rename the columns to the upper-case names used by the birthdays table
#only the columns are renamed: the previous index=str argument also converted the
#integer row index into string labels, which nothing needs (the index is not
#written to the database) and which breaks integer label lookups such as .loc[0]
births_df = temp_births_df.rename(
    columns={
        "Birthdate": "BIRTHDATE",
        "births": "NUM_BIRTHS",
        "year": "YEAR",
        "month": "MONTH",
        "date_of_month": "DATE_OF_MONTH",
        "day_of_week": "DAY_OF_WEEK",
    }
)

In [63]:
#preview the frame that will be loaded into the database
births_df.head(n=5)

Unnamed: 0,BIRTHDATE,NUM_BIRTHS,YEAR,MONTH,DATE_OF_MONTH,DAY_OF_WEEK
0,1994-01-01,8096,1994,1,1,6
1,1994-01-02,7772,1994,1,2,7
2,1994-01-03,10142,1994,1,3,1
3,1994-01-04,11248,1994,1,4,2
4,1994-01-05,11053,1994,1,5,3


# Ensure the MySQL database schema was pre-constructed via MySQL Workbench:


Perform this step outside of this notebook, before executing any of the code below that accesses the database.   The SQL below was used in MySQL Workbench to construct the database schema:

-- Create and use database:    baby_names_and_birthdays_db

CREATE DATABASE baby_names_and_birthdays_db;
USE baby_names_and_birthdays_db;

-- birthdays table
-- Create tables for raw data to be loaded into
-- id is AUTO_INCREMENT: the DataFrame loaded by the notebook carries no id column,
-- so MySQL has to generate the primary key value for every inserted row
CREATE TABLE birthdays (
  id INT AUTO_INCREMENT PRIMARY KEY,
  BIRTHDATE datetime,
  NUM_BIRTHS integer,
  YEAR integer,
  MONTH integer,
  DATE_OF_MONTH integer,
  DAY_OF_WEEK integer);
  
  
-- 
  

# Connect To Local Database (guest account)

In [87]:
# Connection settings are read from environment variables when they are set, so a
# real password does not have to be saved in the notebook; the fallbacks are the
# local guest account this notebook was developed against.
db_user = os.environ.get("BABY_DB_USER", "guest")
db_password = os.environ.get("BABY_DB_PASSWORD", "guest123#")
db_host = os.environ.get("BABY_DB_HOST", "127.0.0.1")
db_name = os.environ.get("BABY_DB_NAME", "baby_names_and_birthdays_db")

# URL-escape the user and password so special characters ('#', '@', '/', ...)
# cannot be misread as part of the URL structure.
rds_connection_string = f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}/{db_name}"
engine = create_engine(f'mysql://{rds_connection_string}')

# Check for tables seen after connection

In [88]:
# List the tables visible through this connection.
# Uses the inspector API because Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
inspect(engine).get_table_names()

['birthdays']

# Use pandas to load csv-converted DataFrame (births_df) into database table birthdays

In [83]:
# Append the prepared rows to the existing birthdays table; the DataFrame index is
# not written (index=False). Because the mode is 'append', running this cell again
# inserts the same rows a second time.
births_df.to_sql(
    name='birthdays',
    con=engine,
    if_exists='append',
    index=False,
)

# Confirm data has been added by querying the birthdays table

In [89]:
# Read the table back from the database and preview its first rows.
pd.read_sql_query(sql='select * from birthdays', con=engine).head(n=5)

Unnamed: 0,id,BIRTHDATE,NUM_BIRTHS,YEAR,MONTH,DATE_OF_MONTH,DAY_OF_WEEK
0,1,1994-01-01,8096,1994,1,1,6
1,2,1994-01-02,7772,1994,1,2,7
2,3,1994-01-03,10142,1994,1,3,1
3,4,1994-01-04,11248,1994,1,4,2
4,5,1994-01-05,11053,1994,1,5,3
