# USA Name Data - kaggle - test

### The code below pulls and summarizes the data from the USA Name Data (BigQuery Dataset) from kaggle.com

In [3]:
# ******************************************************************************
# Dependencies and Setup
# ******************************************************************************
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, inspect

# ******************************************************************************
# File with your personal/local MySQL user name and password in the following 
# format (this file is not stored in repo to protect personal connection info):
# 
# #####################################
# # Enter local MySQL connection info #
# #####################################
# user = '<MySQL user name>'
# password = '<MySQL user password>'
# 
# ******************************************************************************
import credentials

# ******************************************************************************
# PyMySQL: install_as_MySQLdb() lets PyMySQL stand in for the MySQLdb module
# (presumably what the plain 'mysql://' SQLAlchemy URL used below relies on --
# confirm)
# ******************************************************************************
import pymysql
pymysql.install_as_MySQLdb()

# ******************************************************************************
# Provides access to USA Name Data (BigQuery Dataset) from kaggle.com.
# `client` is the BigQuery client the query cell below runs its SQL through.
# ******************************************************************************
from google.cloud import bigquery
client = bigquery.Client()


ModuleNotFoundError: No module named 'credentials'

In [2]:
# ******************************************************************************
# BigQuery SQL: total births (SUM of `number`) for every state/year pair in the
# USA Name data (from kaggle), sorted by state and then by year
# ******************************************************************************
sql = """
    SELECT state as State_Abbr, year as Birth_Year, SUM(number) as Num_Births
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    GROUP BY Birth_Year, State_Abbr
    ORDER BY State_Abbr, Birth_Year ASC
"""

# ******************************************************************************
# Run the query and collect the result set into a pandas dataframe holding one
# row per state/year pair returned by the query
# ******************************************************************************
query_job = client.query(sql)
usa_name_data_df = query_job.to_dataframe()

# ******************************************************************************
# Report how many rows came back
# ******************************************************************************
num_rows = len(usa_name_data_df)
print(f"There are {num_rows} rows in this dataset")


There are 5508 rows in this dataset


In [2]:
# ******************************************************************************
# Preview the first five rows of the dataset
# ******************************************************************************
usa_name_data_df.head(n=5)

NameError: name 'usa_name_data_df' is not defined

### Connect to local database

In [4]:
# ******************************************************************************
# Establish connection to Birth_State_db MySQL Database
#
# The user name and password are percent-encoded before being placed in the URL
# so that characters such as '@', ':', '/' or '%' in the credentials cannot be
# mistaken for URL delimiters (SQLAlchemy decodes them again when it parses the
# URL).
# ******************************************************************************
db_user = quote(credentials.user, safe='')
db_password = quote(credentials.password, safe='')
rds_connection_string = f'{db_user}:{db_password}@127.0.0.1/Birth_State_db'
engine = create_engine(f'mysql://{rds_connection_string}')

### Check for tables

In [5]:
# ******************************************************************************
# Print the tables to verify that we are connected and the MySQL DB exists.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0; the
# Inspector returns the same list of table names.
# ******************************************************************************
inspect(engine).get_table_names()

['birth_data', 'state_data']

### Use pandas to load the USA Name Data (BigQuery Dataset) DataFrame into the database

In [6]:
# ******************************************************************************
# Append the USA Name dataframe rows to the birth_data table of the
# Birth_State_db MySQL DB (the dataframe index is not written)
# ******************************************************************************
usa_name_data_df.to_sql(
    name='birth_data',
    con=engine,
    index=False,
    if_exists='append',
)

### Read in and store data from the States csv file into a pandas dataframe.

In [7]:
# ******************************************************************************
# Path of the states csv file (a relative path, so it is resolved against the
# current working directory)
# ******************************************************************************
states_file = 'states.csv'

In [8]:
# ******************************************************************************
# Load the states csv file into a pandas dataframe, decoding it as ISO-8859-1
# ******************************************************************************
states_df = pd.read_csv(filepath_or_buffer=states_file, encoding="ISO-8859-1")

In [9]:
# ******************************************************************************
# Preview the first five rows to confirm the csv data made it into the dataframe
# ******************************************************************************
states_df.head(n=5)

Unnamed: 0,State,Abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [10]:
# ******************************************************************************
# Rename the columns to align with the MySQL Birth_State_db, state_data table,
# and keep only those two columns. The csv's leftover "Unnamed: 0" index column
# is dropped here so that to_sql does not try to insert it into the table.
# NOTE(review): presumably state_data only has Full_Name and State_Abbr columns
# -- confirm against the table schema.
# ******************************************************************************
renamed_state_df = (
    states_df
    .rename(columns={"State": "Full_Name", "Abbreviation": "State_Abbr"})
    .loc[:, ["Full_Name", "State_Abbr"]]
)
renamed_state_df.head()

Unnamed: 0,Full_Name,State_Abbr
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


### Use pandas to load the States DataFrame into the database

In [11]:
# ******************************************************************************
# Append the renamed States dataframe rows to the state_data table of the
# Birth_State_db MySQL DB (the dataframe index is not written)
# ******************************************************************************
renamed_state_df.to_sql(
    name='state_data',
    con=engine,
    index=False,
    if_exists='append',
)