# Project 2: ETL Challenge

### Briana Friendt, Haley Huhtala, Ian Mac Moore and Kaylene Retka

For our project we are using data from the CDC - National Health and Examination Health Survey (2013-2014). We started by exploring the data and determining which of the data sets we wanted to use. Several CSV files were available from the survey data. We selected the demographic data set and examination data set. This project will create a data set 

The columns in each data set had variable names. We started our project by creating a variable database for column names. This allowed us to determine which columns we wanted to keep in our final database. 

In this notebook, we will transform the data by combining cleaned CSV files into a final joined table in Postgres SQL DB.

### Imports

In [1]:
import pandas as pd
from sqlalchemy import create_engine

### Import CSV files and create DataFrames

In [3]:
#demographic_modified_renamed_columns.csv
#examination_modified_renamed_columns.csv

demographic_file = "./Resources/demographic_modified_renamed_columns.csv"
demographic_df = pd.read_csv(demographic_file)
demographic_df.head()

Unnamed: 0,SEQN,Gender,Age,Race,US_Citizen,Years_In_US,Marital_Status,Pregnant,Num_Family,Interview_Weight,Exam_Weight,Income
0,73557,Male,69,4,1.0,,4.0,Unknown,3,13281.23739,13481.0421,4.0
1,73558,Male,54,3,1.0,,1.0,Unknown,4,23682.05739,24471.76963,7.0
2,73559,Male,72,3,1.0,,1.0,Unknown,2,57214.80332,57193.28538,10.0
3,73560,Male,9,3,1.0,,,Unknown,4,55201.17859,55766.51244,9.0
4,73561,Female,73,3,1.0,,1.0,Unknown,2,63709.66707,65541.87123,15.0


In [4]:
examination_file = "./Resources/examination_modified_renamed_columns.csv"
examination_df = pd.read_csv(examination_file)
examination_df.head()

Unnamed: 0,SEQN,BP_Systolic_mmHg,BP_Diastolic_mmHg,Weight_kg,Height_cm,BMI_kg_m2,Dominate_Hand,Grip_Strength_H2_kg,Grip_Strength_H1_kg,Leather_Odor,Natural_Gas_Odor
0,73557,122.0,72.0,78.3,171.3,26.7,Right,18.3,27.5,2.0,4.0
1,73558,156.0,62.0,89.5,176.8,28.6,Right,32.5,27.8,3.0,4.0
2,73559,140.0,90.0,88.9,175.3,28.9,Both,43.1,45.7,2.0,4.0
3,73560,108.0,38.0,32.2,137.3,17.1,Right,13.1,12.2,,
4,73561,136.0,86.0,52.0,162.4,19.7,Left,10.6,16.2,3.0,4.0


### Connect to SQL DB

In [5]:
connection_string = "postgres:postgres@localhost:5432/HealthStudy"
engine = create_engine(f'postgresql://{connection_string}')

In [6]:
# Confirm tables
engine.table_names()

['Leather_Odor_Code',
 'Examination',
 'Natural_Gas_Odor_Code',
 'Race_Code',
 'Demographics',
 'US_Citizen_Code',
 'Years_In_US_Code',
 'Marital_Status_Code',
 'Income_Code',
 'Health_Study']

### Combine DFs and clean data
We'll drop all rows that have NAN, as there'll be sufficient entries for the purpose of this project that contain all data for all columns.

In [20]:
examinationClean_df = examination_df.dropna(how="any",inplace=True)
examinationClean_df

In [21]:
demographicClean_df = demographic_df.dropna(how="any",inplace=True)
demographicClean_df

In [35]:
healthStudy_df = examination_df.join(demographic_df,on="SEQN",how="outer",lsuffix="Exam",rsuffix="Demo")
healthStudy_df

Unnamed: 0,SEQN,SEQNExam,BP_Systolic_mmHg,BP_Diastolic_mmHg,Weight_kg,Height_cm,BMI_kg_m2,Dominate_Hand,Grip_Strength_H2_kg,Grip_Strength_H1_kg,...,Age,Race,US_Citizen,Years_In_US,Marital_Status,Pregnant,Num_Family,Interview_Weight,Exam_Weight,Income
0.0,73557,73557.0,122.0,72.0,78.3,171.3,26.7,Right,18.3,27.5,...,,,,,,,,,,
1.0,73558,73558.0,156.0,62.0,89.5,176.8,28.6,Right,32.5,27.8,...,,,,,,,,,,
2.0,73559,73559.0,140.0,90.0,88.9,175.3,28.9,Both,43.1,45.7,...,,,,,,,,,,
4.0,73561,73561.0,136.0,86.0,52.0,162.4,19.7,Left,10.6,16.2,...,,,,,,,,,,
5.0,73562,73562.0,160.0,84.0,105.0,158.7,41.7,Right,24.6,23.1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,10156,,,,,,,,,,...,34.0,6.0,1.0,4.0,1.0,Unknown,4.0,14919.21878,16535.23660,6.0
,10160,,,,,,,,,,...,80.0,1.0,2.0,6.0,2.0,Unknown,7.0,12010.32530,12031.43972,6.0
,10164,,,,,,,,,,...,52.0,3.0,1.0,3.0,1.0,Unknown,2.0,67937.15346,68456.77185,15.0
,10169,,,,,,,,,,...,40.0,1.0,2.0,5.0,4.0,Unknown,2.0,45268.57270,49177.29100,9.0


In [36]:
healthStudyClean_df = healthStudy_df.drop(columns="SEQNExam")
healthStudyClean_df = healthStudyClean_df.reset_index(drop=True,inplace=False)
healthStudyClean_df


Unnamed: 0,SEQN,BP_Systolic_mmHg,BP_Diastolic_mmHg,Weight_kg,Height_cm,BMI_kg_m2,Dominate_Hand,Grip_Strength_H2_kg,Grip_Strength_H1_kg,Leather_Odor,...,Age,Race,US_Citizen,Years_In_US,Marital_Status,Pregnant,Num_Family,Interview_Weight,Exam_Weight,Income
0,73557,122.0,72.0,78.3,171.3,26.7,Right,18.3,27.5,2.0,...,,,,,,,,,,
1,73558,156.0,62.0,89.5,176.8,28.6,Right,32.5,27.8,3.0,...,,,,,,,,,,
2,73559,140.0,90.0,88.9,175.3,28.9,Both,43.1,45.7,2.0,...,,,,,,,,,,
3,73561,136.0,86.0,52.0,162.4,19.7,Left,10.6,16.2,3.0,...,,,,,,,,,,
4,73562,160.0,84.0,105.0,158.7,41.7,Right,24.6,23.1,3.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4549,10156,,,,,,,,,,...,34.0,6.0,1.0,4.0,1.0,Unknown,4.0,14919.21878,16535.23660,6.0
4550,10160,,,,,,,,,,...,80.0,1.0,2.0,6.0,2.0,Unknown,7.0,12010.32530,12031.43972,6.0
4551,10164,,,,,,,,,,...,52.0,3.0,1.0,3.0,1.0,Unknown,2.0,67937.15346,68456.77185,15.0
4552,10169,,,,,,,,,,...,40.0,1.0,2.0,5.0,4.0,Unknown,2.0,45268.57270,49177.29100,9.0


In [37]:
healthStudyClean_df = healthStudyClean_df.dropna(how="any", inplace=False)
healthStudyClean_df.head()


Unnamed: 0,SEQN,BP_Systolic_mmHg,BP_Diastolic_mmHg,Weight_kg,Height_cm,BMI_kg_m2,Dominate_Hand,Grip_Strength_H2_kg,Grip_Strength_H1_kg,Leather_Odor,...,Age,Race,US_Citizen,Years_In_US,Marital_Status,Pregnant,Num_Family,Interview_Weight,Exam_Weight,Income


### Send DFs to SQL

In [39]:
examination_df.to_sql(name='Examination', con=engine, if_exists='append', index=False)

IntegrityError: (psycopg2.errors.ForeignKeyViolation) insert or update on table "Examination" violates foreign key constraint "fk_Examination_Leather_Odor"
DETAIL:  Key (Leather_Odor)=(2) is not present in table "Leather_Odor_Code".

[SQL: INSERT INTO "Examination" ("SEQN", "BP_Systolic_mmHg", "BP_Diastolic_mmHg", "Weight_kg", "Height_cm", "BMI_kg_m2", "Dominate_Hand", "Grip_Strength_H2_kg", "Grip_Strength_H1_kg", "Leather_Odor", "Natural_Gas_Odor") VALUES (%(SEQN)s, %(BP_Systolic_mmHg)s, %(BP_Diastolic_mmHg)s, %(Weight_kg)s, %(Height_cm)s, %(BMI_kg_m2)s, %(Dominate_Hand)s, %(Grip_Strength_H2_kg)s, %(Grip_Strength_H1_kg)s, %(Leather_Odor)s, %(Natural_Gas_Odor)s)]
[parameters: ({'SEQN': 73557, 'BP_Systolic_mmHg': 122.0, 'BP_Diastolic_mmHg': 72.0, 'Weight_kg': 78.3, 'Height_cm': 171.3, 'BMI_kg_m2': 26.7, 'Dominate_Hand': 'Right', 'Grip_Strength_H2_kg': 18.3, 'Grip_Strength_H1_kg': 27.5, 'Leather_Odor': 2.0, 'Natural_Gas_Odor': 4.0}, {'SEQN': 73558, 'BP_Systolic_mmHg': 156.0, 'BP_Diastolic_mmHg': 62.0, 'Weight_kg': 89.5, 'Height_cm': 176.8, 'BMI_kg_m2': 28.6, 'Dominate_Hand': 'Right', 'Grip_Strength_H2_kg': 32.5, 'Grip_Strength_H1_kg': 27.8, 'Leather_Odor': 3.0, 'Natural_Gas_Odor': 4.0}, {'SEQN': 73559, 'BP_Systolic_mmHg': 140.0, 'BP_Diastolic_mmHg': 90.0, 'Weight_kg': 88.9, 'Height_cm': 175.3, 'BMI_kg_m2': 28.9, 'Dominate_Hand': 'Both', 'Grip_Strength_H2_kg': 43.1, 'Grip_Strength_H1_kg': 45.7, 'Leather_Odor': 2.0, 'Natural_Gas_Odor': 4.0}, {'SEQN': 73561, 'BP_Systolic_mmHg': 136.0, 'BP_Diastolic_mmHg': 86.0, 'Weight_kg': 52.0, 'Height_cm': 162.4, 'BMI_kg_m2': 19.7, 'Dominate_Hand': 'Left', 'Grip_Strength_H2_kg': 10.6, 'Grip_Strength_H1_kg': 16.2, 'Leather_Odor': 3.0, 'Natural_Gas_Odor': 4.0}, {'SEQN': 73562, 'BP_Systolic_mmHg': 160.0, 'BP_Diastolic_mmHg': 84.0, 'Weight_kg': 105.0, 'Height_cm': 158.7, 'BMI_kg_m2': 41.7, 'Dominate_Hand': 'Right', 'Grip_Strength_H2_kg': 24.6, 'Grip_Strength_H1_kg': 23.1, 'Leather_Odor': 3.0, 'Natural_Gas_Odor': 4.0}, {'SEQN': 73564, 'BP_Systolic_mmHg': 118.0, 'BP_Diastolic_mmHg': 80.0, 'Weight_kg': 93.4, 'Height_cm': 161.8, 'BMI_kg_m2': 35.7, 'Dominate_Hand': 'Right', 'Grip_Strength_H2_kg': 23.6, 'Grip_Strength_H1_kg': 16.9, 'Leather_Odor': 3.0, 'Natural_Gas_Odor': 4.0}, {'SEQN': 73566, 'BP_Systolic_mmHg': 128.0, 'BP_Diastolic_mmHg': 74.0, 'Weight_kg': 61.8, 'Height_cm': 152.8, 'BMI_kg_m2': 26.5, 'Dominate_Hand': 'Left', 'Grip_Strength_H2_kg': 16.6, 'Grip_Strength_H1_kg': 19.6, 'Leather_Odor': 3.0, 'Natural_Gas_Odor': 4.0}, {'SEQN': 73567, 'BP_Systolic_mmHg': 140.0, 'BP_Diastolic_mmHg': 78.0, 'Weight_kg': 65.3, 'Height_cm': 172.4, 'BMI_kg_m2': 22.0, 'Dominate_Hand': 'Right', 'Grip_Strength_H2_kg': 19.4, 'Grip_Strength_H1_kg': 22.6, 'Leather_Odor': 3.0, 'Natural_Gas_Odor': 4.0}  ... displaying 10 of 2930 total bound parameter sets ...  {'SEQN': 83724, 'BP_Systolic_mmHg': 164.0, 'BP_Diastolic_mmHg': 70.0, 'Weight_kg': 77.1, 'Height_cm': 176.0, 'BMI_kg_m2': 24.9, 'Dominate_Hand': 'Right', 'Grip_Strength_H2_kg': 37.4, 'Grip_Strength_H1_kg': 41.8, 'Leather_Odor': 3.0, 'Natural_Gas_Odor': 1.0}, {'SEQN': 83729, 'BP_Systolic_mmHg': 136.0, 'BP_Diastolic_mmHg': 82.0, 'Weight_kg': 89.6, 'Height_cm': 162.3, 'BMI_kg_m2': 34.0, 'Dominate_Hand': 'Right', 'Grip_Strength_H2_kg': 24.1, 'Grip_Strength_H1_kg': 14.1, 'Leather_Odor': 1.0, 'Natural_Gas_Odor': 4.0})]
(Background on this error at: http://sqlalche.me/e/gkpj)