In [None]:
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, inspect

from config import username, password

# Diversity Dataset

In [None]:
# Load the raw diversity-index table; the path is relative to the notebook's
# working directory.
diversity_csv = "Resources/diversityindex.csv"
diversity_df = pd.read_csv(diversity_csv)

# Preview only the first rows instead of rendering the whole frame.
diversity_df.head()

In [None]:
# Build a new frame that carries separate County and State columns.
#
# 'Location' is split on the first comma into [county, state]. Rows without a
# comma yield a one-element list, so their State is NaN.
# Iterating/unpacking the `.str` accessor (the previous approach) is not
# supported by pandas >= 1.0, so the pieces are taken by position instead.
location_parts = diversity_df['Location'].str.split(',', n=1)

# .assign returns a new frame, so the raw diversity_df is left untouched.
# .str.strip() removes the space that follows the comma (" AL" -> "AL").
new_df = diversity_df.assign(
    County=location_parts.str[0].str.strip(),
    State=location_parts.str[1].str.strip(),
)

# Drop the rows whose Location had no comma (State is NaN for those).
# NOTE: dropna() with no subset also removes any row that has a missing value
# in any other column.
new_df = new_df.dropna()

new_df.head()

In [None]:
# Raw CSV header -> short lowercase name that is usable as a SQL column name.
sql_column_names = {
    'Diversity-Index': 'diversity_index',
    'Black or African American alone, percent, 2013': 'black',
    'American Indian and Alaska Native alone, percent, 2013': 'native_am',
    'Asian alone, percent, 2013': 'asian',
    'Native Hawaiian and Other Pacific Islander alone, percent,': 'hawaiian',
    'Two or More Races, percent, 2013': 'mixed',
    'Hispanic or Latino, percent, 2013': 'latinx',
    'White alone, not Hispanic or Latino, percent, 2013': 'white',
    'County': 'county',
    'State': 'state',
}

# 'Location' is unnecessary once the county/state columns exist, so drop it
# and apply the SQL-friendly names in one step.
diversity_df = new_df.drop(columns=['Location']).rename(columns=sql_column_names)

# Preview only the first rows instead of rendering the whole frame.
diversity_df.head()

In [None]:

# Mean of every diversity measure per (State, County).
# The grouping is built once and all columns are averaged together, instead of
# repeating the same groupby for each column.
# Keys are the source columns; values are the labels used in the result.
group_mean_labels = {
    "Diversity-Index": "Diversity-Index",
    "Black or African American alone, percent, 2013": "Black",
    "American Indian and Alaska Native alone, percent, 2013": "American Indian",
    "Asian alone, percent, 2013": "Asian",
    "Native Hawaiian and Other Pacific Islander alone, percent,": "Native Hawaiian",
    "Two or More Races, percent, 2013": "Two or More Races",
    "Hispanic or Latino, percent, 2013": "Hispanic/ Latino",
    "White alone, not Hispanic or Latino, percent, 2013": "White",
}

new_df2 = (
    new_df
    .groupby(["State", "County"])[list(group_mean_labels)]
    .mean()
    .rename(columns=group_mean_labels)
)

In [None]:
# Preview the first rows of the per-(State, County) averages held in new_df2.
new_df2.head()

# Unemployment Dataset

In [None]:
# Load the unemployment CSV; the path is relative to the notebook's working
# directory.
unemp_csv = "Resources/unemployment.csv"
unemp_df = pd.read_csv(unemp_csv)

# Preview the first rows of the raw table.
unemp_df.head()

In [None]:
# Keep only the columns that are loaded into the database.
# The explicit .copy() makes this an independent frame, so the later column
# assignment on unemp_df never targets a selection of the raw frame.
unemp_df = unemp_df[['County', 'State', 'Rate']].copy()
unemp_df.head(10)

In [None]:
# Full state name -> two-letter postal abbreviation.
# Covers the 50 states only; there is no entry for the District of Columbia
# or any territory.
us_state_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}


In [None]:
# Clean the unemployment table in one chain:
#   1. replace full state names with their abbreviation, keeping the original
#      value wherever the name is not a key of us_state_abbrev;
#   2. rename the columns for compatibility with SQL;
#   3. keep a single row per (county, state) pair.
unemp_df = (
    unemp_df
    .assign(State=lambda frame: frame['State'].map(us_state_abbrev).fillna(frame['State']))
    .rename(columns={'County': 'county', 'State': 'state', 'Rate': 'rate'})
    .drop_duplicates(subset=['county', 'state'])
)

unemp_df.head(10)

In [None]:
# unemp_df_new = pd.DataFrame({"Unemployment Rate":unemp_df.groupby(["State","County"])["Rate"].mean()})
# unemp_df_new

# Median Income Dataset

In [None]:
# Load the median-income CSV; the path is relative to the notebook's working
# directory.
median_csv = "Resources/medianincome.csv"
median_df = pd.read_csv(median_csv)
# Print the column names, dtypes and non-null counts of the raw table.
median_df.info()

In [None]:
# Print the column names, dtypes and non-null counts of median_df.
median_df.info()

In [None]:
# Source column -> SQL-compatible column name. The keys also define which
# columns are kept, and in what order.
income_column_names = {
    'County': 'county',
    'State Code': 'state',
    'Population': 'population',
    'Median household income': 'median_household_income',
}

# Select the needed columns, then apply the SQL-compatible names.
median_df = median_df[list(income_column_names)].rename(columns=income_column_names)

median_df.head()

In [None]:
# median_df = median_df.rename(columns={"State Code": "State"})


# median_df_new = pd.DataFrame({"Median household income":median_df.groupby(["State","County"])["Median household income"].mean(),
#                               "Population":median_df.groupby(["State","County"])["Population"].sum()})

# median_df_new.head()

# Load to SQL

In [None]:
# Connect to database
# Credentials come from the local config module. They are percent-encoded
# (safe="" also encodes "/") so that characters such as "@", ":" or "/" in a
# username or password cannot be mistaken for URL delimiters.
rds_connection_string = (
    f"{quote(username, safe='')}:{quote(password, safe='')}@localhost:5432/ETL_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# Check for tables
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector interface returns the same list of table names.
inspect(engine).get_table_names()

In [None]:
# Use pandas to load converted dataframes into ETL_db
# Writes median_df to the 'income' table; index=False leaves the DataFrame
# index out of the written columns.
# NOTE: if_exists='append' adds rows to an existing table, so re-running this
# cell inserts the same rows again (or fails if the table enforces uniqueness).
median_df.to_sql(name='income', con=engine, if_exists='append', index=False)

In [None]:
# Writes diversity_df to the 'diversity' table; index=False leaves the
# DataFrame index out of the written columns.
# NOTE: if_exists='append' adds rows to an existing table, so re-running this
# cell inserts the same rows again (or fails if the table enforces uniqueness).
diversity_df.to_sql(name='diversity', con=engine, if_exists='append', index=False)

In [None]:
# Writes unemp_df to the 'unemployment' table; index=False leaves the
# DataFrame index out of the written columns.
# NOTE: if_exists='append' adds rows to an existing table, so re-running this
# cell inserts the same rows again (or fails if the table enforces uniqueness).
unemp_df.to_sql(name='unemployment', con=engine, if_exists='append', index=False)