In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import re

%matplotlib inline
init_notebook_mode(connected=True) 

# Physical Housing Characteristics for Occupied Housing Units

In [None]:
# Lookup from full state name to its two-letter postal code.
# Besides the 50 states it also contains the District of Columbia and Puerto Rico.
# The cleanup helpers in this notebook use it (via Series.map) to fill the 'Code' column.
state_abbreviations = {
'Alabama': 'AL',
'Alaska': 'AK',
'Arizona': 'AZ',
'Arkansas': 'AR',
'California': 'CA',
'Colorado': 'CO',
'Connecticut': 'CT',
'Delaware': 'DE',
'District of Columbia': 'DC',
'Florida': 'FL',
'Georgia': 'GA',
'Hawaii': 'HI',
'Idaho': 'ID',
'Illinois': 'IL',
'Indiana': 'IN',
'Iowa': 'IA',
'Kansas': 'KS',
'Kentucky': 'KY',
'Louisiana': 'LA',
'Maine': 'ME',
'Maryland': 'MD',
'Massachusetts': 'MA',
'Michigan': 'MI',
'Minnesota': 'MN',
'Mississippi': 'MS',
'Missouri': 'MO',
'Montana': 'MT',
'Nebraska': 'NE',
'Nevada': 'NV',
'New Hampshire': 'NH',
'New Jersey': 'NJ',
'New Mexico': 'NM',
'New York': 'NY',
'North Carolina': 'NC',
'North Dakota': 'ND',
'Ohio': 'OH',
'Oklahoma': 'OK',
'Oregon': 'OR',
'Pennsylvania': 'PA',
'Rhode Island': 'RI',
'South Carolina': 'SC',
'South Dakota': 'SD',
'Tennessee': 'TN',
'Texas': 'TX',
'Utah': 'UT',
'Vermont': 'VT',
'Virginia': 'VA',
'Washington': 'WA',
'West Virginia': 'WV',
'Wisconsin': 'WI',
'Wyoming': 'WY',
'Puerto Rico': 'PR'
}

In [None]:
def convert_value(value):
    """Parse one raw table cell into a number.

    Parameters
    ----------
    value : str or any
        A cell value. Strings may contain thousands separators (``"1,234"``),
        a percent sign (``"12.5%"``), or a plain decimal (``"2.5"``).
        Non-string values (e.g. numbers or NaN already parsed by pandas) are
        returned unchanged.

    Returns
    -------
    int or float
        * percentages are returned as a fraction (``"50%"`` -> ``0.5``);
        * whole numbers are returned as ``int``;
        * other numeric strings are returned as ``float``.

    Raises
    ------
    ValueError
        If a string is not numeric after removing commas and ``%``.
    """
    # pandas hands back non-string objects for cells it already parsed
    # (numbers, NaN); `'%' in value` would raise TypeError on those.
    if not isinstance(value, str):
        return value

    text = value.strip().replace(',', '')
    if '%' in text:
        return float(text.replace('%', '')) / 100  # Convert percentage to a decimal

    try:
        return int(text)
    except ValueError:
        # Not a whole number: accept decimals such as "2.5". Still raises
        # ValueError for genuinely non-numeric text.
        return float(text)

In [None]:
def clean_house_char_headers(val):
    """Shorten a housing-table column header to ``<prefix><suffix>``.

    The prefix is the text before the first ``"!!"`` in the header. The suffix
    is chosen by the first rule below whose marker text occurs in the header
    (case-sensitive substring match); if no rule matches, only the prefix is
    returned. Non-string headers are returned unchanged.
    """
    if not isinstance(val, str):
        return val

    prefix = val.split("!!")[0]

    # Order matters: rules are tried top to bottom and the first hit wins.
    suffix_rules = (
        ('Occupied', "_total"),
        ('Percent occupied housing units', "_total_percent"),
        ('Owner-occupied housing', "_owner"),
        ('Percent owner-occupied housing units', "_own_percent"),
        ('Renter-occupied housing units', "_renter"),
        ('Percent renter-occupied', "_rent_percent"),
    )
    for marker, suffix in suffix_rules:
        if marker in val:
            return prefix + suffix
    return prefix

In [None]:
house_char_data = pd.read_csv('../app/data/Physical_Housing_Occup.csv', index_col=0)
house_char_data = house_char_data.rename(columns=clean_house_char_headers)
house_char_data.head()

In [None]:
units_in_struc = house_char_data.iloc[[2,3,4,5,6,7,8]]
units_in_struc.head()

In [None]:
def data_cleanup(df):
    """Reshape a wide housing table into three long DataFrames.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns are expected to be named ``"<state>_<category>"`` (the name is
        split at the first underscore). Row labels become the ``Value`` column
        and every cell is parsed with ``convert_value``.

    Returns
    -------
    (df_total, df_owner, df_renter) : tuple of pandas.DataFrame
        One frame each for the ``total``, ``owner`` and ``renter`` categories,
        with columns ``State``, ``Value``, ``Count`` and ``Code`` (the state's
        abbreviation looked up in ``state_abbreviations``; NaN if unknown).
        A category that has no data yields an empty frame with those columns.

    Notes
    -----
    Columns whose name contains no underscore carry no category and are
    skipped. Other categories (e.g. percentage columns) are parsed but not
    returned.
    """
    # Collect the cleaned cells as {state: {category: {row_label: value}}}.
    by_state = {}
    for column, rows in df.to_dict().items():
        state, sep, category = str(column).partition("_")
        if not sep:
            # No "<state>_<category>" structure; unpacking a split() here
            # used to raise an opaque ValueError.
            continue
        bucket = by_state.setdefault(state, {}).setdefault(category, {})
        for label, raw in rows.items():
            key = label.strip() if isinstance(label, str) else label
            bucket[key] = convert_value(raw)

    # Build the long frame for a single category.
    def create_df(category):
        frames = {
            state: pd.DataFrame.from_dict(categories[category], 'index')
            for state, categories in by_state.items()
            if categories.get(category)
        }
        if not frames:
            # pd.concat raises on an empty mapping; return an empty,
            # correctly-shaped frame instead.
            return pd.DataFrame(columns=['State', 'Value', 'Count', 'Code'])
        # concat on a dict yields a (state, row_label) MultiIndex; flatten it.
        long_df = pd.concat(frames, axis=0).reset_index()
        long_df.columns = ['State', 'Value', 'Count']
        long_df['Code'] = long_df['State'].map(state_abbreviations)
        return long_df

    df_total = create_df('total')
    df_owner = create_df('owner')
    df_renter = create_df('renter')

    return df_total, df_owner, df_renter

In [None]:
df_total, df_owner, df_renter = data_cleanup(units_in_struc)

In [None]:
df_total.head()

In [None]:
df_owner.head()

In [None]:
df_renter.head()

In [None]:
year_struc = house_char_data.iloc[[10,11,12,13,14,15,16]]
year_struc.head()

In [None]:
df_total_yr, df_owner_yr, df_renter_yr = data_cleanup(year_struc)

In [None]:
df_total_yr.head()

In [None]:
def graph_pie(df, state: str, data_year: str):
    """Display a pie chart for a single state.

    Rows of ``df`` whose ``State`` equals ``state`` are plotted with one slice
    per ``Value``, sized by ``Count``. ``data_year`` only appears in the title.
    """
    state_rows = df.loc[df['State'] == state]
    pie = px.pie(
        state_rows,
        values='Count',
        names='Value',
        title=f'{data_year} Average age of homes in {state}',
    )
    pie.show()

graph_pie(df_total_yr, 'Virginia', '2022')

In [None]:
df_owner_yr.head()

In [None]:
df_renter_yr.head()

In [None]:
rooms = house_char_data.iloc[[18,19,20,21,22]]
rooms.head()

In [None]:
df_room_total, df_room_owner, df_room_renter = data_cleanup(rooms)

In [None]:
df_room_total.head()

In [None]:
df_room_owner.head()

In [None]:
df_room_renter.head()

In [None]:
bedroom = house_char_data.iloc[[24,25,26,27]]
bedroom.head()

In [None]:
df_bed_total, df_bed_owner, df_bed_renter = data_cleanup(bedroom)

In [None]:
df_bed_total.head()

In [None]:
df_bed_owner.head()

In [None]:
df_bed_renter.head()

In [None]:
target_state_own = df_bed_owner[df_bed_owner['State']=='Virginia']
target_state_rent = df_bed_renter[df_bed_renter['State']=='Virginia']

In [None]:
# Tag each subset as homeowners or renters and stack them.
# assign() returns new frames, so the filtered slices created in the previous
# cell are not modified in place (which triggers pandas' SettingWithCopyWarning)
# and this cell gives the same result when re-run.
df = pd.concat([
    target_state_own.assign(Type='Homeowners'),
    target_state_rent.assign(Type='Renters'),
])

# Create the grouped bar graph: one bar per bedroom category and tenure type
fig = px.bar(df, x='Value', y='Count', color='Type', barmode='group', 
             facet_row='State', labels={'Count':'Count', 'Value':'Number of Bedrooms'}, 
             title='Comparison of Homeowners and Renters')

fig.show()

In [None]:
vehicles = house_char_data.iloc[[32,33,34,35]]
vehicles.head()

In [None]:
df_car_total, df_car_owner, df_car_renter = data_cleanup(vehicles)

In [None]:
df_car_total.head()

In [None]:
df_car_owner.head()

In [None]:
df_car_renter.head()

In [None]:
house_heat_fuel = house_char_data.iloc[[39,40,41,42,43,44,45]]
house_heat_fuel.tail()

In [None]:
df_heat_total, df_heat_owner, df_heat_renter = data_cleanup(house_heat_fuel)

In [None]:
df_heat_total.head()

In [None]:
df_heat_owner.head()

In [None]:
df_heat_renter.head()

In [None]:
df_total.head()

In [None]:
df_total.groupby('State').describe()

In [None]:
# Sum only the numeric 'Count' column per state code. A bare .sum() would also
# aggregate the string columns 'State' and 'Value' (concatenating the strings in
# pandas 2.x, or warning/dropping them in older versions).
total_occupied_housing = df_total.groupby('Code')[['Count']].sum()

In [None]:
total_occupied_housing = total_occupied_housing.reset_index()

In [None]:
total_occupied_housing.head()

In [None]:
def make_map(df, data_year: str):
    """Render a US choropleth of ``df['Count']`` keyed by ``df['Code']``.

    ``df`` must provide a ``Code`` column (used as the plotly ``USA-states``
    location) and a ``Count`` column (used as the colour value).
    ``data_year`` is only used in the figure title.
    """
    trace = {
        'type': 'choropleth',
        'colorscale': 'Portland',
        'locations': df['Code'],
        'locationmode': 'USA-states',
        'z': df['Count'],
        'colorbar': {'title': 'Occupied housing units'},
    }

    figure_layout = {
        'title': {'text': f'{data_year} US Occupied housing units', 'x': 0.5, 'xanchor': 'center'},
        'geo': {'scope': 'usa'},
        'autosize': True,
        'width': 800,
        'height': 600,
    }

    iplot(go.Figure(data=[trace], layout=figure_layout))

In [None]:
make_map(total_occupied_housing, '2022')

## Segment states by predominant housing type - cluster analysis on housing structure types by state

In [None]:
df_total.head()

In [None]:
#Get dominant house type and return state and headers
df_max = df_total.loc[df_total.groupby('State')['Count'].idxmax()]

print(df_max)

In [None]:
fig = px.scatter(df_max, y="Value", x="Count", color="State")
#fig.update_traces(marker_size=10)
fig.show()

## Fertility rates by state

In [None]:
def clean_headers(val):
    """Shorten a fertility-table column header to ``<prefix><suffix>``.

    The prefix is the text before the first ``"!!"``. The suffix comes from the
    first rule below whose marker occurs in the header (case-sensitive
    substring match); with no match only the prefix is returned. Non-string
    headers are returned unchanged.
    """
    if not isinstance(val, str):
        return val

    prefix = val.split("!!")[0]

    # Tried in order; the first matching marker decides the suffix.
    suffix_rules = (
        ('Total', "_total"),
        ('Women with births in the past 12 months!!Number!!Estimate', "_births"),
        ('Women with births in the past 12 months!!Rate per 1,000 women!!Estimate', "_thou"),
    )
    for marker, suffix in suffix_rules:
        if marker in val:
            return prefix + suffix
    return prefix

In [None]:
fert_data = pd.read_excel('../app/data/fertility_data.xlsx', index_col=0)
fert_data = fert_data.rename(columns=clean_headers)

fert_data.head()

In [None]:
def fert_data_cleanup(df, year:str):
    """Reshape a wide fertility table into three long DataFrames.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns are expected to be named ``"<state>_<category>"`` (the name is
        split at the first underscore). Row labels become the ``Value``
        column; cell values are used as-is.
    year : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    (df_total, df_birth, df_thou) : tuple of pandas.DataFrame
        One frame each for the ``total``, ``births`` and ``thou`` categories,
        with columns ``State``, ``Value``, ``Count`` and ``Code`` (the state's
        abbreviation looked up in ``state_abbreviations``; NaN if unknown).
        A category that has no data yields an empty frame with those columns.

    Notes
    -----
    Columns whose name contains no underscore carry no category and are
    skipped.
    """
    # Collect the cells as {state: {category: {row_label: value}}}.
    by_state = {}
    for column, rows in df.to_dict().items():
        state, sep, category = str(column).partition("_")
        if not sep:
            # No "<state>_<category>" structure; unpacking a split() here
            # used to raise an opaque ValueError.
            continue
        bucket = by_state.setdefault(state, {}).setdefault(category, {})
        for label, value in rows.items():
            key = label.strip() if isinstance(label, str) else label
            bucket[key] = value

    # Build the long frame for a single category.
    def create_df(category):
        frames = {
            state: pd.DataFrame.from_dict(categories[category], 'index')
            for state, categories in by_state.items()
            if categories.get(category)
        }
        if not frames:
            # pd.concat raises on an empty mapping; return an empty,
            # correctly-shaped frame instead.
            return pd.DataFrame(columns=['State', 'Value', 'Count', 'Code'])
        # concat on a dict yields a (state, row_label) MultiIndex; flatten it.
        long_df = pd.concat(frames, axis=0).reset_index()
        long_df.columns = ['State', 'Value', 'Count']
        long_df['Code'] = long_df['State'].map(state_abbreviations)
        return long_df

    df_total = create_df('total')
    df_birth = create_df('births')
    df_thou = create_df('thou')

    return df_total, df_birth, df_thou

In [None]:
# One single-row slice of fert_data per survey year (variable suffix = two-digit year).
# The positional row numbers step by 11 (1, 12, 23, ...), i.e. the same row within each
# consecutive 11-row block of the sheet. There is no slice for 2020.
# NOTE(review): presumably each 11-row block is one year's table and this is its
# births row -- confirm against the spreadsheet layout.
# iloc[[n]] (list indexer) keeps the result a one-row DataFrame rather than a Series.
births_data_22 = fert_data.iloc[[1]]
births_data_21 = fert_data.iloc[[12]]
births_data_19 = fert_data.iloc[[23]]
births_data_18 = fert_data.iloc[[34]]
births_data_17 = fert_data.iloc[[45]]
births_data_16 = fert_data.iloc[[56]]
births_data_15 = fert_data.iloc[[67]]
births_data_14 = fert_data.iloc[[78]]
births_data_13 = fert_data.iloc[[89]]
births_data_12 = fert_data.iloc[[100]]
births_data_11 = fert_data.iloc[[111]]
births_data_10 = fert_data.iloc[[122]]


In [None]:
# List of data and corresponding years
data_years = [(births_data_22, '2022'), (births_data_21, '2021'), (births_data_19, '2019'), 
              (births_data_18, '2018'), (births_data_17, '2017'), (births_data_16, '2016'), 
              (births_data_15, '2015'), (births_data_14, '2014'), (births_data_13, '2013'), 
              (births_data_12, '2012'), (births_data_11, '2011'), (births_data_10, '2010')]

def consolidate_dataframe(data_years: list):
    """Run ``fert_data_cleanup`` for every (data, year) pair and combine the results.

    Parameters
    ----------
    data_years : list of (DataFrame, str)
        Pairs of a raw slice and the year label it belongs to.

    Returns
    -------
    tuple
        ``(all_years_df, fert_pop_dict, birth_dict, birth_thou_dict, birth_df)``:

        * ``all_years_df`` - every year's second cleanup frame stacked, with a ``Year`` column;
        * ``fert_pop_dict`` - year -> first cleanup frame;
        * ``birth_dict`` - year -> second cleanup frame (with ``Year`` added);
        * ``birth_thou_dict`` - year -> third cleanup frame (with ``Year`` added);
        * ``birth_df`` - every year's third cleanup frame stacked, with a ``Year`` column.
    """
    fert_pop_dict, birth_dict, birth_thou_dict = {}, {}, {}
    yearly_births, yearly_rates = [], []

    for yearly_rows, year_label in data_years:
        population, births, rates = fert_data_cleanup(yearly_rows, year_label)

        # Stamp the year on the frames in place so the per-year dicts and the
        # stacked results both carry it.
        births['Year'] = year_label
        rates['Year'] = year_label

        fert_pop_dict[year_label] = population
        birth_dict[year_label] = births
        birth_thou_dict[year_label] = rates

        yearly_births.append(births)
        yearly_rates.append(rates)

    # ignore_index gives the stacked frames a fresh 0..n-1 index.
    all_years_df = pd.concat(yearly_births, ignore_index=True)
    birth_df = pd.concat(yearly_rates, ignore_index=True)

    return all_years_df, fert_pop_dict, birth_dict, birth_thou_dict, birth_df


In [None]:
df, fert_pop_dict, birth_dict, birth_thou_dict, birth_df = consolidate_dataframe(data_years)

In [None]:
df.tail()

In [None]:
fig = px.line(df, x='Year', y='Count', color='State', title='Births by Year and State')
fig.show()

In [None]:
birth_df.head()

In [None]:
birth_df.tail()

In [None]:
df_sum = df.groupby('Year')['Count'].sum().reset_index()
fig = px.line(df_sum, x='Year', y='Count', title='Total Births by year in United States')
fig.show()

In [None]:
target_state = df[df['State']=='Virginia']

fig = px.line(target_state, x='Year', y='Count', title='Births by Year in Virginia')
fig.show()

In [None]:
# One single-row slice of fert_data per survey year, taken at positional rows
# 10, 21, 32, ... (step of 11). There is no slice for 2020.
# NOTE(review): presumably this is the labor-force/employment row of each year's
# 11-row block -- confirm against the spreadsheet layout.
# iloc[[n]] (list indexer) keeps the result a one-row DataFrame rather than a Series.
labor_2022 = fert_data.iloc[[10]]
labor_2021 = fert_data.iloc[[21]]
labor_2019 = fert_data.iloc[[32]]
labor_2018 = fert_data.iloc[[43]]
labor_2017 = fert_data.iloc[[54]]
labor_2016 = fert_data.iloc[[65]]
labor_2015 = fert_data.iloc[[76]]
labor_2014 = fert_data.iloc[[87]]
labor_2013 = fert_data.iloc[[98]]
labor_2012 = fert_data.iloc[[109]]
labor_2011 = fert_data.iloc[[120]]
labor_2010 = fert_data.iloc[[131]]

In [None]:
labor_data_years = [(labor_2022, '2022'), (labor_2021, '2021'), (labor_2019, '2019'), 
              (labor_2018, '2018'), (labor_2017, '2017'), (labor_2016, '2016'), 
              (labor_2015, '2015'), (labor_2014, '2014'), (labor_2013, '2013'), 
              (labor_2012, '2012'), (labor_2011, '2011'), (labor_2010, '2010')]

In [None]:
# consolidate_dataframe returns five values; unpacking only four raised
# "ValueError: too many values to unpack". The fifth is the stacked 'thou' frame.
labor_df, lab_fert_pop_dict, lab_birth_dict, lab_birth_thou_dict, lab_birth_rate_df = consolidate_dataframe(labor_data_years)
#labor_df.rename(columns={'Count': 'Employed'}, inplace=True)

In [None]:
labor_df.tail()

In [None]:
target_state_lab = labor_df[labor_df['State']=='Virginia']

fig = px.line(target_state_lab, x='Year', y='Count', title='Women who gave birth in employment for Virginia')
fig.show()

In [None]:
consol_df = pd.concat([df,labor_df])

In [None]:
consol_df.tail()

In [None]:
target_state = consol_df[consol_df['State']=='Virginia']
# Create the line chart
fig = px.line(target_state, x='Year', y='Count', color='Value', title='Births and Employment by Year and State')
fig.show()

In [None]:
target_state = consol_df[consol_df['State']=='California']
# Create the line chart
fig = px.line(target_state, x='Year', y='Count', color='Value', title='Births and Employment by Year and State')
fig.show()

In [None]:
# Merge the two dataframes
merged_df = pd.merge(df, labor_df, on=['State', 'Code', 'Year'], suffixes=('_birth', '_labor'))

# Calculate the spread
merged_df['Spread'] = merged_df['Count_birth'] - merged_df['Count_labor']

fig = px.line(merged_df, x='Year', y='Spread', color='State', title='Spread of Births and Labor by Year and State')
fig.show()


In [None]:
target_state = merged_df[merged_df['State']=='California']
fig = px.line(target_state, x='Year', y='Spread', color='State', title='Spread of Births and Labor by Year and State')
fig.show()

In [None]:
fem_age_22 = fert_data.iloc[[2,3,4]]
fem_age_22.head()

In [None]:
fem_age_21 = fert_data.iloc[[13,14, 15]]
fem_age_21.head()

In [None]:
fem_age_19 = fert_data.iloc[[24, 25, 26]]
fem_age_19.head()

In [None]:
fem_age_18 = fert_data.iloc[[35,36,37]]
fem_age_18.head()

In [None]:
fem_age_17 = fert_data.iloc[[46, 47, 48]]
fem_age_17.head()

In [None]:
fem_age_16 = fert_data.iloc[[57,58,59]]
fem_age_16.head()

In [None]:
fem_age_15 = fert_data.iloc[[68,69,70]]
fem_age_15.head()

In [None]:
fem_age_14 = fert_data.iloc[[79,80,81]]
fem_age_14.head()

In [None]:
fem_age_13 = fert_data.iloc[[90, 91, 92]]
fem_age_13.head()

In [None]:
fem_age_12 = fert_data.iloc[[101,102,103]]
fem_age_12.head()

In [None]:
fem_age_11 = fert_data.iloc[[112,113,114]]
fem_age_11.head()

In [None]:
fem_age_10 = fert_data.iloc[[123,124,125]]
fem_age_10.head()

In [None]:
fem_stat_2022 = fert_data.iloc[[6,7]]

fem_stat_2022.head()