# Setup and define variables

In [2]:
import pandas as pd
import numpy as np
import requests
import pyodbc
import arcpy
# This is using Andy's Census API KEy
census_api_key = '9a73d08c296b844e58f1c70bd19c831826da5cbf'

# Need to define datatypes so that FIPS code doesn't get cast as int and drop leading 0s
dtypes = {
    'YEAR' : str,
    'STATE': str,
    'GEOGRAPHY': str,
    'GEOID': str,
    'TRPAID':str,
    'NEIGHBORHOOD': str
}

#Manually defined list of census tracts that are within the basin
 
#This should be redundant once we get to the new tahoe census feature class
#Generated a GEO? that is a combination of the GEOID and Census year. This is how we join to tblDemographics to determine what's in the basin


server = 'xxxx'
database = 'xxxx'
username = 'xxxx'
password = 'xxxx'
query = 'Select YEAR, STATE, GEOGRAPHY, GEOID, TRPAID, NEIGHBORHOOD from sde.Tahoe_Census_Geography'

conn_str = f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"

# Establish a connection to the SQL Server
conn = pyodbc.connect(conn_str)

# Execute the query and fetch the results
tahoe_geometry = pd.read_sql_query(query, conn, dtype=dtypes)

# Close the database connection
conn.close()




## Define the census download functions

In [4]:
#Helper function that is used to concatenate census data return
def create_or_append_df(df, summary_df):
    if df.empty:
        df = summary_df.copy()
    else:
        df = pd.concat([df, summary_df])
    return df

#Thus gets the result of the get request and does some data wrangling to make it fit our structure
def get_request_census(request_url, sample_level, geo_name):
    response = requests.get(request_url)
            
    df = pd.DataFrame(response.json())
    #The json returns column names in the first row
    df.columns = df.iloc[0]
    df = df[1:]
    df['sample_level']=sample_level
    df['Geo_Name']=geo_name
    #Might as well add counties and states at this stage
    return df



def get_variable_data(year, dataset, geometry_return, variable, variablename, census_api_key, census_geom_year, tahoe_geometry, variable_category):
    #Returns all data for a given dataset for Washoe, El Dorado, Carson City, Douglas, Placer Counties
    #Need to make five seperate api calls because of the geometry structure
    county_states ={
        '06': ['017','061'],
        '32': ['005', '031', '510']
    }
    base_url = 'https://api.census.gov/data'
    df_total=pd.DataFrame()
    #Formatting to match html get request
    geometry_return=geometry_return.replace(" ", "%20")
    #This adds tract level to make block groups or blocks get request valid
    if geometry_return == 'tract':
        geometry_level = ''
    else:
        geometry_level='%20tract:*'
    if 'acs/acs5' in dataset:
        variable= variable +'E,'+variable + 'M'

    
    for state in county_states:
        for county in county_states[state]:
            print(f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for={geometry_return}:*&in=state:{state}%20county:{county}{geometry_level}&key={census_api_key}')
            request_url = f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for={geometry_return}:*&in=state:{state}%20county:{county}{geometry_level}&key={census_api_key}'
            response = requests.get(request_url)
            
            df = pd.DataFrame(response.json())
            #The json returns column names in the first row
            df.columns = df.iloc[0]
            df = df[1:]
            #Might as well add counties and states at this stage
            if df_total.empty:
                df_total=df
            else:
                df_total=pd.concat([df_total, df])
    #Figure out exactly what variable we want here
    #Add something here to handle margin of error
    df_total['variable_code']=variable
    df_total['variable_name']=variablename
    df_total['variable_category']= variable_category
    df_total['year_sample']=year
    df_total['sample_level']=geometry_return.replace("%20", " ")
    df_total['dataset']= dataset
    df_total['census_geom_year'] = census_geom_year
    df_total['GEO_ID'] = df_total['GEO_ID'].str.split('US').str[1]
    df_total['TRPAID'] = df_total['GEO_ID']+df_total['census_geom_year'].astype(str)
    df_total.columns.values[1] = 'value'
    df_total['value'] = df_total['value'].astype(float)
    if 'acs/acs5' in dataset:
        df_total.columns.values[2]='MarginOfError'
        df_total['variable_code'] = df_total['variable_code'].str.split(',').str[0]
    else:
        df_total.insert(2, 'MarginOfError', np.NaN)
    if geometry_return == 'tract':
        tract_col_loc = df_total.columns.get_loc('tract')
        df_total.insert(tract_col_loc, 'block group', np.NaN)

    #filter to just the tahoe parcels
    df_total = df_total[df_total['TRPAID'].isin(tahoe_geometry['TRPAID'])]
    df_total =  pd.merge(df_total, tahoe_geometry[['TRPAID', 'NEIGHBORHOOD']], on='TRPAID', how= 'left')
    
    return df_total

def get_non_tahoe_data(year,dataset, variable, variablename, census_api_key, census_geom_year, variable_category):
    base_url = 'https://api.census.gov/data'
    df_total=pd.DataFrame()
    county_states ={
        '06': ['017','061'],
        '32': ['005', '031', '510']
    }
    state_names={
        '06':'CA',
        '32':'NV'
    }
    county_names={
        '017':'El Dorado County',
        '061':'Placer County',
        '005':'Douglas County',
        '031':'Washoe County',
        '510':'Carson City County'
    }
    #Need to update this so that it handles the different years - are 2010 and 2020 the same?
    urban_centers = {
        'Reno-Sparks MSA':'39900',
        'Sacramento MSA': '40900',   
    }
    combined_metro_areas={
        'Sanfranciso CMSA': '488'
    }
    urban_centers_2000 = {
        'Reno-Sparks MSA':'6720',
        'Sacramento MSA': '6922',   
    }
    combined_metro_areas_2000={
        'Sanfranciso CMSA': '7362'
    }
    if year=="2000":
        for urban_center in urban_centers_2000:
            urban_center_code = urban_centers_2000[urban_center]
            statistical_region_url = f'metropolitan%20statistical%20area/micropolitan%20statistical%20area'
            print(f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for={statistical_region_url}:{urban_center_code}&key={census_api_key}')
            request_url= f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for={statistical_region_url}:{urban_center_code}&key={census_api_key}'
            df = get_request_census(request_url,'MSA',urban_center)
            df_total = create_or_append_df(df_total,df)
    else:        
        for urban_center in urban_centers:
            urban_center_code = urban_centers[urban_center]
            print(f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:{urban_center_code}&key={census_api_key}')
            request_url = f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:{urban_center_code}&key={census_api_key}'            
            df = get_request_census(request_url,'MSA', urban_center)
            df_total = create_or_append_df(df_total, df)    
        for cma in combined_metro_areas:
            cma_code = combined_metro_areas[cma]
            print(f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=combined%20statistical%20area:{cma_code}&key={census_api_key}')
            request_url = f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=combined%20statistical%20area:{cma_code}&key={census_api_key}'
            df = get_request_census(request_url, 'MSA', cma)
            df_total = create_or_append_df(df_total, df)
    for state in county_states:
        for county in county_states[state]:
            #https://api.census.gov/data/2010/dec/sf1?get=GEO_ID,P001001&for=county:017&in=state:06&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
            print(f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=county:{county}&in=state:{state}&key={census_api_key}')
            request_url = f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=county:{county}&in=state:{state}&key={census_api_key}'
            countyname = county_names[county]
            df = get_request_census(request_url, 'County', countyname)
            df_total = create_or_append_df(df_total, df)
    for state in county_states:
        #https://api.census.gov/data/2010/dec/sf1?get=GEO_ID,P001001&for=county:017&in=state:06&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
        print(f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=state:{state}&key={census_api_key}')
        request_url = f'{base_url}/{year}/{dataset}?get=GEO_ID,{variable}&for=state:{state}&key={census_api_key}'
        geoname = state_names[state]
        df = get_request_census(request_url,'State', geoname)
        df_total = create_or_append_df(df_total, df)
        
    #Figure out exactly what variable we want here
    df_total['variable_code']=variable
    df_total['variable_name']=variablename
    df_total['variable_category']= variable_category
    df_total['year_sample']=year
    df_total['dataset']= dataset
    df_total['census_geom_year'] = census_geom_year
    df_total['GEO_ID'] = df_total['GEO_ID'].str.split('US').str[1]
    df_total['GEO_CODE'] = df_total['GEO_ID']+df_total['census_geom_year'].astype(str)
    df_total.columns.values[1] = 'value'
    return df_total

def census_download_wrapper (variable_file):
    dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
    }


    variables = pd.read_csv(variable_file,dtype=dtypes)

    #Loop through this?
    df_values=pd.DataFrame()
    for index, row in variables.iterrows():
        print(index)
        df = get_variable_data(row['Year'], row['Datasource Name'],row['GeometryLevel'],row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
        
        df_values = create_or_append_df(df_values, df)
    return df_values

## Define the processing functions

In [5]:


def calculate_median_value(df, bin_column, sort_column, count_column, category_field, category,grouping_variables):
        # Create a new DataFrame to avoid modifying the original one
    #change value to count column
    #Do we need to handle excluding non-tahoe things here or should we do it in the input?
    summary_df = df.copy()
    summary_df[count_column]=summary_df[count_column].astype(int)
    #summary_df=summary_df.loc[summary_df[count_column]!='510']
    summary_df = summary_df.groupby(grouping_variables)[count_column].sum()
    summary_df = summary_df.reset_index()
    #This handles -6666 values that they sometimes add for unknown data
    summary_df = summary_df.loc[summary_df[count_column]>=0]
    
    
    summary_df= summary_df.loc[summary_df[category_field]==category]
    # Sort the DataFrame based on the variable name column
    # This depends on the fact that census variables start with the lowest value and go up 
    #This needs to all be rethought to have some kind of window function to handle multiple years
    summary_df.sort_values(by=sort_column, inplace=True)
    summary_df = summary_df.reset_index()
    
    # Extract lower and upper limits from bin categories
    #This uses regex to find numbers and removes commas to make numbers numbers 
    pattern = r'(\d+[\d,]*)'
    summary_df['temp'] = summary_df[bin_column].str.replace(',', '').str.findall(pattern)
    #Looks for values that have two numbers and puts empty placeholders for the ones that only have one (upper and lower)
    summary_df[['Lower', 'Upper']] = summary_df['temp'].apply(lambda x: pd.Series(x[:2]) if len(x) == 2 else pd.Series([None, None]))
    summary_df['Lower'] = summary_df['Lower'].astype(float)
    summary_df['Upper'] = summary_df['Upper'].astype(float)
    # Handle first category
    
    first_upper = float(summary_df['temp'].iloc[0][0])
    #convert string to float
    print(first_upper)
    summary_df.loc[0, 'Lower'] = 0  # Set lower value to 0
    summary_df.loc[0, 'Upper'] = first_upper
    
    # Handle last category
    
    last_lower = float(summary_df['temp'].iloc[-1][0])
    print(last_lower)
    summary_df.loc[summary_df.index[-1], 'Lower'] = last_lower
    summary_df.loc[summary_df.index[-1], 'Upper'] = np.inf  # Set upper value to infinity
    summary_df[count_column]= summary_df[count_column].astype(float)   
    # Calculate cumulative count
    summary_df['Cumulative_Count'] = summary_df[count_column].cumsum()
    
    # Calculate total count
    total_count = summary_df[count_column].sum()
    
    # Find the bin where the cumulative count exceeds or equals half of the total count
    median_bin = summary_df.loc[summary_df['Cumulative_Count'] >= total_count / 2, bin_column].iloc[0]
    print(median_bin)
    # Estimate the median value
    #We need to pull the median income
    median_bin_index = summary_df[summary_df['Cumulative_Count'] >= total_count / 2].index[0]
    print(median_bin_index)
    previous_cumulative_count = summary_df.loc[median_bin_index-1, 'Cumulative_Count']
    print(previous_cumulative_count)
    previous_upper = summary_df.loc[summary_df['Cumulative_Count'] == previous_cumulative_count, 'Upper'].iloc[0]
    
    next_cumulative_count = summary_df.loc[median_bin_index + 1, 'Cumulative_Count']
    print(next_cumulative_count)
    next_lower = summary_df.loc[median_bin_index+1, 'Lower']
    print(total_count)

    median_bin_count = summary_df.loc[median_bin_index, 'Cumulative_Count']
    # Interpolate the median value within the bin range
    #this is the amount of the range that it takes to get to the exact median
    cumulative_diff = total_count / 2 - previous_cumulative_count
    print(cumulative_diff)
    #this is the number of people it takes to get to the exact median divided by the number of people in the bin
    interpolation_ratio = cumulative_diff /  (median_bin_count- previous_cumulative_count)
    print(interpolation_ratio)
    #this is the upper range of the previous bin times the ratio of the bin that should be applied times the size of the bin
    median_value = previous_upper + interpolation_ratio * (next_lower - previous_upper)
    
    # Add the estimated median value to the new DataFrame
    summary_df['Estimated_Median'] = median_value
    print(median_value)
    
    
    return summary_df

def load_variable_multiple_year(year_range, dataset, geometry_return, variable, variablename, census_api_key, census_geom_year, tahoe_geometry, variable_category):
    df=pd.DataFrame()
    df_return=pd.DataFrame()
    #year_range = [str(num) for num in range(year_start, year_end+1)]
    for year in year_range:
        if year in ['2020', '2021', '2022']:
            census_geom_year = '2020'
        else:
            census_geom_year = '2010'
        df = get_variable_data(year,dataset,geometry_return,variable,variablename,census_api_key, census_geom_year, tahoe_geometry, variable_category)
        if df_return.empty:
            df_return=df
        else:
            df_return=pd.concat([df_return, df])
    return df_return

def categorize_values(census_df, category_csv, category_column, grouping_prefix):
    categories = pd.read_csv(category_csv)    
    census_df['value'] = census_df['value'].astype(float)
    joined_data = census_df.merge(categories, on = 'variable_code', how = 'left')
    joined_data.sort_values(by='variable_code', inplace=True)
    #This will get rid of any extra columns in the category_csv
    group_columns = [column for column in census_df if column not in ['value', 'variable_code', 'variable_name', 'MarginOfError','OBJECTID']]
    group_columns.append(category_column)
    #grouped_data = joined_data.groupby(group_columns, as_index=False)['value'].sum()    
    print(group_columns)
    grouped_data = joined_data.groupby(group_columns, as_index=False, dropna=False).agg({'value':'sum',
                                                                           'variable_code':lambda x: grouping_prefix +  ', '.join(x)})
    
    #Need to return this formatted for appending to the table - need to get locations of variable_code and variable name, 
    #add them in as columns in those locations and then populate them with category column nanme
    var_code_col_location = census_df.columns.get_loc('variable_code')
    var_name_col_location = census_df.columns.get_loc('variable_name')
    var_moe_col_location = census_df.columns.get_loc('MarginOfError')
    grouped_data.insert(var_moe_col_location, 'MarginOfError', '')
    #grouped_data.insert(var_code_col_location, 'variable_code','Grouped Value')
    grouped_data.insert(var_name_col_location, 'variable_name','')
    #grouped_data['variable_code'] = grouped_data['variable_code'] +  '_Grouped'
    grouped_data['variable_name'] = grouped_data[category_column]
    grouped_data['dataset']= grouping_prefix + grouped_data['dataset']
    grouped_data['variable_category']= grouping_prefix +  grouped_data['variable_category'] 
    columns_to_keep = [column for column in census_df if column not in ['OBJECTID']]
    grouped_data= grouped_data[columns_to_keep]
    return grouped_data


# Download data sets

# Import 1990 data

## Download all contents of folder



In [None]:
#Folder of csvs with variables to be downloaded
folder_path = 'Census_Variable_Lists/'

# Create an empty dictionary to store DataFrames
variable_lists = {}

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a CSV file
    if filename.endswith('.csv'):
        # Construct the full path to the CSV file
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file into a DataFrame
        df_name = os.path.splitext(filename)[0]  # Extract the name without the '.csv' extension
        variable_lists[df_name] = pd.read_csv(file_path, dtype=dtypes)

In [None]:
print(variable_lists.keys())

data_downloads = {}

for df_name, df in variable_lists.items():
    # Perform the substring substitution in the DataFrame name
    new_df_name = df_name.replace('variables', 'values')
    
    # Store the DataFrame with the updated name in the new dictionary
    data_downloads[new_df_name] = pd.DataFrame()
    for index, row in df.iterrows():
        print(index)
        df1 = get_variable_data(row['Year'], row['Datasource Name'],'block group',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
        
        data_downloads[new_df_name] = create_or_append_df(data_downloads[new_df_name], df1)
    

#Loop through this dictionary, create a dataframe using those names and run the downloads for each one

## Download Data Sets

### Download core acs 2020 block group data

In [None]:
acs_bg_2020 = census_download_wrapper('census_variables_acs_blockgroup_2020.csv')

In [None]:
acs_tract_2020 = census_download_wrapper('census_variables_acs_tract_2020.csv')

In [None]:
acs_bg_homevalue_2020 = census_download_wrapper('Census_Variable_Lists\census_variables_acs_blockgroup_Median Home Values.csv')

In [11]:
acs_bg_homevalue_2020.to_excel("acs_bg_homevalue_2020.xlsx")

In [None]:
acs_tract_homevalue_2020 = census_download_wrapper('Census_Variable_Lists\census_variables_acs_tract_home_values.csv')
acs_tract_homevalue_2020.to_excel("acs_tract_homevalue_2020.xlsx")

## Download 2021 acs tract data

In [None]:
acs_tract_2020_1 = census_download_wrapper('Census_Variable_Lists\census_variables_acs_tract_2020_1.csv')
acs_tract_2020_1.to_excel("Census_Data_Downloads/acs_tract_2020_1.xlsx", index=False)

In [4]:
acs_internet_2021 = census_download_wrapper('Census_Variable_Lists\census_variables_internet.csv')
acs_internet_2021.to_excel("Census_Data_Downloads/acs_internet_2021.xlsx", index=False)

0
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B28002_013E,B28002_013M&for=block%20group:*&in=state:06%20county:017%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B28002_013E,B28002_013M&for=block%20group:*&in=state:06%20county:061%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B28002_013E,B28002_013M&for=block%20group:*&in=state:32%20county:005%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B28002_013E,B28002_013M&for=block%20group:*&in=state:32%20county:031%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B28002_013E,B28002_013M&for=block%20group:*&in=state:32%20county:510%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B28002_001E,B28002_001M&for=block%20group:*&in=state:06%20

In [None]:
acs_tract_2020_2 = census_download_wrapper('Census_Variable_Lists\census_variables_acs_tract_2020_2.csv')
acs_tract_2020_2.to_excel("Census_Data_Downloads/acs_tract_2020_2.xlsx", index=False)

In [3]:
acs_tract_2010_3 = census_download_wrapper('Census_Variable_Lists\census_variables_acs_tract_2010_3.csv')
acs_tract_2010_3.to_excel("Census_Data_Downloads/acs_tract_2010_3.xlsx", index=False)

0
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_014E,B19001_014M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_014E,B19001_014M&for=tract:*&in=state:06%20county:061&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_014E,B19001_014M&for=tract:*&in=state:32%20county:005&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_014E,B19001_014M&for=tract:*&in=state:32%20county:031&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_014E,B19001_014M&for=tract:*&in=state:32%20county:510&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_015E,B19001_015M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?

In [4]:
acs_tract_2010_4 = census_download_wrapper('Census_Variable_Lists\census_variables_acs_tract_2010_4.csv')
acs_tract_2010_4.to_excel("Census_Data_Downloads/acs_tract_2010_4.xlsx", index=False)

0
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_017E,B19001_017M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_017E,B19001_017M&for=tract:*&in=state:06%20county:061&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_017E,B19001_017M&for=tract:*&in=state:32%20county:005&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_017E,B19001_017M&for=tract:*&in=state:32%20county:031&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_017E,B19001_017M&for=tract:*&in=state:32%20county:510&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2010/acs/acs5?get=GEO_ID,B19001_002E,B19001_002M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2010/acs/acs5?

In [4]:
dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
}


acs_tract_2020_variables = pd.read_csv("Census_Variable_Lists\census_variables_acs_tract_2020.csv",dtype=dtypes)

#Loop through this?
acs_tract_2020=pd.DataFrame()
for index, row in acs_tract_2020_variables.iterrows():
    print(index)
    df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    
    acs_tract_2020= create_or_append_df(acs_tract_2020, df)

0
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:06%20county:061&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:32%20county:005&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:32%20county:031&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:32%20county:510&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_002E,B19052_002M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?

: 

: 

## download 2000 Census Data

In [None]:
dem_tract_2020 = census_download_wrapper('Census_Variable_Lists\demographic_variables_2000.csv')
dem_tract_2020.to_excel("dem_tract_2020_first_half.xlsx")

In [None]:
dem_tract_2020_2 = census_download_wrapper('Census_Variable_Lists\demographic_variables_2000_second.csv')
dem_tract_2020_2.to_excel("dem_tract_2020_second.xlsx")

## Download 2010 DEC Census Data

In [None]:
dem_tract_2010 = census_download_wrapper('Census_Variable_Lists\census_variables_dec_tract_2010.csv')
dem_tract_2010.to_excel("Census_Data_Downloads\dem_tract_2010.xlsx")

In [None]:
dem_tract_2010_age = census_download_wrapper('Census_Variable_Lists\census_variables_dec_tract_2010_age.csv')
dem_tract_2010_age.to_excel("Census_Data_Downloads\dem_tract_2010_age.xlsx")

In [5]:
dem_tract_2000_age = census_download_wrapper('Census_Variable_Lists\census_variables_dec_tract_2000_age.csv')
dem_tract_2000_age.to_excel("Census_Data_Downloads\dem_tract_2000_age.xlsx")

0
https://api.census.gov/data/2000/dec/sf1?get=GEO_ID,P012002&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2000/dec/sf1?get=GEO_ID,P012002&for=tract:*&in=state:06%20county:061&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2000/dec/sf1?get=GEO_ID,P012002&for=tract:*&in=state:32%20county:005&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2000/dec/sf1?get=GEO_ID,P012002&for=tract:*&in=state:32%20county:031&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2000/dec/sf1?get=GEO_ID,P012002&for=tract:*&in=state:32%20county:510&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2000/dec/sf1?get=GEO_ID,P012003&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2000/dec/sf1?get=GEO_ID,P012003&for=tract:*&in=state:06%20county:061&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
ht

## Download 2020 Dec Census Data

In [None]:
dem_tract_2020 = census_download_wrapper('Census_Variable_Lists\census_variables_dec_tract_2020.csv')
dem_tract_2020.to_excel("Census_Data_Downloads\dem_tract_2020.xlsx", index=False)

In [None]:
dem_tract_2020_age = census_download_wrapper('Census_Variable_Lists\census_variables_dec_tract_2020_age.csv')
dem_tract_2020_age.to_excel("Census_Data_Downloads\dem_tract_2020_age.xlsx", index=False)

In [None]:
housing_additional = census_download_wrapper('Census_Variable_Lists\census_variables_housing_add.csv')
housing_additional.to_excel("Census_Data_Downloads\housing_additional.xlsx", index=False)

In [6]:
housing_additional_acs = census_download_wrapper('Census_Variable_Lists\census_variables_housing_add_full.csv')
housing_additional_acs.to_excel("Census_Data_Downloads\housing_additional_acs.xlsx", index=False)

0
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B25004_006E,B25004_006M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B25004_006E,B25004_006M&for=tract:*&in=state:06%20county:061&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B25004_006E,B25004_006M&for=tract:*&in=state:32%20county:005&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B25004_006E,B25004_006M&for=tract:*&in=state:32%20county:031&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B25004_006E,B25004_006M&for=tract:*&in=state:32%20county:510&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B25004_006E,B25004_006M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?

In [3]:
dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
}


demographic_variables_2020_bg_extra = pd.read_csv("Census_Variable_Lists\census_variables_acs_blockgroup_2021_extra.csv",dtype=dtypes)

#Loop through this?
demographic_values_2020_bg_extra=pd.DataFrame()
for index, row in demographic_variables_2020_bg_extra.iterrows():
    print(index)
    df = get_variable_data(row['Year'], row['Datasource Name'],'block group',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    
    demographic_values_2020_bg_extra = create_or_append_df(demographic_values_2020_bg_extra, df)

0
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B19001_001E,B19001_001M&for=block%20group:*&in=state:06%20county:017%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B19001_001E,B19001_001M&for=block%20group:*&in=state:06%20county:061%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B19001_001E,B19001_001M&for=block%20group:*&in=state:32%20county:005%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B19001_001E,B19001_001M&for=block%20group:*&in=state:32%20county:031%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B19001_001E,B19001_001M&for=block%20group:*&in=state:32%20county:510%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19001_001E,B19001_001M&for=block%20group:*&in=state:06%20

In [4]:
demographic_values_2020_bg_extra.to_excel('demographic_values_2021_bg_extra.xlsx')

In [50]:
dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
}


demographic_variables_2020_bg_disability = pd.read_csv("Census_Variable_Lists\census_variables_acs_blockgroup_2021_Disability.csv",dtype=dtypes)

#Loop through this?
demographic_values_2020_bg_disability=pd.DataFrame()
for index, row in demographic_variables_2020_bg_disability.iterrows():
    print(index)
    df = get_variable_data(row['Year'], row['Datasource Name'],'block group',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    
    demographic_values_2020_bg_disability = create_or_append_df(demographic_values_2020_bg_disability, df)

0
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B23024_001E,B23024_001M&for=block%20group:*&in=state:06%20county:017%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B23024_001E,B23024_001M&for=block%20group:*&in=state:06%20county:061%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B23024_001E,B23024_001M&for=block%20group:*&in=state:32%20county:005%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B23024_001E,B23024_001M&for=block%20group:*&in=state:32%20county:031%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B23024_001E,B23024_001M&for=block%20group:*&in=state:32%20county:510%20tract:*&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2021/acs/acs5?get=GEO_ID,B23024_003E,B23024_003M&for=block%20group:*&in=state:06%20

In [51]:
demographic_values_2020_bg_disability.to_excel("demographic_values_2021_bg_disability.xlsx")

In [None]:
data_downloads['census_values_acs_blockgroup_2021'].to_excel("census_values_acs_blockgroup_2021.xlsx")


In [69]:
dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
}


demographic_variables_2020_tract = pd.read_csv("Census_Variable_Lists\census_variables_acs_tract_2020.csv",dtype=dtypes)

#Loop through this?
demographic_values_2020_tract=pd.DataFrame()
for index, row in demographic_variables_2020_bg.iterrows():
    print(index)
    df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    
    demographic_values_2020_tract = create_or_append_df(demographic_values_2020_tract, df)

0
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:06%20county:061&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:32%20county:005&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:32%20county:031&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_001E,B19052_001M&for=tract:*&in=state:32%20county:510&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
1
https://api.census.gov/data/2020/acs/acs5?get=GEO_ID,B19052_002E,B19052_002M&for=tract:*&in=state:06%20county:017&key=9a73d08c296b844e58f1c70bd19c831826da5cbf
https://api.census.gov/data/2020/acs/acs5?

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [None]:
acs_income_data = data_downloads['census_values_acs_blockgroup_2021'].loc[data_downloads['census_values_acs_blockgroup_2021']['variable_category']=='Household Income']
grouped_acs_income_data = categorize_values(acs_income_data, 'acs_income_categories.csv','Census Category', 'TRPA Census Report Grouped: ')
grouped_acs_income_data['sample_level']=grouped_acs_income_data['sample_level'].replace("%20", " ")
grouped_acs_income_data.to_excel('grouped_acs_income_data.xlsx')
#acs_age_data = demographic_values_2020.loc[(demographic_values_2020['variable_category']=='Age') & (demographic_values_2020['dataset']=='acs/acs5')]

In [76]:
acs_age_data = data_downloads['census_values_acs_blockgroup_2021'].loc[data_downloads['census_values_acs_blockgroup_2021']['variable_category']=='Age']
grouped_acs_age_data = categorize_values(acs_age_data, 'acs_age_categories.csv','Broad Category', 'TRPA Broad Age Categories Grouped: ')
grouped_acs_age_data['sample_level']=grouped_acs_age_data['sample_level'].replace("%20", " ")
grouped_acs_age_data.to_excel('acs_age_data.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census_df['value'] = census_df['value'].astype(float)


In [73]:
acs_disability_data = demographic_values_2020_bg_disability.loc[demographic_values_2020_bg_disability['variable_category']=='Disability By Work']
grouped_acs_disability_data = categorize_values(acs_disability_data,'acs_disability_categories.csv', 'Broad Category', 'TRPA Disability Categories Grouped: ')
grouped_acs_disability_data['sample_level']= grouped_acs_disability_data['sample_level'].replace("%20", " ")
grouped_acs_disability_data.to_excel('grouped_acs_disability_data.xlsx')

In [None]:
arcpy.env.workspace = 'C:/path/to/your/geodatabase.gdb'

# Define the target table
# Replace 'TargetTable' with the name of the table you want to append to.
target_table = 'TargetTable'

# Get the field names from the target table
field_names = [field.name for field in arcpy.ListFields(target_table)]

# Filter the DataFrame to include only the fields that match the target table's fields
df = df[field_names]

# Convert the DataFrame to an ArcPy table (numpy recarray)
# Specify the field names in the order you want them to be inserted into the table.
data_to_insert = df.to_records(index=False)

# Use an InsertCursor to append records to the table
# Note that the 'data_to_insert' array should have the same field order as the target table.
with arcpy.da.InsertCursor(target_table, field_names) as cursor:
    for row in data_to_insert:
        cursor.insertRow(row)

In [28]:
#test_median = calculate_median_value(clean_income_data,'variable_name', 'variable_code', 'value', 'variable_category', 'Income')
grouping_variables = ['state', 'county', 'tract', 'block group', 'year_sample', 'sample_level', 'dataset', 'census_geom_year', 'TRPAID', 'variable_category', 'variable_code', 'variable_name']

median_income = calculate_median_value(acs_income_data, 'variable_name', 'variable_code', 'value', 'variable_category', 'Household Income', grouping_variables)

median_income.to_excel('blockgroup_median.xlsx')

10000.0
200000.0
$75,000 to $99,999
865
11784.0
11934.0
23613.0
22.5
0.25
93749.25


In [2]:
def calculate_weighted_median(group):
    total_count = group['value'].sum()
    group['cumulative_count'] = group['value'].cumsum()
    median_index = total_count / 2
    
    # Find the row that contains the median value
    median_row = group[group['cumulative_count'] >= median_index].iloc[0]
    lower, upper = median_row['Lower'], median_row['Upper']
    
    # Calculate the median considering the weights of bins
    median = lower + (upper - lower) * (median_index - median_row['cumulative_count'] + median_row['value']) / median_row['value']
    print(median)
    return median
count_column = 'value'
sort_column = 'variable_code'
bin_column = 'variable_name'

#Create a copy of the dataframe because we don't want to modify the original
summary_df = acs_income_data.copy()
#Make sure that the value is an integer
summary_df[count_column]=summary_df[count_column].astype(int)
#This part needs to be modified for smaller geometries
#Not really relevant if we're going for the smallest geometry - should only be 1 row per category and geometry
#summary_df = summary_df.groupby(grouping_variables)[count_column].sum()
#summary_df = summary_df.reset_index()
#Still need this to get rid of 0s
summary_df = summary_df.loc[summary_df[count_column]>=0]


#summary_df= summary_df.loc[summary_df[category_field]==category]
# Sort the DataFrame based on the variable name column - need to add this back into new groupings
summary_df.sort_values(by=sort_column, inplace=True)
summary_df = summary_df.reset_index()

# Extract lower and upper limits from bin categories
pattern = r'(\d+[\d,]*)'
summary_df['temp'] = summary_df[bin_column].str.replace(',', '').str.findall(pattern)
summary_df[['Lower', 'Upper']] = summary_df['temp'].apply(lambda x: pd.Series(x[:2]) if len(x) == 2 else pd.Series([None, None]))
summary_df['Lower'] = summary_df['Lower'].astype(float)
summary_df['Upper'] = summary_df['Upper'].astype(float)
# Handle first category

lower_category = summary_df['variable_code'].iloc[0]
first_upper = float(summary_df['temp'].iloc[0][0])
#convert string to float
print(first_upper)
summary_df.loc[summary_df['variable_code']==lower_category, 'Lower'] = 0  # Set lower value to 0
summary_df.loc[summary_df['variable_code']==lower_category, 'Upper'] = first_upper

# Handle last category

last_lower = float(summary_df['temp'].iloc[-1][0])
upper_category = summary_df['variable_code'].iloc[-1]
print(upper_category)
print(last_lower)
summary_df.loc[summary_df['variable_code']==upper_category, 'Lower'] = last_lower
summary_df.loc[summary_df['variable_code']==upper_category, 'Upper'] = np.inf  # Set upper value to infinity
#This seems extranous
summary_df[count_column]= summary_df[count_column].astype(float)   
# Calculate cumulative count
#This is where we need to group by everything except for the variable code/name
grouping_columns = ['GEO_ID', 'state', 'county', 'tract', 'block group', 'variable_category', 'year_sample', 'sample_level', 'dataset', 'census_geom_year', 'TRPAID']

result = summary_df.groupby(grouping_columns, as_index = False).apply(calculate_weighted_median).reset_index(name='weighted_median')
result

result.to_excel('bg_median_income.xlsx')

NameError: name 'acs_income_data' is not defined

In [19]:
#Start writing some code to create classifications for income and age
#This will work as a template for all of these groupings. We can then reselct down to only the relevant ones
def categorize_values(census_df, category_csv, category_column):
    categories = pd.read_csv(category_csv)
    census_df['value'] = census_df['value'].astype(float)
    joined_data = census_df.merge(categories, on = 'variable_code', how = 'left')
    #This will get rid of any extra columns in the category_csv
    group_columns = [column for column in census_df if column not in ['value', 'variable_code', 'variable_name', 'MarginOfError']]
    group_columns.append(category_column)
    
    grouped_data = joined_data.groupby(group_columns, as_index=False)['value'].sum()    
    #Need to return this formatted for appending to the table - need to get locations of variable_code and variable name, 
    #add them in as columns in those locations and then populate them with category column nanme
    var_code_col_location = census_df.columns.get_loc('variable_code')
    var_name_col_location = census_df.columns.get_loc('variable_name')
    var_moe_col_location = census_df.columns.get_loc('MarginOfError')
    grouped_data.insert(var_moe_col_location, 'MarginOfError', '')
    grouped_data.insert(var_code_col_location, 'variable_code','Grouped Value')
    grouped_data.insert(var_name_col_location, 'variable_name','')
    grouped_data['variable_code'] = 'Grouped Value'
    grouped_data['variable_name'] = grouped_data[category_column]
    columns_to_keep = census_df.columns
    grouped_data= grouped_data[columns_to_keep]
    return grouped_data


acs_age_categories = pd.read_csv('acs_age_categories.csv')

acs_age_data = demographic_values_2020.loc[(demographic_values_2020['variable_category']=='Age') & (demographic_values_2020['dataset']=='acs/acs5')]

#add to function
acs_age_data['variable_code'] = acs_age_data['variable_code'].str.split(',').str[0]
acs_age_data['value'] = acs_age_data['value'].astype(float)

acs_age_data_joined= acs_age_data.merge(acs_age_categories, left_on='variable_code', right_on='variable_code', how='left')

#add to function
acs_age_data_joined['value'] = acs_age_data_joined['value'].astype(float)
group_columns = [column for column in df.columns if column != 'ExcludeColumn']


broad_age_groups = acs_age_data_joined.groupby(['state','county', 'tract', 'year_sample', 'Broad Category'], as_index=False)['value'].sum()
census_age_groups = acs_age_data_joined.groupby(['state','county', 'tract', 'year_sample', 'Census Category'], as_index=False)['value'].sum()

test_broad_age_groups = categorize_values(acs_age_data,'acs_age_categories.csv','Broad Category')

NameError: name 'demographic_values_2020' is not defined

In [None]:
dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
}


demographic_variables_2020_bg = pd.read_csv("Census_Variable_Lists\census_variables_acs_tract_2020.csv",dtype=dtypes)

#Loop through this?
demographic_values_2020_bg=pd.DataFrame()
for index, row in demographic_variables_2020_bg.iterrows():
    print(index)
    df = get_variable_data(row['Year'], row['Datasource Name'],'block group',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    
    demographic_values_2020_bg = create_or_append_df(demographic_values_2020_bg, df)

In [None]:
demographic_variables_vehicles = pd.read_csv("demographic_variables_Vehicles.csv")

In [None]:
demographic_values_vehicles=pd.DataFrame()
for index, row in demographic_variables_vehicles.iterrows():
    df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    demographic_values_vehicles = create_or_append_df(demographic_values_vehicles, df)

In [None]:
demographic_values_vehicles.to_excel("Vehicle_Stats.xlsx")

In [None]:
demographic_variables_2000 = pd.read_csv("demographic_variables_2000.csv")

In [None]:
demographic_values_2000=pd.DataFrame()
for index, row in demographic_variables_2000.iterrows():
    df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    demographic_values_2000 = create_or_append_df(demographic_values_2000, df)

In [None]:
demographic_values_2000.to_excel("Comparison.xlsx")

In [None]:
demographic_variables_2010 = pd.read_csv("demographic_variables_2010.csv")
demographic_values_2010=pd.DataFrame()
for index, row in demographic_variables_2010.iterrows():
    df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    demographic_values_2010 = create_or_append_df(demographic_values_2010, df)

In [None]:
demographic_values_2010.to_excel("comparison_2010.xlsx")

In [None]:
dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
}


demographic_variables_2020 = pd.read_csv("demographic_variables_2020.csv",dtype=dtypes)
demographic_values_2020=pd.DataFrame()
for index, row in demographic_variables_2020.iterrows():
    df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    demographic_values_2020 = create_or_append_df(demographic_values_2020, df)

In [None]:
demographic_values_2020.to_excel("demographic_variables_2020-2021.xlsx")

In [None]:
dtypes = {
    'Variable' : str,
    'Code': str,
    'Category': str,
    'Datasource': str,
    'CodeNumber':str,
    'Year':str,
    'census_geom_year':str,
    'GeometryLevel':str
}


demographic_variables_2020_bg = pd.read_csv("demographic_variables_acs_bg.csv",dtype=dtypes)


demographic_values_2020_bg=pd.DataFrame()
for index, row in demographic_variables_2020_bg.iterrows():
    print(index)
    df = get_variable_data(row['Year'], row['Datasource Name'],'block group',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    demographic_values_2020_bg = create_or_append_df(demographic_values_2020_bg, df)

In [None]:
tahoe_geometry.to_excel("tg_test.xlsx")
demographic_values_2020_bg.to_excel("demo_test.xlsx")

In [None]:
print(len(demographic_values_2020_bg))

In [None]:
#Start writing some code to create classifications for income and age
#This will work as a template for all of these groupings. We can then reselct down to only the relevant ones
def categorize_values(census_df, category_csv, category_column):
    categories = pd.read_csv(category_csv)
    joined_data = census_df.merge(categories, on = 'variable_code', how = 'left')
    #This will get rid of any extra columns in the category_csv
    group_columns = [column for column in census_df if column not in ['value', 'variable_code', 'variable_name', 'MarginOfError']]
    group_columns.append(category_column)
    grouped_data = joined_data.groupby(group_columns, as_index=False)['value'].sum()    
    #Need to return this formatted for appending to the table - need to get locations of variable_code and variable name, 
    #add them in as columns in those locations and then populate them with category column nanme
    var_code_col_location = census_df.columns.get_loc('variable_code')
    var_name_col_location = census_df.columns.get_loc('variable_name')
    var_moe_col_location = census_df.columns.get_loc('MarginOfError')
    grouped_data.insert(var_moe_col_location, 'MarginOfError', '')
    grouped_data.insert(var_code_col_location, 'variable_code','Grouped Value')
    grouped_data.insert(var_name_col_location, 'variable_name','')
    grouped_data['variable_code'] = 'Grouped Value'
    grouped_data['variable_name'] = grouped_data[category_column]
    columns_to_keep = census_df.columns
    grouped_data= grouped_data[columns_to_keep]
    return grouped_data


acs_age_categories = pd.read_csv('acs_age_categories.csv')

acs_age_data = demographic_values_2020.loc[(demographic_values_2020['variable_category']=='Age') & (demographic_values_2020['dataset']=='acs/acs5')]

#add to function
acs_age_data['variable_code'] = acs_age_data['variable_code'].str.split(',').str[0]
acs_age_data['value'] = acs_age_data['value'].astype(float)

acs_age_data_joined= acs_age_data.merge(acs_age_categories, left_on='variable_code', right_on='variable_code', how='left')

#add to function
acs_age_data_joined['value'] = acs_age_data_joined['value'].astype(float)
group_columns = [column for column in df.columns if column != 'ExcludeColumn']


broad_age_groups = acs_age_data_joined.groupby(['state','county', 'tract', 'year_sample', 'Broad Category'], as_index=False)['value'].sum()
census_age_groups = acs_age_data_joined.groupby(['state','county', 'tract', 'year_sample', 'Census Category'], as_index=False)['value'].sum()

test_broad_age_groups = categorize_values(acs_age_data,'acs_age_categories.csv','Broad Category')


In [None]:
all_demographic_values = demographic_values_2000.append(demographic_values_2010, ignore_index= True)
all_demographic_values = all_demographic_values.append(demographic_values_2020, ignore_index = True)

In [None]:
all_demographic_values.to_excel('demographic_values_total.xlsx')

In [None]:
#Get rid of weird -6666 values
#Group by the variables 
#demographic_values_2020.head()
clean_income_data = demographic_values_2020.copy()
clean_income_data['value']=clean_income_data['value'].astype(int)
clean_income_data=clean_income_data.loc[clean_income_data['county']!='510']
clean_income_data = clean_income_data.groupby(['variable_code', 'variable_name', 'variable_category'])["value"].sum()
clean_income_data = clean_income_data.reset_index()
clean_income_data = clean_income_data.loc[clean_income_data['value']>=0]

#print(clean_income_data.head())
test_median = calculate_median_value(clean_income_data,'variable_name', 'variable_code', 'value', 'variable_category', 'Income')
#print(test_median.head())
test_median.to_excel("median_income.xlsx")

In [None]:
group_variables = ['variable_code', 'variable_name', 'variable_category']

test_median = calculate_median_value(clean_income_data,'variable_name', 'variable_code', 'value', 'variable_category', 'Income', group_variables)

In [None]:
test_homevalues = calculate_median_value(demographic_values_2020,'variable_name', 'variable_code', 'value', 'variable_category', 'Home Value', group_variables)
test_homevalues.to_excel('homevalue.xlsx')

In [None]:
demographic_values_2020.to_excel("2020_raw_check.xlsx")

In [None]:
#Put in code to run the median calculating function
        # Create a new DataFrame to avoid modifying the original one
summary_df = fd.copy()
summary_df= summary_df.loc[summary_df[category_field]==category]
# Sort the DataFrame based on the variable name column
summary_df.sort_values(by=sort_column, inplace=True)


# Extract lower and upper limits from bin categories
pattern = r'(\d+[\d,]*)'
summary_df['temp'] = summary_df[bin_column].str.replace(',', '').str.findall(pattern)
summary_df[['Lower', 'Upper']] = summary_df['temp'].apply(lambda x: pd.Series(x[:2]) if len(x) == 2 else pd.Series([None, None]))
summary_df['Lower'] = summary_df['Lower'].astype(float)
summary_df['Upper'] = summary_df['Upper'].astype(float)
# Handle first category

first_upper = summary_df['temp'].iloc[0]
print(first_upper)
summary_df.loc[0, 'Lower'] = 0  # Set lower value to 0
summary_df.loc[0, 'Upper'] = first_upper

# Handle last category

last_lower = summary_df['temp'].iloc[-1]

summary_df.loc[summary_df.index[-1], 'Lower'] = last_lower
summary_df.loc[summary_df.index[-1], 'Upper'] = np.inf  # Set upper value to infinity
summary_df[count_column]= summary_df[count_column].astype(float)   
# Calculate cumulative count
summary_df['Cumulative_Count'] = summary_df[count_column].cumsum()

# Calculate total count
total_count = summary_df[count_column].sum()

# Find the bin where the cumulative count exceeds or equals half of the total count
median_bin = summary_df.loc[summary_df['Cumulative_Count'] >= total_count / 2, bin_column].iloc[0]
print('m')
# Estimate the median value
previous_cumulative_count = summary_df.loc[summary_df[bin_column] < median_bin, 'Cumulative_Count'].max()
previous_lower = summary_df.loc[summary_df['Cumulative_Count'] == previous_cumulative_count, 'Lower'].iloc[0]
previous_upper = summary_df.loc[summary_df['Cumulative_Count'] == previous_cumulative_count, 'Upper'].iloc[0]

next_cumulative_count = summary_df.loc[summary_df[bin_column] > median_bin, 'Cumulative_Count'].min()
next_lower = summary_df.loc[summary_df['Cumulative_Count'] == next_cumulative_count, 'Lower'].iloc[0]
next_upper = summary_df.loc[summary_df['Cumulative_Count'] == next_cumulative_count, 'Upper'].iloc[0]

# Interpolate the median value within the bin range
cumulative_diff = total_count / 2 - previous_cumulative_count
interpolation_ratio = cumulative_diff / (next_cumulative_count - previous_cumulative_count)

median_value = previous_upper + interpolation_ratio * (next_lower - previous_upper)

# Add the estimated median value to the new DataFrame
summary_df['Estimated_Median'] = median_value



In [None]:




acs_variables_2011=demographic_variables.loc[(demographic_variables['Datasource Name']=='acs/acs5')&(demographic_variables['Year']=='2011-2021' )&(demographic_variables['Added']!='Loaded')].reset_index()
acs_variables_2010=demographic_variables.loc[(demographic_variables['Datasource Name']=='acs/acs5')&(demographic_variables['Year']=='2010-2021')&(demographic_variables['Added']!='Loaded')].reset_index()
dec_variables=demographic_variables.loc[(demographic_variables['Datasource Name']=='dec/sf1')].reset_index()

df_census_data=pd.DataFrame()
year_range_2011 = [str(num) for num in range(2011, 2022)]
year_range_2010 = [str(num) for num in range(2010, 2022)]

for index, row in acs_variables_2011.iterrows():
    df = load_variable_multiple_year(year_range_2011, row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, '', tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    df_census_data = create_or_append_df(df_census_data, df)







In [None]:
for index, row in acs_variables_2010.iterrows():
    
    df = load_variable_multiple_year(year_range_2010, row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, '', tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    df_census_data = create_or_append_df(df_census_data, df)

In [None]:
df_census_data.to_excel('acs_data_additional.xlsx')


In [None]:
#Test out census blocks

In [None]:

#Load 2011-2019 Variables - this takes a long time
variables=pd.read_csv("demographic_variables.csv")
acs_variables=variables.loc[(variables['Datasource Name']=='acs/acs5')]
acs_variables=acs_variables.reset_index()
df_test_acs=pd.DataFrame()
year_range = [str(num) for num in range(2011, 2020)]

for index, row in acs_variables.iterrows():
    df = load_variable_multiple_year(year_range_acs, row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, '2010', tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    if df_test_acs is None:
        df_test_acs=df
    else:
        df_test_acs=pd.concat([df_test_acs, df])
df_test_acs.to_csv('acs_demographic_data_2020.csv')

In [None]:
#Load 2010 Variables
variables=pd.read_csv("demographic_variables.csv")
dec_variables=variables.loc[(variables['Datasource Name']=='dec/sf1')&(variables['Year']==2000)]
dec_variables=dec_variables.reset_index()
df_test_dec=pd.DataFrame()
year_range_dec = ['2000']
for index, row in dec_variables.iterrows():
    df = load_variable_multiple_year(year_range_dec, row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, '2000', tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    if df_test_dec is None:
        df_test_dec=df
    else:
        df_test_dec=pd.concat([df_test_dec, df])
df_test_dec.to_csv('dec_demographic_data_2000.csv')

In [None]:
variable_name = 'Work From Home'
variable_code = 'B99084_005E'
variable_datasource = 'acs/acs5'
variable_year_range=[ '2011','2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020','2021']
census_geom_year = '2010'
variable_category = 'Employment'
df_work_from_home=pd.DataFrame()
for year in variable_year_range:
    df=get_non_tahoe_data(year,variable_datasource,variable_code,variable_name,census_api_key,'2010',variable_category)
    df_work_from_home = create_or_append_df(df_work_from_home,df)
variable_name = 'Did Not Work From Home'
variable_code = 'B99084_002E'
for year in variable_year_range:
    df=get_non_tahoe_data(year,variable_datasource,variable_code,variable_name,census_api_key,'2010',variable_category)
    df_work_from_home = create_or_append_df(df_work_from_home,df)


In [None]:
df_work_from_home.to_excel('regional_work_from_home.xlsx')

In [None]:
#Load 2020 variables - currently using the redistricting data because they haven't released the full group yet
variables=pd.read_csv("demographic_variables.csv")
dec_variables=variables.loc[(variables['Datasource Name']=='dec/pl')&(variables['Year']==2020)]
dec_variables=dec_variables.reset_index()
df_test_dec=pd.DataFrame()
year_range_dec = ['2020']
for index, row in dec_variables.iterrows():
    df = load_variable_multiple_year(year_range_dec, row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, '2020', tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    if df_test_dec is None:
        df_test_dec=df
    else:
        df_test_dec=pd.concat([df_test_dec, df])
df_test_dec.to_csv('dec_demographic_data_2020.csv')

In [None]:
#Load 2010 variables
variables=pd.read_csv("demographic_variables.csv")
dec_variables=variables.loc[(variables['Datasource Name']=='dec/pl')&(variables['Year']==2010)]
dec_variables=dec_variables.reset_index()
df_test_dec=pd.DataFrame()
year_range_dec = ['2010']
for index, row in dec_variables.iterrows():
    df = load_variable_multiple_year(year_range_dec, row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, '2010', tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    if df_test_dec is None:
        df_test_dec=df
    else:
        df_test_dec=pd.concat([df_test_dec, df])
df_test_dec.to_csv('dec_demographic_data_2010.csv')

In [None]:
#Template for manual download
variable_name = 'Population'
variable_code = 'B08134_002E'
variable_geometry_level = 'tract'
variable_datasource = 'acs/acs5'
variable_year_range=[ '2011','2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']
census_geom_year = '2010'
variable_category = 'Transportation'
df_variable=pd.DataFrame()
for year in variable_year_range:
    df = get_variable_data(year,variable_datasource,variable_geometry_level,variable_code,variable_name, census_api_key, census_geom_year, tahoe_geometry, variable_category )
    if df_variable.empty:
        df_variable=df
    else:
        df_variable=pd.concat([df_variable, df])
print(df_variable.head())

df_variable.to_csv('acs_travel_time.csv')

In [None]:
variable_name = 'Housing Units'
variable_code = 'H001001'
variable_geometry_level = 'tract'
variable_datasource = 'dec/sf1'
variable_year_range=['2010']
census_geom_year = '2010'

df_variable=pd.DataFrame()
for year in variable_year_range:
    df = get_variable_data(year,variable_datasource,variable_geometry_level,variable_code,variable_name, census_api_key, census_geom_year, tahoe_geometry )
    if df_variable.empty:
        df_variable=df
    else:
        df_variable=pd.concat([df_variable, df])
print(df_variable.head())

In [None]:
variable_name = 'Population'
variable_code = 'P008001'
variable_geometry_level = 'tract'
variable_datasource = 'dec/sf1'
variable_year_range=['2010']
census_geom_year = '2010'

df_variable=pd.DataFrame()
for year in variable_year_range:
    df = get_variable_data(year,variable_datasource,variable_geometry_level,variable_code,variable_name, census_api_key, census_geom_year )
    if df_variable.empty:
        df_variable=df
    else:
        df_variable=pd.concat([df_variable, df])

df_variable['GEO_ID']=df_variable['GEO_ID'].str.split('US').str[1]
df_variable['Geom_Code']=df_variable['GEO_ID']+df_variable['census_geom_year'].astype(str)

tahoe_variable = df_variable[df_variable['Geom_Code'].isin(tahoe_geometry['GeomCode'])]

print(tahoe_variable.head())

tahoe_variable.to_csv('population_numbers.csv')



In [None]:
variable_name = 'Median Income'
variable_code = 'B06011_001E'
variable_geometry_level = 'tract'
variable_datasource = 'acs/acs5'
variable_year_range=['2020','2021']
census_geom_year = '2010'
year_range = [str(num) for num in range(2011, 2020)]
df_variable=pd.DataFrame()
for year in year_range:
    df = get_variable_data(year,variable_datasource,variable_geometry_level,variable_code,variable_name, census_api_key, census_geom_year, tahoe_geometry,'Income' )
    if df_variable.empty:
        df_variable=df
    else:
        df_variable=pd.concat([df_variable, df])

df_variable.to_csv('MedianIncome_Tract_2011.csv')
print(df_variable.head())

In [None]:
#Load 2011-2019 Variables - this takes a long time
variables=pd.read_csv("demographic_variables.csv")
acs_variables=variables.loc[(variables['Datasource Name']=='acs/acs5')&(variables['Category']=='Employment')]
acs_variables=acs_variables.reset_index()
df_test_acs=pd.DataFrame()
year_range = [str(num) for num in range(2011, 2020)]
year_range_2020 = ['2020','2021']

for index, row in acs_variables.iterrows():
    df = load_variable_multiple_year(year_range_2020, row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, '2020', tahoe_geometry, row['Category'])
    #df = get_variable_data(row['Year'], row['Datasource Name'],'tract',row['CodeNumber'],row['Variable'], census_api_key, row['census_geom_year'], tahoe_geometry )
    if df_test_acs is None:
        df_test_acs=df
    else:
        df_test_acs=pd.concat([df_test_acs, df])
df_test_acs.to_excel('acs_employment_data_2020.xlsx')



In [None]:
df_test_acs.to_excel('acs_employment_data_2011.xlsx')

# Group Data to categories

## Group Age data

In [32]:
conn = pyodbc.connect(conn_str)

query = "Select * from sde.Census_Demographics where variable_category = 'age' and dataset = 'acs/acs5'"
# Execute the query and fetch the results
age_data = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()







### Group to Census Categories

In [33]:
age_categories = 'acs_age_categories.csv'
grouped_acs_age_data = categorize_values(age_data, age_categories ,'Census Category with Sex', 'TRPA Census Age Sex Categories Grouped: ')

['GEO_ID', 'state', 'county', 'tract', 'block_group', 'variable_category', 'year_sample', 'sample_level', 'dataset', 'census_geom_year', 'TRPAID', 'NEIGHBORHOOD', 'Data_Scale', 'Census Category with Sex']


In [34]:
grouped_acs_age_data.to_excel('Census_Data_Downloads\grouped_acs_age__sex_census.xlsx')

In [39]:
conn = pyodbc.connect(conn_str)

query = "Select * from sde.Census_Demographics where variable_category = 'age' and dataset = 'dec/dhc'"
# Execute the query and fetch the results
age_data = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()




In [40]:
age_categories = 'dec_age_categories_2020.csv'
grouped_dec_age_data_sex = categorize_values(age_data, age_categories ,'Census Category with Sex', 'TRPA Census Age Sex Categories Grouped: ')
grouped_dec_age_data = categorize_values(age_data, age_categories ,'Census Category', 'TRPA Census Age Categories Grouped: ')

['GEO_ID', 'state', 'county', 'tract', 'block_group', 'variable_category', 'year_sample', 'sample_level', 'dataset', 'census_geom_year', 'TRPAID', 'NEIGHBORHOOD', 'Data_Scale', 'Census Category with Sex']
['GEO_ID', 'state', 'county', 'tract', 'block_group', 'variable_category', 'year_sample', 'sample_level', 'dataset', 'census_geom_year', 'TRPAID', 'NEIGHBORHOOD', 'Data_Scale', 'Census Category']


In [41]:
grouped_dec_age_data_sex.to_excel('Census_Data_Downloads\grouped_dec_age_data_sex.xlsx')
grouped_dec_age_data.to_excel('Census_Data_Downloads\grouped_dec_age_data.xlsx')

## Group Income Data

In [None]:
conn = pyodbc.connect(conn_str)

query = "Select * from sde.Census_Demographics where variable_category = 'age'"
# Execute the query and fetch the results
age_data = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()

## Group 1990 Data

In [6]:
conn = pyodbc.connect(conn_str)

query = "Select * from sde.Census_Demographics where variable_category = 'age' and year_sample = 1990"
# Execute the query and fetch the results
age_data_1990 = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()



In [8]:
age_categories = 'age_categories_1990.csv'
grouped_1990_age_data = categorize_values(age_data_1990, age_categories ,'Census Category', 'TRPA Census Age Categories Grouped: ')

['GEO_ID', 'state', 'county', 'tract', 'block_group', 'variable_category', 'year_sample', 'sample_level', 'dataset', 'census_geom_year', 'TRPAID', 'NEIGHBORHOOD', 'Data_Scale', 'Census Category']


In [9]:
grouped_1990_age_data.to_excel('grouped_1990_age_data.xlsx')

# Generate Basin Estimates

## Basinwide Sums

### Population

In [5]:
conn = pyodbc.connect(conn_str)

query = "Select * from sde.Census_Demographics where variable_name = 'Population Total' or variable_name = 'Total Population'"
# Execute the query and fetch the results
population_data = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()



In [12]:
filtered_population_data = population_data.loc[(population_data['county']!='510')&(population_data['tract']!='990000')]
summarized_population_data = filtered_population_data.groupby(['variable_code', 'year_sample'], as_index=False).sum(['value'])


In [11]:
filtered_population_data.to_excel('filtered_population_data.xlsx')

## Basinwide Medians

### Income

In [10]:
conn = pyodbc.connect(conn_str)

query = "Select * from sde.Census_Demographics where variable_category = 'Household Income'"
# Execute the query and fetch the results
Income_data = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()



In [17]:
grouping_variables = ['variable_category', 'year_sample', 'dataset', 'variable_code', 'variable_name']

clean_income_data_1990 = Income_data.loc[(Income_data['county']!='510') & (Income_data['year_sample']==1990)]
bin_column= 'variable_name'
sort_column = 'variable_code'
count_column = 'value'
category_field = 'variable_category'
category = 'Household Income'

median_income_basin = calculate_median_value(clean_income_data_1990, bin_column, sort_column, count_column, category_field, category,grouping_variables)

5000.0
150000.0
$30,000 to $32,499
10
10233.0
12060.0
21475.0
504.5
0.403277378097522
31007.596722621904
