In [1]:
import pandas as pd
import os
import arcpy
from utils import *

# Get Data

In [2]:
# get parcels from the database
# network path to connection files
filePath = "F:/GIS/PARCELUPDATE/Workspace/"
# database file path 
sdeBase    = os.path.join(filePath, "Vector.sde")
sdeCollect = os.path.join(filePath, "Collection.sde")
sdeTabular = os.path.join(filePath, "Tabular.sde")

arcpy.env.workspace = 'memory'
# # clear memory workspace
# arcpy.management.Delete('memory')

# overwrite true
arcpy.env.overwriteOutput = True
# Set spatial reference to NAD 1983 UTM Zone 10N
sr = arcpy.SpatialReference(26910)

We need to get data for parcels, join census assign to geographies

In [3]:
engine = get_conn('sde')
with engine.begin() as conn:
    df_census = pd.read_sql("SELECT * FROM SDE.Census_Demographics", conn)

In [4]:
def summarize_census_data(csv_path):
    variables_demographics = pd.read_csv(csv_path)
    # filter df_census to only include variables in the variables_demographics list joined on sample_year, sample_level and variable_name
    df_census_demographics = df_census.merge(variables_demographics, how='inner', left_on=['year_sample', 'sample_level', 'variable_code'], right_on=['year', 'geography', 'variable_code'])
    # group df_census_demographics by sample_year, tract, and variable_name and sum the values
    df_census_demographics_grouped = df_census_demographics.groupby(['year_sample', 'tract','state', 'Description',
            'county', 'variable_name', 'variable_code', 'census_geom_year_x'])['value'].sum().reset_index()
    df_census_demographics_grouped['year_sample'] = df_census_demographics_grouped['year_sample'].astype(int).astype(str)
    df_census_demographics_grouped['census_geom_year_x'] = df_census_demographics_grouped['census_geom_year_x'].astype(int).astype(str)
    df_census_demographics_grouped['trpa_id'] = df_census_demographics_grouped['state'] + df_census_demographics_grouped['county'] + df_census_demographics_grouped['tract']+df_census_demographics_grouped['census_geom_year_x']
    #Rename census_geom_year_x to census_geom_year
    df_census_demographics_grouped.rename(columns={'census_geom_year_x': 'census_geom_year'}, inplace=True)
    return df_census_demographics_grouped

In [8]:
chas_variables = pd.read_csv(r'C:\Users\amcclary\Documents\GitHub\Housing\Scripts\Lookup_Lists\chas_variables.csv')

In [26]:
tahoe_census_tracts = pd.read_csv(r'C:\Users\amcclary\Documents\GitHub\Housing\Scripts\Lookup_Lists\tahoe_census_tracts.csv')
tahoe_census_tracts['geoid'] = tahoe_census_tracts['GEO_ID'].astype(str).str.zfill(11)

In [10]:
import chardet

with open(r'C:\Users\amcclary\Downloads\2017thru2021-140-csv\140\Table1.csv', 'rb') as f:
    result = chardet.detect(f.read())
    print(result)  # Outputs the encoding

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [28]:
chas_data_1 = pd.read_csv(r'C:\Users\amcclary\Downloads\2017thru2021-140-csv\140\Table1.csv', encoding='ISO-8859-1')

In [29]:
chas_data_1['state'] = chas_data_1['st'].astype(str).str.zfill(2)
chas_data_1['county'] = chas_data_1['cnty'].astype(str).str.zfill(3)
# Drop rows with a missing value
# drop everything in the geoid column to the left of "US"
chas_data_1['geoid'] = chas_data_1['geoid'].str.split('US').str[1]
chas_data_tahoe = chas_data_1[chas_data_1['geoid'].isin(tahoe_census_tracts['geoid'])]

In [None]:
# Basically just need to clean geoid to match our list of geoids and then filter it down

In [15]:
county_states ={
        '06': ['017','061'],
        '32': ['005', '031']}
# Filter chas_data_1 to only include data for the specified counties
chas_data_local = pd.DataFrame()
for state, counties in county_states.items():
    if chas_data_local.empty:
        chas_data_local = chas_data_1[(chas_data_1['state'] == state) & (chas_data_1['county'].isin(counties))]
    else:
        chas_data_local = pd.concat([chas_data_local, chas_data_1[(chas_data_1['state'] == state) & (chas_data_1['county'].isin(counties))]])
    

In [30]:
# get a list of all columns that start with T1
t1_columns = [col for col in chas_data_tahoe.columns if col.startswith('T')]

In [31]:
chas_data_melted = chas_data_tahoe.melt(id_vars=['state', 'county', 'geoid'], value_vars=t1_columns, var_name='variable_code', value_name='value')

In [None]:
chas_data_1 = pd.read_csv(r'C:\Users\amcclary\Downloads\2017thru2021-140-csv\140\Table1.csv', encoding='ISO-8859-1')
chas_data_1['state'] = chas_data_1['st'].astype(str).str.zfill(2)
chas_data_1['county'] = chas_data_1['cnty'].astype(str).str.zfill(3)
chas_data_1['geoid'] = chas_data_1['geoid'].str.split('US').str[1]
chas_data_tahoe = chas_data_1[chas_data_1['geoid'].isin(tahoe_census_tracts['geoid'])]
t1_columns = [col for col in chas_data_tahoe.columns if col.startswith('T')]
chas_data_melted = chas_data_tahoe.melt(id_vars=['state', 'county', 'geoid'], value_vars=t1_columns, var_name='variable_code', value_name='value')
chas_variables.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)
chas_data_local_merged = chas_data_melted.merge(chas_variables, how='inner', on='variable_code')

In [34]:
import os
import pandas as pd

# Define the function to process each CSV
def process_chas_data(csv_path, tahoe_census_tracts, chas_variables):
    # Load CSV data
    chas_data_1 = pd.read_csv(csv_path, encoding='ISO-8859-1')

    # Format state, county, and geoid columns
    chas_data_1['state'] = chas_data_1['st'].astype(str).str.zfill(2)
    chas_data_1['county'] = chas_data_1['cnty'].astype(str).str.zfill(3)
    chas_data_1['geoid'] = chas_data_1['geoid'].str.split('US').str[1]

    # Filter for Tahoe Basin data (assuming tahoe_census_tracts is a DataFrame with 'geoid' column)
    chas_data_tahoe = chas_data_1[chas_data_1['geoid'].isin(tahoe_census_tracts['geoid'].tolist())]

    # Select columns that start with 'T'
    t1_columns = [col for col in chas_data_tahoe.columns if col.startswith('T')]
        # Rename columns in chas_variables
    chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)
    # Melt the data (long format)
    chas_data_melted = chas_data_tahoe.melt(id_vars=['state', 'county', 'geoid'], value_vars=t1_columns, var_name='variable_code', value_name='value')



    # Merge the melted data with the variable names
    chas_data_local_merged = chas_data_melted.merge(chas_variables, how='inner', on='variable_code')

    return chas_data_local_merged


# Function to process all CSV files in a directory and combine into a single DataFrame
def process_all_csv_in_directory(directory_path, tahoe_census_tracts, chas_variables):
    all_data = []  # List to store individual DataFrames

    # Loop over all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            csv_path = os.path.join(directory_path, filename)

            # Process each CSV file
            print(f"Processing {filename}...")
            processed_data = process_chas_data(csv_path, tahoe_census_tracts, chas_variables)
            
            # Append to the list
            all_data.append(processed_data)

    # Concatenate all data into a single DataFrame
    final_data = pd.concat(all_data, ignore_index=True)

    return final_data


# Example usage
directory_path = r'C:\Users\amcclary\Downloads\2017thru2021-140-csv\140'  # Update with your directory path
# Ensure tahoe_census_tracts and chas_variables are defined before calling
# final_dataframe = process_all_csv_in_directory(directory_path, tahoe_census_tracts, chas_variables)

# Display first few rows of the resulting combined dataframe
# print(final_dataframe.head())

all_hud_data = process_all_csv_in_directory(directory_path, tahoe_census_tracts, chas_variables)


Processing Table1.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table10.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table11.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table12.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table13.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table14A.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table14B.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table15A.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table15B.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table15C.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table16.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table17A.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table17B.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table18A.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table18B.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table18C.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table2.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table3.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table4.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table5.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table7.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table8.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


Processing Table9.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chas_data_tahoe.rename(columns={'Column/Variable Name': 'variable_code'}, inplace=True)


In [35]:
all_hud_data['TRPAID']= all_hud_data['geoid']+'2020'

In [38]:
all_hud_data.to_csv(r'C:\Users\amcclary\Documents\GitHub\Housing\Scripts\Dowloaded_Data\all_hud_data.csv', index=False)

In [19]:
chas_data_local_merged = chas_data_melted.merge(chas_variables, how='inner', on='variable_code')

In [5]:
household_data = summarize_census_data('Lookup_Lists/housing_employment_census.csv')

In [6]:
household_data.to_csv('census_employment_data.csv', index=False)

In [7]:
variables_household = pd.read_csv('Lookup_Lists/demographic_variables_housing.csv')

In [15]:
variables_demographics = pd.read_csv('Lookup_Lists/demographic_variables_housing.csv')

In [16]:
# filter df_census to only include variables in the variables_demographics list joined on sample_year, sample_level and variable_name
df_census_demographics = df_census.merge(variables_demographics, how='inner', left_on=['year_sample', 'sample_level', 'variable_name'], right_on=['year', 'geography', 'Variable Name'])

In [17]:
# group df_census_demographics by sample_year, tract, and variable_name and sum the values
df_census_demographics_grouped = df_census_demographics.groupby(['year_sample', 'tract','state', 

                                                                 'county', 'variable_name', 'variable_code', 'census_geom_year_x'])['value'].sum().reset_index()

In [18]:
df_census_demographics_grouped['year_sample'] = df_census_demographics_grouped['year_sample'].astype(int).astype(str)
df_census_demographics_grouped['census_geom_year_x'] = df_census_demographics_grouped['census_geom_year_x'].astype(int).astype(str)
df_census_demographics_grouped['trpa_id'] = df_census_demographics_grouped['state'] + df_census_demographics_grouped['county'] + df_census_demographics_grouped['tract']+df_census_demographics_grouped['census_geom_year_x']
#Rename census_geom_year_x to census_geom_year
df_census_demographics_grouped.rename(columns={'census_geom_year_x': 'census_geom_year'}, inplace=True)
df_census_demographics_grouped.to_csv('Summarized_Data/Demographics_Data.csv', index=False)

# Employment Analysis

In [None]:
taz_employment_data = pd.read_csv('Lookup_Lists/employment_2022_data.csv')
tract_lookup = pd.read_csv('Lookup_Lists/TAZ_Tract_lookup.csv')

In [None]:
# Do some data cleanup on this and fill in blanks. What is happening with group quarters???

# Add group quarters by type to the data
#change geoid type to string


In [None]:
# parcel development layer polygons
parcel_db = sdeCollect + "\\SDE.Parcel\\SDE.Parcel_History_Attributed"
# query 2022 rows
sdf_units = pd.DataFrame.spatial.from_featureclass(parcel_db)
sdf_units = sdf_units.loc[sdf_units['YEAR'] == 2022]
sdf_units.spatial.sr = sr

In [None]:
census_variable_list = pd.read_csv('Lookup_Lists/census_variables.csv')
df_census_2022_include = df_census_2022.loc[df_census_2022['variable_code'].isin(census_variable_list['variable_code'])]
block_group_pivot = df_census_2022_include.pivot(index='block_group', columns='variable_code', values='value')


In [None]:
units_attributed = pd.merge(sdf_units, block_group_pivot, left_on='TRPAID', right_on='TRPAID', how='left')