In [17]:
import os
import pandas as pd
#This doesn't come with the standard esri virtual environment so it needs to be added to a cloned env
from dbfread import DBF

from arcgis.features import FeatureLayer


def read_all_dbf_files(folder_path):
    # Get a list of all DBF files in the folder
    dbf_files = [file for file in os.listdir(folder_path) if file.endswith('.dbf')]
    
    # Create a dictionary to store the DataFrames
    dataframes = {}
    
    # Loop through each DBF file and read it into a DataFrame
    for file in dbf_files:
        file_path = os.path.join(folder_path, file)
        df_name = os.path.splitext(file)[0]  # Use the file name (without extension) as DataFrame name
        
        try:
            # Use dbfread to read the DBF file into a DataFrame
            table = DBF(file_path)
            df = pd.DataFrame(iter(table))
            dataframes[df_name] = df
            print(f"Successfully read '{file}' into DataFrame '{df_name}'")
        except Exception as e:
            print(f"Error reading '{file}': {str(e)}")
    
    return dataframes

# Replace 'folder_path' with the path to the folder containing the DBF files
folder_path = 'C:/Users/amcclary/Documents/GitHub/Demographics/1990_Tables'
all_dataframes = read_all_dbf_files(folder_path)

Successfully read 'stf301ca.dbf' into DataFrame 'stf301ca'
Successfully read 'stf301nv.dbf' into DataFrame 'stf301nv'
Successfully read 'stf302ca.dbf' into DataFrame 'stf302ca'
Successfully read 'stf302nv.dbf' into DataFrame 'stf302nv'
Successfully read 'stf309ca.dbf' into DataFrame 'stf309ca'
Successfully read 'stf309nv.dbf' into DataFrame 'stf309nv'
Successfully read 'stf314ca.dbf' into DataFrame 'stf314ca'
Successfully read 'stf314nv.dbf' into DataFrame 'stf314nv'
Successfully read 'stf327ca.dbf' into DataFrame 'stf327ca'
Successfully read 'stf327nv.dbf' into DataFrame 'stf327nv'
Successfully read 'stf330ca.dbf' into DataFrame 'stf330ca'
Successfully read 'stf330nv.dbf' into DataFrame 'stf330nv'
Successfully read 'stf333ca.dbf' into DataFrame 'stf333ca'
Successfully read 'stf333nv.dbf' into DataFrame 'stf333nv'


# Combine the Nevada and California tables

In [18]:
def combine_dataframes_by_prefix(dataframes_dict, prefix_length):
    combined_dataframes = {}
    
    # Group DataFrames by the first x characters of their names
    groups = {}
    for name, df in dataframes_dict.items():
        prefix = name[:prefix_length]
        if prefix not in groups:
            groups[prefix] = []
        groups[prefix].append(df)
    
    # Combine DataFrames in each group using pd.concat
    for prefix, dfs in groups.items():
        if len(dfs) > 1:
            combined_dataframes[prefix] = pd.concat(dfs, ignore_index=True)
        else:
            combined_dataframes[prefix] = dfs[0]
    
    return combined_dataframes

# Assuming you already have a dictionary of dataframes named 'all_dataframes'
# Replace 'prefix_length' with the number of characters you want to consider for grouping
prefix_length = 6
combined_dataframes = combine_dataframes_by_prefix(all_dataframes, prefix_length)



# Specify which variables we want to include and what we want them named

In [19]:
variable_list = pd.read_csv('Census_Variable_Lists/1990_variables_age_grouping.csv')

# Download tahoe geometry

In [20]:


 
service_url = 'https://maps.trpa.org/server/rest/services/Demographics/FeatureServer/27'

feature_layer = FeatureLayer(service_url)
tahoe_geometry_fields = ['YEAR', 'STATE', 'GEOGRAPHY', 'GEOID', 'TRPAID', 'NEIGHBORHOOD']
query_result = feature_layer.query(out_fields=",".join(tahoe_geometry_fields))
# Convert the query result to a list of dictionaries
feature_list = query_result.features

# Create a pandas DataFrame from the list of dictionaries
tahoe_geometry = pd.DataFrame([feature.attributes for feature in feature_list])


# Filter all the dataframes down to sum level 140 (tract level) and state, county and tract are in the ones we want

In [21]:
#Filter all the dataframes down to sum level 140 and state, county and tract are in the ones we want
# df = df[df['TRPAID'].isin(tahoe_geometry['TRPAID'])]
# Loop through dictionary
# Filter on 140
# Match tractnumbers

def filter_1990_table(df, tahoe_geometry, grouping_columns):
    df = df.loc[df['SUMLEV']=='140']
    df['TRACT'] = df['TRACTBNA'].str.pad(width=6, side='right', fillchar='0')
    df['TRPAID'] = df['STATEFP']+df['CNTY']+df['TRACT']+'1990'
    df= df[df['TRPAID'].isin(tahoe_geometry['TRPAID'])]
    df = df.melt(id_vars=grouping_columns)
    df =  pd.merge(df, tahoe_geometry[['TRPAID', 'NEIGHBORHOOD']], on='TRPAID', how= 'left')
    df['year_sample']='1990'
    df['sample_level']='tract'
    df['dataset']= 'dec/sf3'
    df['census_geom_year'] = '1990'
    return df

df_test = combined_dataframes['stf301']



# Melt data to change it to long format 

In [22]:
grouping_columns = ['SUMLEV', 'STATEFP', 'CNTY', 'COUSUBFP', 'PLACEFP', 'TRACTBNA', 'BLCKGR', 'LOGRECNU', 'TRACT', 'TRPAID']
melted_dataframes = []
for name, df in combined_dataframes.items():
    df1 = filter_1990_table(df, tahoe_geometry, grouping_columns)
    melted_dataframes.append(df1)
combined_dataframe = pd.concat(melted_dataframes, ignore_index= True)    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TRACT'] = df['TRACTBNA'].str.pad(width=6, side='right', fillchar='0')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TRPAID'] = df['STATEFP']+df['CNTY']+df['TRACT']+'1990'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TRACT'] = df['TRACTBNA'].str.pad(width=6, side='right', fillchar='0'

In [14]:
combined_dataframe.to_excel('combined_data.xlsx')

In [12]:
print(combined_dataframes)

{'stf301':       SUMLEV STATEFP CNTY COUSUBFP PLACEFP TRACTBNA BLCKGR LOGRECNU  P0010001  \
0        040      06                                         000001  29760021   
1        040      06                                         000002     45947   
2        040      06                                         000003         0   
3        040      06                                         000004         0   
4        050      06  001                                    000005   1279182   
...      ...     ...  ...      ...     ...      ...    ...      ...       ...   
23603    450      32  031    94714   60600                   003275    133838   
23604    450      32  031    94714   68400                   003276     53367   
23605    450      32  031    94714   71600                   003277     11391   
23606    440      32  031    94840                           003278       911   
23607    450      32  031    94840   60600                   003279         0   

       P0020001 

In [23]:
combined_dataframe_withvariables = pd.merge(combined_dataframe, variable_list, on='variable', how= 'inner')

In [24]:
combined_dataframe_withvariables.to_excel('data_1990_age.xlsx')

In [18]:
grouping_columns = ['SUMLEV', 'STATEFP', 'CNTY', 'COUSUBFP', 'PLACEFP', 'TRACTBNA', 'BLCKGR', 'LOGRECNU', 'TRACT', 'TRPAID']

df1990_melted = df1990.melt(id_vars=grouping_columns)

In [28]:
group_columns = [column for column in combined_dataframe_withvariables if column not in ['value', 'variable', 
    'variable_name', 'MarginOfError','Unnamed: 5', 'Table', 'NEIGHBORHOOD', 'File']]
grouping_prefix = "TRPA Census Age Sex Categories Grouped:"
#grouped_data = joined_data.groupby(group_columns, as_index=False)['value'].sum()    
print(group_columns)
grouped_data = combined_dataframe_withvariables.groupby(group_columns, as_index=False, dropna=False).agg({'value':'sum',
                                                                        'variable':lambda x: grouping_prefix +  ', '.join(x)})


['SUMLEV', 'STATEFP', 'CNTY', 'COUSUBFP', 'PLACEFP', 'TRACTBNA', 'BLCKGR', 'LOGRECNU', 'TRACT', 'TRPAID', 'year_sample', 'sample_level', 'dataset', 'census_geom_year', 'variable_category', 'census_category']


In [30]:
grouped_data.to_excel('grouped_1990_age.xlsx')