# Set up and import tahoe geometry

In [1]:
import pandas as pd
import numpy as np
import arcpy
from arcgis.features import FeatureLayer

service_url = 'https://maps.trpa.org/server/rest/services/Demographics/FeatureServer/27'

feature_layer = FeatureLayer(service_url)
tahoe_geometry_fields = ['YEAR', 'STATE', 'GEOGRAPHY', 'GEOID', 'TRPAID', 'NEIGHBORHOOD']
query_result = feature_layer.query(out_fields=",".join(tahoe_geometry_fields))
# Convert the query result to a list of dictionaries
feature_list = query_result.features

# Create a pandas DataFrame from the list of dictionaries
tahoe_geometry = pd.DataFrame([feature.attributes for feature in feature_list])

county_lookup = {
    '005': 'Douglas County (Tahoe Basin)',
    '017': 'El Dorado County (Tahoe Basin)',
    '031': 'Washoe County (Tahoe Basin)',
    '061': 'Placer County (Tahoe Basin)'
}

## Define Functions

In [2]:
def calculate_median_value(df, bin_column, sort_column, count_column, category_field, category,grouping_variables, cumulative_sorting_variables):
        # Create a new DataFrame to avoid modifying the original one
    #change value to count column
    #Do we need to handle excluding non-tahoe things here or should we do it in the input?
    summary_df = df.copy()
    summary_df[count_column]=summary_df[count_column].astype(int)
    #summary_df=summary_df.loc[summary_df[count_column]!='510']
    summary_df = summary_df.groupby(grouping_variables)[count_column].sum()
    summary_df = summary_df.reset_index()
    #This handles -6666 values that they sometimes add for unknown data
    summary_df = summary_df.loc[summary_df[count_column]>=0]
    
    
    summary_df= summary_df.loc[summary_df[category_field]==category]
    # Sort the DataFrame based on the variable name column
    # This depends on the fact that census variables start with the lowest value and go up 
    #This needs to all be rethought to have some kind of window function to handle multiple years
    summary_df.sort_values(by=sort_column, inplace=True)
    summary_df = summary_df.reset_index()
    
    # Extract lower and upper limits from bin categories
    #This uses regex to find numbers and removes commas to make numbers numbers 
    pattern = r'(\d+[\d,]*)'
    summary_df['temp'] = summary_df[bin_column].str.replace(',', '').str.findall(pattern)
    #Looks for values that have two numbers and puts empty placeholders for the ones that only have one (upper and lower)
    summary_df[['Lower', 'Upper']] = summary_df['temp'].apply(lambda x: pd.Series(x[:2]) if len(x) == 2 else pd.Series([None, None]))
    summary_df['Lower'] = summary_df['Lower'].astype(float)
    summary_df['Upper'] = summary_df['Upper'].astype(float)
    # Handle first category
    
    first_upper = float(summary_df['temp'].iloc[0][0])
    low_variable_name = summary_df[bin_column].iloc[0]

    summary_df.loc[summary_df[bin_column]==low_variable_name,'Lower'] = 0  # Set lower value to 0
    summary_df.loc[summary_df[bin_column]==low_variable_name,'Upper'] = first_upper

    
    # Handle last category
    
    last_lower = float(summary_df['temp'].iloc[-1][0])
    upper_variable_name = summary_df[bin_column].iloc[-1]
    summary_df.loc[summary_df[bin_column]==upper_variable_name,'Lower'] = last_lower
    summary_df.loc[summary_df[bin_column]==upper_variable_name,'Upper'] = np.inf  # Set upper value to infinity
    summary_df[count_column]= summary_df[count_column].astype(float)   
    # Calculate cumulative count
    cumulative_grouping_variables = grouping_variables

    cumulative_grouping_variables.remove(bin_column)
    cumulative_grouping_variables.remove(sort_column)

    #Update this to be parameterized
    
    summary_df.sort_values(by=cumulative_sorting_variables, inplace=True)
    summary_df = summary_df.reset_index()
    summary_df['cumulative_sum'] = summary_df.groupby(cumulative_grouping_variables, as_index=False)[count_column].cumsum()
    summary_df['TotalSum'] = summary_df.groupby(cumulative_grouping_variables, as_index=False)[count_column].transform('sum')
    summary_df['previous_cumulative'] = summary_df['cumulative_sum'].shift()

    summary_df = summary_df.loc[summary_df['cumulative_sum']>=(summary_df['TotalSum']/2)].groupby(cumulative_grouping_variables, as_index=False).first()

    summary_df['cumulative_difference'] = summary_df['TotalSum']  / 2 - summary_df['previous_cumulative']
    summary_df['interpolation_ratio'] = summary_df['cumulative_difference'] /  (summary_df['cumulative_sum']- summary_df['previous_cumulative'])
    summary_df['median_value'] = summary_df['Lower'] + summary_df['interpolation_ratio'] * (summary_df['Upper'] - summary_df['Lower'])
    
    
    return summary_df


def categorize_values(census_df, category_csv, category_column, grouping_prefix):
    categories = pd.read_csv(category_csv)    
    census_df['value'] = census_df['value'].astype(float)
    joined_data = census_df.merge(categories, on = 'variable_code', how = 'left')
    joined_data.sort_values(by='variable_code', inplace=True)
    #This will get rid of any extra columns in the category_csv
    group_columns = [column for column in census_df if column not in ['value', 'variable_code', 'variable_name', 'MarginOfError','OBJECTID']]
    group_columns.append(category_column)
    #grouped_data = joined_data.groupby(group_columns, as_index=False)['value'].sum()    
    print(group_columns)
    grouped_data = joined_data.groupby(group_columns, as_index=False, dropna=False).agg({'value':'sum',
                                                                           'variable_code':lambda x: grouping_prefix +  ', '.join(x)})
    
    #Need to return this formatted for appending to the table - need to get locations of variable_code and variable name, 
    #add them in as columns in those locations and then populate them with category column nanme
    var_code_col_location = census_df.columns.get_loc('variable_code')
    var_name_col_location = census_df.columns.get_loc('variable_name')
    var_moe_col_location = census_df.columns.get_loc('MarginOfError')
    grouped_data.insert(var_moe_col_location, 'MarginOfError', '')
    #grouped_data.insert(var_code_col_location, 'variable_code','Grouped Value')
    grouped_data.insert(var_name_col_location, 'variable_name','')
    #grouped_data['variable_code'] = grouped_data['variable_code'] +  '_Grouped'
    grouped_data['variable_name'] = grouped_data[category_column]
    grouped_data['dataset']= grouping_prefix + grouped_data['dataset']
    grouped_data['variable_category']= grouping_prefix +  grouped_data['variable_category'] 
    columns_to_keep = [column for column in census_df if column not in ['OBJECTID']]
    grouped_data= grouped_data[columns_to_keep]
    return grouped_data


# Generate Basin Estimates

## Import data for Basin Estimates

In [12]:
service_url = 'https://maps.trpa.org/server/rest/services/Demographics/FeatureServer/28'

feature_layer = FeatureLayer(service_url)
query_result = feature_layer.query()
# Convert the query result to a list of dictionaries
feature_list = query_result.features

# Create a pandas DataFrame from the list of dictionaries
all_data = pd.DataFrame([feature.attributes for feature in feature_list])

all_data_clean = all_data.loc[(all_data['county']!='510')&(all_data['value']>0)]
all_data_clean.loc[all_data_clean['county'].isin(['005','017']),'north_south'] = 'South Lake'
all_data_clean.loc[all_data_clean['county'].isin(['031','061']),'north_south'] = 'North Lake'
all_data_clean['county_name'] = all_data_clean['county'].apply(lambda x: county_lookup.get(x, None))

all_data_clean = all_data_clean.dropna(subset='value')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_clean.loc[all_data_clean['county'].isin(['005','017']),'north_south'] = 'South Lake'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_clean['county_name'] = all_data_clean['county'].apply(lambda x: county_lookup.get(x, None))


## Summed Estimates

### Population Totals

In [10]:
filtered_population_data = all_data_clean.loc[(all_data_clean['variable_name']=='Total Population')]
filtered_population_data
summarized_population_data_basin = filtered_population_data.groupby([ 'dataset', 'variable_code', 'year_sample'], as_index=False).sum(['value'])
summarized_population_data_county = filtered_population_data.groupby(['dataset', 'variable_code', 'year_sample', 'county_name'], as_index=False).sum(['value'])
summarized_population_data_north_south = filtered_population_data.groupby(['dataset', 'variable_code', 'year_sample', 'north_south'], as_index=False).sum(['value'])

In [8]:
def sum_across_levels(df, variable_name, category_name):
    filtered_df = df.loc[(df['variable_name']==variable_name)]
    basin_summary = filtered_df.groupby([ 'dataset', 'sample_level', 'variable_name', 'variable_code', 'year_sample'], as_index=False).sum(['value'])
    county_summary = filtered_df.groupby(['dataset', 'sample_level', 'variable_name', 'variable_code', 'year_sample', 'county_name'], as_index=False).sum(['value'])
    north_south_summary = filtered_df.groupby(['dataset', 'sample_level', 'variable_name', 'variable_code', 'year_sample', 'north_south'], as_index=False).sum(['value'])
    #basin_summary.rename(columns = {'variable_code': 'Code', 'year_sample': 'Year'})
    basin_summary['Geography'] = 'Basin'
    county_summary['Geography'] = county_summary['county_name']
    north_south_summary['Geography'] = north_south_summary['north_south']
    columns_to_keep = ['variable_code','value', 'Geography', 'year_sample', 'dataset', 'sample_level']
    basin_summary= basin_summary[columns_to_keep]
    county_summary = county_summary[columns_to_keep]
    north_south_summary = north_south_summary[columns_to_keep]
    combined_summary = pd.concat([basin_summary, county_summary, north_south_summary], ignore_index=True)
    combined_summary['Category'] = category_name
    return combined_summary


In [9]:
variable_name = 'Total Population'
category_name = 'Total Population'
dataset_name = 'Test'

total_population_summary = sum_across_levels(all_data_clean, variable_name, category_name)

## Median Estimates

In [14]:
grouping_variables = ['variable_category', 'year_sample', 'dataset', 'variable_code', 'variable_name']

clean_income_data= all_data_clean.loc[(all_data_clean['county']!='510')]
bin_column= 'variable_name'
sort_column = 'variable_code'
count_column = 'value'
category_field = 'variable_category'
category = 'Household Income'
cumulative_sort_columns = ['year_sample', 'variable_code']
median_income_basin = calculate_median_value(clean_income_data, bin_column, sort_column, count_column, category_field, category,grouping_variables, cumulative_sort_columns)