# SET UP

In [None]:
pip install arcgis

In [1]:
#Before processing Stream Habitat Condition you must process SEZ data
#Stream Habitat Condition is calculated based on Riverine Indicators and IPI.. or is it just IPI and CSCI

import arcpy
from datetime import datetime
from functools import reduce
import os
from sqlalchemy.engine import URL
from arcgis.features import FeatureSet, GeoAccessor, GeoSeriesAccessor, FeatureLayer
import pandas as pd
from arcgis import GIS
import numpy as np
from arcgis.geometry import SpatialReference

#import geopandas as gpd to use spatial.reference stuff
gis = GIS()


## Look up Dictionaries- SEZ_ID

In [2]:
#Large Polygons or only polygon shapes lookup dictionary for Assessment Units with lerger values of acreage

# Step 1: Read the Excel file into a DataFrame
excel_data = pd.read_csv("F:\GIS\PROJECTS\ResearchAnalysis\SEZ\Large_Polygon_Lookup.csv")  

#Define Empty look up dataframe
lookup_dict = {}

for index, row in excel_data.iterrows():
    lookup_dict[row['Assessment_Unit_Name']] = row['SEZ_ID']

# See dictionary where keys are Assessment Unit Names and values are SEZ IDs
print(lookup_dict)

#Small Polygon if there are two acres for an SEZ
# Step 1: Read the Excel file into a DataFrame
excel_data = pd.read_csv("F:\GIS\PROJECTS\ResearchAnalysis\SEZ\Small_Polygon_Lookup.csv")  

#Define Empty look up dataframe
lookup_riverine = {}

for index, row in excel_data.iterrows():
    lookup_riverine[row['Assessment_Unit_Name']] = row['SEZ_ID']

# See dictionary where keys are Assessment Unit Names and values are SEZ IDs
print(lookup_riverine)

#All Polygons
# Step 1: Read the Excel file into a DataFrame
excel_data = pd.read_csv("F:\GIS\PROJECTS\ResearchAnalysis\SEZ\All_SEZID_Lookup.csv")  

#Define Empty look up dataframe
lookup_all = {}

for index, row in excel_data.iterrows():
    lookup_all[row['SEZ_ID']] = {'SEZ_Type': row['SEZ_Type']}

# See dictionary where keys are Assessment Unit Names and values are SEZ IDs
print(lookup_all)


{'Angora Creek - tributary': 519, 'Angora Creek - upper': 520, 'Angora meadows - 1': 87, 'Angora meadows - 2': 90, 'Angora meadows - 3': 91, 'Angora meadows - 4': 143, 'Angora meadows - 5': 144, 'Angora meadows - 6': 142, 'Angora meadows - 7': 89, 'Angora meadows - 8': 88, 'Angora meadows - 9': 92, 'Angora meadows tributary - 1': 217, 'Angora meadows tributary - 2': 99, 'Angora meadows tributary - 3': 97, 'Angora meadows tributary - 4': 94, 'Angora meadows tributary - 5': 214, 'Angora meadows tributary - 6': 146, 'Angora meadows tributary - 7': 93, 'Angora meadows tributary - 8': 95, 'Angora meadows tributary - 9': 96, 'Angora tributary': 446, 'Antone meadows': 187, 'Baldwin marsh - 1': 160, 'Benwood meadows - 1': 129, 'Benwood meadows - 2': 131, 'Big Meadow - 1': 47, 'Big Meadow - 2': 48, 'Big Meadow - 3': 38, 'Big Meadow - 4': 39, 'Big Meadow - 5': 37, 'Big Meadow - 6': 40, 'Big meadow - 7': 126, 'Big Meadow Creek - lower': 521, 'Big Meadow Creek - upper': 491, 'Big Meadow Creek - up

## Import Data and Create DataFrames

In [11]:
#SETUP
def get_fs_data(fs_url):
    feature_layer = FeatureLayer(fs_url)
    query_result = feature_layer.query()
    feature_list = query_result.features
    all_data = pd.DataFrame([feature.attributes for feature in feature_list])
    return all_data

def get_fs_data_spatial(fs_url):
    feature_layer = FeatureLayer(fs_url)
    query_result = feature_layer.query().sdf
    query_result.spatial.sr = 26910
    return query_result

# Get Stream Location data so we can do a spatial join on stream miles or riverine units
stream_url = "https://maps.trpa.org/server/rest/services/LTInfo_Monitoring/MapServer/8"

streamsdf = get_fs_data_spatial(stream_url)# Create DataFrame

#Get SEZ spatially enabled dataframe from REST Service
SEZ_url = "https://maps.trpa.org/server/rest/services/SEZ_Assessment_Unit/FeatureServer/0"

dfSEZ = get_fs_data_spatial(SEZ_url)

#spatial reference stuff
streamsdf.spatial.sr = dfSEZ.spatial.sr

#import Riverine Indicators

#REST SERVICES
bank_stability_url = "https://maps.trpa.org/server/rest/services/SEZ_Assessment_Unit/FeatureServer/4"
biotic_integrity_url = "https://maps.trpa.org/server/rest/services/SEZ_Assessment_Unit/FeatureServer/5"
incision_url = "https://maps.trpa.org/server/rest/services/SEZ_Assessment_Unit/FeatureServer/10"
headcuts_url = "https://maps.trpa.org/server/rest/services/SEZ_Assessment_Unit/FeatureServer/9"
AOP_url= "https://maps.trpa.org/server/rest/services/SEZ_Assessment_Unit/FeatureServer/3"
Hab_Frag_url = 'https://maps.trpa.org/server/rest/services/SEZ_Assessment_Unit/FeatureServer/8'

#Get feature service data inot dataframe
dfbanks = get_fs_data(bank_stability_url)
dfbiotic = get_fs_data(biotic_integrity_url)
dfincision = get_fs_data(incision_url)
dfheadcuts = get_fs_data(headcuts_url)
dfAOP = get_fs_data(AOP_url)
dfhabitat = get_fs_data(Hab_Frag_url)


#Stream Habitat Condition ASsessment fc that has SEZ_ID These are a copy ofsde.Fisheries.sde.Stream_Assessment_2020

def get_fc_data_spatial(fc_path, spatial_reference=26910):
   
    # Load the feature class into a spatially enabled DataFrame
    sdf = pd.DataFrame.spatial.from_featureclass(fc_path)
    
    # Set the spatial reference
    sdf.spatial.set_sr = spatial_reference
    
    return sdf

# Check if the geometry column exists
   # if 'SHAPE' in sdf.columns:
    #    sdf.spatial.set_geometry('SHAPE')
    #elif 'geometry' in sdf.columns:
     #   sdf.spatial.set_geometry('geometry')
    #else:
     #   raise ValueError("The DataFrame does not contain a recognized geometry column.")
    
    #return sdf

# Example usage
# Path to your feature class within a geodatabase
Streammiles_path = r"F:\GIS\PROJECTS\ResearchAnalysis\Stream Habitat Condition\Stream_Habitat_Condition.gdb\Stream_Habitat_Condition"

# Get the spatially enabled DataFrame
#Add Threshold Year to this data so that we can stack this data like SEZ Assessment Units
Stream_Miles_sdf = get_fc_data_spatial(Streammiles_path)



#Import IPI Data 

"F:\Research and Analysis\Fisheries\Streams\Bioassessment\California Stream Condition Index\Physical Habitat Condition Index\2009-2018 IPI\2009-2018_Index_of_Physical_Habitat_Integrity.xlsx"

IPIfolder = "F:\Research and Analysis\Fisheries\Streams\Bioassessment"
IPI22 = os.path.join(IPIfolder, "2022", "IPI_22.csv")
IPI20 = os.path.join(IPIfolder, "2020", "IPI_20.csv")
IPI18 = os.path.join(IPIfolder, "2009-2018 IPI","2009-2018_Index_of_Physical_Habitat_Integrity.xlsx" )
#Create IPI Dataframes
IPI22df = pd.read_csv(IPI22)
IPI20df = pd.read_csv(IPI20)
IPI18df = pd.read_excel(IPI18)
IPI22df['IPIYear']= '2022'
IPI20df['IPIYear']= '2020'

#Need to make it so that we can give a year to all IPI scores in IPI 18 which has data from 2009-2018
#Merge dataframes into one 
concatIPI_df = pd.concat([IPI22df, IPI20df], axis=0, ignore_index=True)
#perform spatial join of sde.stream and sez units
#thesdf = SEZsdf.spatial.join(streamsdf, how='inner')



## Grading Indicators

In [26]:
# Grading 
    
#Scoring based off of grading - check this
def score_indicator(Rating):
    if pd.isna(Rating):
        return np.nan
    elif  Rating == 'A':
        return '12'
    elif Rating == 'B':
        return '9'
    elif Rating == 'C':
        return '6'
    else:
        return '3'

    
#define rating SEZ Rating
def rate_SEZ(percent):
    if 0 <= percent < .70:
        return 'D'
    elif .7 <= percent < .80:
        return 'C'
    elif .80 <= percent < .90:
        return 'B'
    else:
        return 'A'
    
#Define Grade for IPI Score - Used only for Stream HAbitat Condition
def categorize_phab(IPI):
     if   IPI >= 0.94:
        return 'A'
     elif 0.83 < IPI < 0.94:
        return 'B'
     elif 0.7 < IPI <= 0.83:
        return 'C'
     else:
        return 'D'
     

#Define Grade for Bioassessment Score
def categorize_csci(biotic_integrity):
     if pd.isna(biotic_integrity):
        return np.nan
     elif   biotic_integrity > 0.92:
        return 'A'
     elif 0.79 < biotic_integrity <= 0.92:
        return 'B'
     elif 0.62 < biotic_integrity <= 0.79:
        return 'C'
     else:
        return 'D'

     
#Used for Stream HAbitat Condition
def rate_stream(percent):
     if   percent >= 0.92:
        return 'Excellent'
     elif 0.80 <= percent < 0.92:
        return 'Good'
     else:
        return 'Marginal'


In [25]:
Stream_Miles_sdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   OBJECTID    186 non-null    Int64         
 1   Acres       186 non-null    Float64       
 2   Assessment  186 non-null    string        
 3   SEZ_Type    186 non-null    string        
 4   Ownership_  186 non-null    string        
 5   Final_Rati  186 non-null    string        
 6   GlobalID    186 non-null    string        
 7   created_us  186 non-null    string        
 8   created_da  186 non-null    datetime64[us]
 9   last_edite  186 non-null    string        
 10  last_edi_1  186 non-null    datetime64[us]
 11  SEZ_ID      186 non-null    Int32         
 12  SHAPE       186 non-null    geometry      
dtypes: Float64(1), Int32(1), Int64(1), datetime64[us](2), geometry(1), string(7)
memory usage: 18.8 KB


## Prep Riverine Indicator Data

In [12]:
# ----------------------#
# STREAM MILES #
#-----------------------#

gsa = GeoSeriesAccessor(Stream_Miles_sdf['SHAPE'])

# alternatively, just add a new column at the end of the sedf
Stream_Miles_sdf['Length_meters'] = gsa.length


#Convert the length to miles
Stream_Miles_sdf['Stream_Miles'] = Stream_Miles_sdf['Length_meters'] * 0.000621371

## Now you can use the 'Length_miles' column in your analysis
print(Stream_Miles_sdf[['Stream_Miles']])

    Stream_Miles
0       3.231289
1       4.877088
2       2.411536
3       9.918253
4      11.011662
..           ...
181     0.115077
182     0.685075
183     0.652906
184     0.948657
185      1.16541

[186 rows x 1 columns]


In [13]:
#------------------#
#Biotic Integrity
#------------------#
#Prep data- Add any scores and find average oif there are two stream sites for one sez. Also rename data source so it includes are streams that were averaged
# Function to average scores and concatenate data sources for each Year and Assessment_Unit_Name
def average_biotic_scores(dfbiotic, unit_col='Assessment_Unit_Name', year_col='Year', score='Biotic_Integrity_CSCI', source_col='Biotic_Integrity_Data_Source'):
    # Group by Assessment Unit and Year
    group = dfbiotic.groupby([unit_col, year_col])
    
    # Calculate the mean of the scores
    averaged_scores = group[score].mean().reset_index()
    
    # Concatenate the data sources with specific formatting
    def concatenate_sources(x, year):
        formatted_sources = []
        for entry in x:
            parts = entry.split(",")
            if len(parts) >= 3:
                formatted_sources.append(f'TRPA, {parts[1].strip()}, {parts[-1].strip()}')  # Extract station code and year
        if formatted_sources:
            return '/ '.join(formatted_sources)
        else:
            return None  # Return None if all entries are invalid
    
    # Apply concatenate_sources to each group
    concatenated_sources = group.apply(lambda grp: concatenate_sources(grp[source_col], grp[year_col])).reset_index(name=source_col)
    
    # Merge the averaged scores with concatenated sources
    averaged_df = pd.merge(averaged_scores, concatenated_sources, on=[unit_col, year_col], how='left')
    
    return averaged_df


# Drop duplicates based on 'Assessment_Unit_Name' and 'Year'
dfbiotic = dfbiotic.drop_duplicates(subset=['Assessment_Unit_Name', 'Year', 'Biotic_Integrity_CSCI'])

# Apply the function to dfbiotic
averaged_biotic_df = average_biotic_scores(dfbiotic)

# Apply the rating function to the averaged biotic integrity scores
averaged_biotic_df['Biotic_Integrity_Rating'] = averaged_biotic_df['Biotic_Integrity_CSCI'].apply(categorize_csci)

# Calculate the biotic score for each SEZ
averaged_biotic_df['Biotic_Integrity_Score'] = averaged_biotic_df['Biotic_Integrity_Rating'].apply(score_indicator)

averaged_biotic_df['Biotic_Integrity_Score']=averaged_biotic_df['Biotic_Integrity_Score'].astype(int)

# Output the resulting DataFrame
print(averaged_biotic_df)

#-------------------
# Headcuts 
#------------------
#Reorganize dfHeadcuts to drop small medium large headcut columns
# Drop the columns 'small', 'medium', and 'large'
dfheadcuts = dfheadcuts.drop(columns=['small', 'medium', 'large'])

# Print the DataFrame to see the changes
print(dfheadcuts)

#---------------
#add year to data source so we can drop the year column later (Dont double run this)
#---------------
#Create Dictionary of Dataframes to adjust year to be in datashource column and not its own column
yeartodatasource = {
    'dfbanks': dfbanks,
    'dfheadcuts': dfheadcuts,
    'dfincision': dfincision
}

# Iterate over each DataFrame in meadowdata
for name, df in yeartodatasource.items():
    # Iterate over columns in the DataFrame
    for col in df.columns:
        # Check if the column name contains 'Data'
        if 'Data_' in col:
            # Add Year to the column if it contains 'Data'
            df[col] = df[col] + ', ' + df['Year'].astype(str)
#------------#
#not sure we need this
#Prep SEZ Baseline Data for assessment unit...will need to rethink if acreage changes.. or just manually change in sde
#keep_columns = ['SHAPE', 'SEZ_ID', 'Feature_Type', 'SEZ_Type', 'Ownership_Primary', 'Ownership_Secondary', 'Ownership_Secondary_2', 'Ownership_Secondary_3', 'Acres', 'Comments']
#dfSEZ is assessment unit information from SDE?
#dfSEZinfo=dfSEZ.loc[:,keep_columns].copy()

#dfSEZinfo['SEZ_ID']= dfSEZinfo['SEZ_ID'].astype(int)

         Assessment_Unit_Name  Year  Biotic_Integrity_CSCI  \
0    Angora Creek - tributary  2013               0.996000   
1        Angora Creek - upper  2013               0.996000   
2          Angora meadows - 1  2019               0.690000   
3          Angora meadows - 2  2019               0.820000   
4          Angora meadows - 3  2017               0.940000   
..                        ...   ...                    ...   
226       Woods Creek - lower  2018               0.874000   
227      Woods Creek - middle  2015               1.014000   
228       Woods Creek - upper  2010               1.101000   
229            small meadow 1  2018               1.010000   
230           small meadow 57  2022               0.564053   

    Biotic_Integrity_Data_Source Biotic_Integrity_Rating  \
0          TRPA, 634S13217, 2013                       A   
1          TRPA, 634S13217, 2013                       A   
2          TRPA, 634S19606, 2019                       C   
3          TRPA

# Prep/Process IPI Scores

## Score IPI

In [14]:
#Import IPI Data in set up block
#IPIfolder = "F:\Research and Analysis\Fisheries\Streams\Bioassessment"
#IPI22 = os.path.join(IPIfolder, "2022", "IPI_22.csv")
#IPI20 = os.path.join(IPIfolder, "2020", "IPI_20.csv")

#Create IPI Dataframes
#IPI22df = pd.read_csv(IPI22)
#IPI20df = pd.read_csv(IPI20)

#IPI22df['IPIYear']= '2022'
#IPI20df['IPIYear']= '2020'

#Merge dataframes into one 
#concatIPI_df = pd.concat([IPI22df, IPI20df], axis=0, ignore_index=True)

#Calculate Scores in IPI
#Code for Grading IPI
#Define Grade for TRPA's IPI Score - Used only for Stream HAbitat Condition
#def categorize_phab(IPI):
 #    if   IPI >= 0.94:
  #      return 'A'
   #  elif 0.83 < IPI < 0.94:
    #    return 'B'
#     elif 0.7 < IPI <= 0.83:
 #       return 'C'
  #   else:
   #     return 'D'

concatIPI_df['IPI_Rating']=concatIPI_df['IPI'].apply(categorize_phab)
concatIPI_df['IPI_Score']= concatIPI_df['IPI_Rating'].apply(score_indicator)



concatIPI_df.head()

columns_to_keep = ['StationCode', 'IPI', 'IPI_Rating', 'IPI_Score', 'IPIYear']

concatIPI_df = concatIPI_df[columns_to_keep]
concatIPI_df['IPIYear']= concatIPI_df['IPIYear'].astype(str)
concatIPI_df['IPI_DataSource']= 'TRPA, ' + concatIPI_df['StationCode'] +', ' + concatIPI_df['IPIYear']

#Tune Up DATA Source Column to include year and station code

print(concatIPI_df)

concatIPI_df

#Join on station code(StationCode) to streamsdf(SITE_NAME)


   StationCode   IPI IPI_Rating IPI_Score IPIYear         IPI_DataSource
0    634EDG001  0.89          B         9    2022  TRPA, 634EDG001, 2022
1    634REFBMW  0.96          A        12    2022  TRPA, 634REFBMW, 2022
2    634REFSAX  0.84          B         9    2022  TRPA, 634REFSAX, 2022
3    634REFTRT  0.94          A        12    2022  TRPA, 634REFTRT, 2022
4    634REFUTR  0.88          B         9    2022  TRPA, 634REFUTR, 2022
..         ...   ...        ...       ...     ...                    ...
85   634TPB151  0.99          A        12    2020  TRPA, 634TPB151, 2020
86   634TPB155  1.09          A        12    2020  TRPA, 634TPB155, 2020
87   634TRT003  0.75          C         6    2020  TRPA, 634TRT003, 2020
88   634UTR002  0.99          A        12    2020  TRPA, 634UTR002, 2020
89   634UTR006  1.07          A        12    2020  TRPA, 634UTR006, 2020

[90 rows x 6 columns]


Unnamed: 0,StationCode,IPI,IPI_Rating,IPI_Score,IPIYear,IPI_DataSource
0,634EDG001,0.89,B,9,2022,"TRPA, 634EDG001, 2022"
1,634REFBMW,0.96,A,12,2022,"TRPA, 634REFBMW, 2022"
2,634REFSAX,0.84,B,9,2022,"TRPA, 634REFSAX, 2022"
3,634REFTRT,0.94,A,12,2022,"TRPA, 634REFTRT, 2022"
4,634REFUTR,0.88,B,9,2022,"TRPA, 634REFUTR, 2022"
...,...,...,...,...,...,...
85,634TPB151,0.99,A,12,2020,"TRPA, 634TPB151, 2020"
86,634TPB155,1.09,A,12,2020,"TRPA, 634TPB155, 2020"
87,634TRT003,0.75,C,6,2020,"TRPA, 634TRT003, 2020"
88,634UTR002,0.99,A,12,2020,"TRPA, 634UTR002, 2020"


## IPI site locations

In [21]:
#  Join on ipi (StationCode) to streamsdf(SITE_NAME) so we have a location for the sites
merged_df = pd.merge(concatIPI_df, streamsdf, left_on='StationCode', right_on='SITE_NAME', how='left')

#Clean up physical habitat dataframe
# Keep only phab data and spatial data from Stream data do i need latitude and longitude.. i don't think so... (, 'LATITUDE', 'LONGITUDE')
phab_columns = ['StationCode', 'IPI', 'IPI_Rating', 'IPI_Score', 'IPIYear', 'IPI_DataSource','SHAPE']

Phabsdf = merged_df[phab_columns]

#Spatial join to assessment units - if this stays as outer join i can maybe just add othe rindicator data to this?
PHABSEZsdf= dfSEZ.spatial.join(Phabsdf, how='right')

#Spatial Join to Stream Miles? MAkes sense but I want to process all indicators at onces


In [19]:
#Spatial Join to Streamsite data for CSCI scores so IPI scores have location
# 
# Try same as abov ebut join to larger SEZ Units so that it captures points that aren't right on the stream
#Collect IPI SEZ location information of based on stationcode
#  Join on ipi (StationCode) to streamsdf(SITE_NAME) so we have a location for the sites
merged_df = pd.merge(concatIPI_df, streamsdf, left_on='StationCode', right_on='SITE_NAME', how='left')

#Clean up physical habitat dataframe
# Keep only phab data and spatial data from Stream data do i need latitude and longitude.. i don't think so... (, 'LATITUDE', 'LONGITUDE')
phab_columns = ['StationCode', 'IPI', 'IPI_Rating', 'IPI_Score', 'IPIYear', 'IPI_DataSource','SHAPE']

Phabsdf = merged_df[phab_columns]

#Spatial join to assessment units - if this stays as outer join i can maybe just add othe rindicator data to this?
PHABSEZsdf= dfSEZ.spatial.join(Phabsdf, how='right')



# Clean up duplicates drop duplicates directly if that's more appropriate for your case:
cleaned_sdf = PHABSEZsdf.drop_duplicates(subset=['SEZ_ID', 'IPIYear', 'IPI_Score'])
cleaned_sdf['Year'] = cleaned_sdf['IPIYear'] 
cleaned_sdf['Year'] = pd.to_numeric(cleaned_sdf['Year'], errors='coerce') 
IPI_columns = ['StationCode', 'IPI', 'IPI_Rating', 'IPI_Score', 'Year', 'IPI_DataSource','SHAPE', 'SEZ_ID', 'Assessment_Unit_Name']

IPI_sdf = cleaned_sdf[IPI_columns]
print(IPI_sdf)




#Get most recent

    StationCode   IPI IPI_Rating IPI_Score  Year         IPI_DataSource  \
0     634EDG001  0.89          B         9  2022  TRPA, 634EDG001, 2022   
1     634REFBMW  0.96          A        12  2022  TRPA, 634REFBMW, 2022   
9     634REFSAX  0.84          B         9  2022  TRPA, 634REFSAX, 2022   
17    634REFTRT  0.94          A        12  2022  TRPA, 634REFTRT, 2022   
26    634REFUTR  0.88          B         9  2022  TRPA, 634REFUTR, 2022   
..          ...   ...        ...       ...   ...                    ...   
279   634TPB145  0.96          A        12  2020  TRPA, 634TPB145, 2020   
284   634TPB151  0.99          A        12  2020  TRPA, 634TPB151, 2020   
293   634TRT003  0.75          C         6  2020  TRPA, 634TRT003, 2020   
294   634TRT003  0.75          C         6  2020  TRPA, 634TRT003, 2020   
297   634UTR002  0.99          A        12  2020  TRPA, 634UTR002, 2020   

                                                 SHAPE  SEZ_ID  \
0                                

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_sdf['Year'] = cleaned_sdf['IPIYear']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_sdf['Year'] = pd.to_numeric(cleaned_sdf['Year'], errors='coerce')


# Prep Riverine Indicators

In [16]:
#RiverineIndicators = ['AOP_Score', 'Bank_Stability_Score', 'Biotic_Integrity_Score', 'Habitat_Frag_Score', 'Incision_Score', 'Headcuts_Score']
#SEZID's aren't correct in score tables so I need to do all of these in order to get the correct SEZ IDS?
# Same for meadow(large polygon) and riverine(small polygon) data drop these columns because not needed in final merge, will assign SEZ ID later
columns_to_drop = {'Year', 'SEZ_ID', 'GlobalID', 'last_edited_user', 'created_date', 'OBJECTID', 'created_user', 'last_edited_date'}

#Name dataframes so we can reference later
largepolygondata= {'dfbanks': dfbanks, 
             'dfaveraged_biotic':averaged_biotic_df,
                'dfincision': dfincision,
                'dfhabitat': dfhabitat,
                'dfheadcuts': dfheadcuts,
                'dfAOP': dfAOP,
                'dfIPI': IPI_sdf
}


#Staging Tables Riverine/ small polygons
smallpolygondata = {'dfbanks': dfbanks, 
                'dfaveraged_biotic':averaged_biotic_df,
                'dfincision': dfincision,
                'dfhabitat': dfhabitat,
                'dfheadcuts': dfheadcuts,
                'dfAOP': dfAOP,
                'dfIPI': IPI_sdf
}

#Get most recent year of data for each Assessment Unit NAme
# Function to get the most recent year of data
# Function to get the most recent year of data
def get_most_recent_scores(df, groupfield):
    return df.loc[df.groupby(groupfield)['Year'].idxmax()]

#most_recent_small = get_most_recent_scores(smallpolygondata, 'Assessment_Unit_Name')
#mosrecent_large = get_most_recent_scores(largepolygondata, 'Assessment_Unit_Name')

# Function to drop unnecessary columns from DataFrames
def drop_columns(df, columns_to_drop):
    return df.drop(columns=[col for col in columns_to_drop if col in df.columns])


# Function to assign SEZ_ID to each DataFrame using the provided lookup dictionary
def assign_sez_ids(df, sezid_dict):
    df['SEZ_ID'] = df['Assessment_Unit_Name'].map(sezid_dict)
    df = df.dropna(subset=['SEZ_ID'])
    
    # Use .loc to modify SEZ_ID safely
    df.loc[:, 'SEZ_ID'] = df['SEZ_ID'].astype(int)
    
    return df

# Process data for large and small polygons
def process_data(data_dict, sezid_dict, columns_to_drop):
    processed_data = {}
    for key, df in data_dict.items():
        # Step 1: Get most recent scores
        df_most_recent = get_most_recent_scores(df, 'Assessment_Unit_Name')
        
        # Step 2: Drop unnecessary columns
        df_cleaned = drop_columns(df_most_recent, columns_to_drop)
        
        # Step 3: Assign SEZ_ID
        df_with_sez_id = assign_sez_ids(df_cleaned, sezid_dict)
        
        # Store the processed DataFrame
        processed_data[key] = df_with_sez_id
    return processed_data

# Process large polygon (meadow) and small polygon (riverine) data
processed_largepolygon_data = process_data(largepolygondata, lookup_dict, columns_to_drop)
processed_smallpolygon_data = process_data(smallpolygondata, lookup_riverine, columns_to_drop)


In [22]:
# Function to merge all DataFrames on multiple keys
def merge_dataframes(data_dict, keys):
    return reduce(lambda left, right: pd.merge(left, right, on=keys, how='outer'), data_dict.values())

# Merge small polygon DataFrames
smallpolygon_df = merge_dataframes(processed_smallpolygon_data, ['SEZ_ID', 'Assessment_Unit_Name'])

# Merge large polygon DataFrames
largepolygon_df = merge_dataframes(processed_largepolygon_data, ['SEZ_ID', 'Assessment_Unit_Name'])

# Append smallpolygon_df to largepolygon_df
final_combined_df = pd.concat([largepolygon_df, smallpolygon_df], ignore_index=True)


# Print the final combined DataFrame to check
print("Final Combined DataFrame:")
print(final_combined_df)

Final Combined DataFrame:
             Assessment_Unit_Name Bank_Stability_Data_Source  \
0        Angora Creek - tributary                 TRPA, 2023   
1              Angora meadows - 1                 TRPA, 2019   
2              Angora meadows - 2                 TRPA, 2019   
3              Angora meadows - 3                 TRPA, 2022   
4              Angora meadows - 6                 TRPA, 2019   
..                            ...                        ...   
637     Rubicon Creek - tributary                        NaN   
638   Secret Harbor Creek - lower                        NaN   
639  Slaughterhouse Creek - upper                        NaN   
640         Third Creek - upper 3                        NaN   
641       Third Creek meadows - 3                        NaN   

     Bank_Stability_Percent_Unstable Bank_Stability_Rating  \
0                           0.859340                     A   
1                           0.000000                     A   
2                  

## Stream Miles- keep only data that correlates to Streams

In [23]:

#Some tributaries were taken out to get the Stream miles shape.. 

# Step 1: Extract SEZ IDs from Stream_Miles_sdf
sez_ids_to_keep = Stream_Miles_sdf['SEZ_ID'].unique()

# Step 2: Filter the final_df to keep only the rows with SEZ IDs present in Stream_Miles_sdf
Readytoscore_df = final_combined_df[final_combined_df['SEZ_ID'].isin(sez_ids_to_keep)]

# Now, filtered_final_df contains only rows where the SEZ ID is in Stream_Miles_sdf --has 184 and original fc has 186 units/?
print(Stream_Miles_sdf)


     OBJECTID     Acres                Assessment              SEZ_Type  \
0           1  1.935925  Big Meadow Creek - upper  Riverine (Perennial)   
1           2   6.85073              UTR - middle  Riverine (Perennial)   
2           3  4.089876  UTR - Christmas Valley 3  Riverine (Perennial)   
3           4  5.695386  Saxon Creek - headwaters  Riverine (Perennial)   
4           5   6.32458       Trout Creek - upper  Riverine (Perennial)   
..        ...       ...                       ...                   ...   
181       182  1.362636   Snow Creek wetlands - 1  Riverine (Perennial)   
182       183  1.362636   Snow Creek wetlands - 1  Riverine (Perennial)   
183       184  3.981049     Incline Creek - upper  Riverine (Perennial)   
184       185  2.426747        Mill Creek - upper  Riverine (Perennial)   
185       186  1.781435   Deer Creek - headwaters  Riverine (Perennial)   

              Ownership_    Final_Rati  \
0                   USFS             A   
1              

In [27]:
# Define stream habitat indicators
StreamHabitatIndicators = ['IPI_Score', 'AOP_Score', 'Bank_Stability_Score', 'Biotic_Integrity_Score', 'Habitat_Frag_Score', 'Incision_Score', 'Headcuts_Score']

# Ensure that only the score columns are numeric
Readytoscore_df[StreamHabitatIndicators] = Readytoscore_df[StreamHabitatIndicators].apply(pd.to_numeric, errors='coerce')

# Function to calculate the final points and points possible
def calculate_scores(row):
    score_columns = StreamHabitatIndicators
    total_points = row[score_columns].sum(skipna=True)
    points_possible = row[score_columns].notna().sum() * 12
    return pd.Series([total_points, points_possible])

# Apply the score calculation to each row
Readytoscore_df[['Final_Total_Points', 'Final_Points_Possible']] = Readytoscore_df.apply(calculate_scores, axis=1)

# Calculate the final percent
Readytoscore_df['Final_Percent'] = Readytoscore_df['Final_Total_Points'] / Readytoscore_df['Final_Points_Possible'].replace(0, np.nan)  # Handle division by zero

# Define functions to rate and score stream segment
#def rate_stream(percent):
 #   if pd.isna(percent):
  #      return 'No Rating'
   # if percent >= 0.9:
#        return 'Excellent'
 #   elif percent >= 0.7:
  #      return 'Good'
   # elif percent >= 0.5:
    #    return 'Fair'
#    else:
 #       return 'Poor'

#???def score_indicator(rating):
 #   scores = {'Excellent': 4, 'Good': 3, 'Fair': 2, 'Poor': 1, 'No Rating': 0}
  #  return scores.get(rating, 0)

# Calculate the final rating and score
Readytoscore_df['Final_Rating'] = Readytoscore_df['Final_Percent'].apply(rate_stream)
Readytoscore_df['Final_Score'] = Readytoscore_df['Final_Rating'].apply(score_indicator)

# Display the final DataFrame
Final_df = Readytoscore_df.copy()
print(Final_df)


             Assessment_Unit_Name Bank_Stability_Data_Source  \
415      Angora Creek - tributary                 TRPA, 2023   
416            Angora meadows - 1                 TRPA, 2019   
417            Angora meadows - 2                 TRPA, 2019   
418            Angora meadows - 3                 TRPA, 2022   
419            Angora meadows - 6                 TRPA, 2019   
..                            ...                        ...   
636         Meeks Bay meadows - 3                        NaN   
638   Secret Harbor Creek - lower                        NaN   
639  Slaughterhouse Creek - upper                        NaN   
640         Third Creek - upper 3                        NaN   
641       Third Creek meadows - 3                        NaN   

     Bank_Stability_Percent_Unstable Bank_Stability_Rating  \
415                         0.859340                     A   
416                         0.000000                     A   
417                         0.000000         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Readytoscore_df[StreamHabitatIndicators] = Readytoscore_df[StreamHabitatIndicators].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Readytoscore_df[['Final_Total_Points', 'Final_Points_Possible']] = Readytoscore_df.apply(calculate_scores, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [25]:
# Calculate Scores
StreamHabitatIndicators = ['IPI_Score', 'AOP_Score', 'Bank_Stability_Score', 'Biotic_Integrity_Score', 'Habitat_Frag_Score', 'Incision_Score', 'Headcuts_Score']
#define 'Score_Columns'
# Function to get the score columns (no dependency on SEZ_Type)
#def get_score_columns():
 #   return StreamHabitatIndicators

# Apply `get_score_columns` to each row to create the 'Score_Columns' column
Readytoscore_df['Score_Columns'] = Readytoscore_df[StreamHabitatIndicators]

for col in StreamHabitatIndicators:
    Readytoscore_df['Score_Columns'] = pd.to_numeric(Readytoscore_df[col], errors='coerce')
# Function to calculate the final points and points possible
def calculate_scores(row):
    score_columns = row['Score_Columns']
    if not score_columns:
        return pd.Series([None, None])
    total_points = row[score_columns].sum(skipna=True)
    points_possible = row[score_columns].notna().sum() * 12
    return pd.Series([total_points, points_possible])

# Apply the score calculation to each row
Readytoscore_df[['Final_Total_Points', 'Final_Points_Possible']] = Readytoscore_df.apply(calculate_scores, axis=1)

# Calculate the final percent
Readytoscore_df['Final_Percent'] = Readytoscore_df['Final_Total_Points'] / Readytoscore_df['Final_Points_Possible']

# Calculate the final rating and score
Readytoscore_df['Final_Rating'] = Readytoscore_df['Final_Percent'].apply(rate_SEZ)
Readytoscore_df['Final_Score'] = Readytoscore_df['Final_Rating'].apply(score_indicator)

# Drop the temporary 'Score_Columns' column
Final_df = Readytoscore_df.drop(columns=['Score_Columns'])


ValueError: Cannot set a DataFrame with multiple columns to the single column Score_Columns

In [None]:
print(Readytoscore_df.dtypes)
