In [1]:
import pandas as pd
import geopandas as gpd
import os
import pathlib as pl
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.font_manager import FontProperties

In [2]:
# Working directory of the notebook plus the two input folders, which are
# addressed relative to its parent directory and resolved to absolute paths.
cwd = os.getcwd()
truth_dir = (pl.Path('..') / 'Refined_Truth_Model' / 'Modflow_Input_Files').resolve()
base_model_grid_dir = (pl.Path('..') / 'Locations' / 'Base_Model_Grid').resolve()
# Echo the three locations so a wrong working directory is easy to spot.
for location in (cwd, truth_dir, base_model_grid_dir):
    print(location)

C:\Users\farnut1\Desktop\How-Many-Realizations-main\PRT_Notebook
C:\Users\farnut1\Desktop\How-Many-Realizations-main\Refined_Truth_Model\Modflow_Input_Files
C:\Users\farnut1\Desktop\How-Many-Realizations-main\Locations\Base_Model_Grid


##### In order to compare the IES capture zones to the true capture zone, a 1 to 1 grid comparison needs to be made.  The following cells import the "true capture", join it to the coarser "base model grid" and then filter to only those cells that were touched by the true capture.

In [3]:
# Read the true capture zone shapefile from the truth-model folder.
true_cap_path = str(truth_dir / 'true_cap.shp')
true_cap = gpd.read_file(true_cap_path, engine='pyogrio')
# Everything except the x / y attributes and the geometry is discarded.
columns_to_keep = ['x','y','geometry']
columns_to_drop = [col for col in true_cap.columns if col not in columns_to_keep]
# Assign the CRS (EPSG:26915) so the layer can be spatially joined later.
true_cap = true_cap.drop(columns=columns_to_drop).set_crs(crs='EPSG:26915')
true_cap

Unnamed: 0,x,y,geometry
0,2115.62,8415.62,"POLYGON ((2112.5 8418.75, 2118.75 8418.75, 211..."
1,2121.88,8415.62,"POLYGON ((2118.75 8418.75, 2125 8418.75, 2125 ..."
2,2128.12,8415.62,"POLYGON ((2125 8418.75, 2131.25 8418.75, 2131...."
3,2134.38,8415.62,"POLYGON ((2131.25 8418.75, 2137.5 8418.75, 213..."
4,2140.62,8415.62,"POLYGON ((2137.5 8418.75, 2143.75 8418.75, 214..."
...,...,...,...
95818,2653.12,1309.38,"POLYGON ((2650 1312.5, 2656.25 1312.5, 2656.25..."
95819,2659.38,1309.38,"POLYGON ((2656.25 1312.5, 2662.5 1312.5, 2662...."
95820,2665.62,1309.38,"POLYGON ((2662.5 1312.5, 2668.75 1312.5, 2668...."
95821,2671.88,1309.38,"POLYGON ((2668.75 1312.5, 2675 1312.5, 2675 13..."


In [4]:
# Read the base model grid shapefile and assign the same CRS (EPSG:26915)
# as the true capture layer so the two can be spatially joined.
base_model_grid_path = str(base_model_grid_dir / 'Base_Model_Grid.shp')
base_model_grid = gpd.read_file(base_model_grid_path, engine='pyogrio').set_crs(crs='EPSG:26915')
base_model_grid

Unnamed: 0,node,row,column,idomain_1,x,y,geometry
0,1,1,1,1,12.5,9987.5,"POLYGON ((0 10000, 25 10000, 25 9975, 0 9975, ..."
1,2,1,2,1,37.5,9987.5,"POLYGON ((25 10000, 50 10000, 50 9975, 25 9975..."
2,3,1,3,1,62.5,9987.5,"POLYGON ((50 10000, 75 10000, 75 9975, 50 9975..."
3,4,1,4,1,87.5,9987.5,"POLYGON ((75 10000, 100 10000, 100 9975, 75 99..."
4,5,1,5,1,112.5,9987.5,"POLYGON ((100 10000, 125 10000, 125 9975, 100 ..."
...,...,...,...,...,...,...,...
79995,79996,400,196,0,4887.5,12.5,"POLYGON ((4875 25, 4900 25, 4900 0, 4875 0, 48..."
79996,79997,400,197,0,4912.5,12.5,"POLYGON ((4900 25, 4925 25, 4925 0, 4900 0, 49..."
79997,79998,400,198,0,4937.5,12.5,"POLYGON ((4925 25, 4950 25, 4950 0, 4925 0, 49..."
79998,79999,400,199,0,4962.5,12.5,"POLYGON ((4950 25, 4975 25, 4975 0, 4950 0, 49..."


In [5]:
# Spatially join the true capture cells onto the base model grid (inner join,
# default sjoin predicate).  A grid cell matched by several truth cells appears
# once per match, so only the first row per 'node' is kept.
true_cap_to_base_grid = (
    gpd.sjoin(base_model_grid, true_cap, how='inner')
    .drop_duplicates(subset=['node'])
    .reset_index(drop=True)
    .drop('index_right', axis=1)
)
# Constant marker for every truth cell.
# NOTE(review): 101 presumably sits above any simulated probability (0-100) - confirm.
true_cap_to_base_grid['prob'] = 101
true_cap_to_base_grid

Unnamed: 0,node,row,column,idomain_1,x_left,y_left,geometry,x_right,y_right,prob
0,12680,64,80,1,1987.5,8412.5,"POLYGON ((1975 8425, 2000 8425, 2000 8400, 197...",1984.38,8396.88,101
1,12681,64,81,1,2012.5,8412.5,"POLYGON ((2000 8425, 2025 8425, 2025 8400, 200...",2003.12,8396.88,101
2,12682,64,82,1,2037.5,8412.5,"POLYGON ((2025 8425, 2050 8425, 2050 8400, 202...",2021.88,8396.88,101
3,12683,64,83,1,2062.5,8412.5,"POLYGON ((2050 8425, 2075 8425, 2075 8400, 205...",2053.12,8396.88,101
4,12684,64,84,1,2087.5,8412.5,"POLYGON ((2075 8425, 2100 8425, 2100 8400, 207...",2096.88,8396.88,101
...,...,...,...,...,...,...,...,...,...,...
6536,69508,348,108,1,2687.5,1312.5,"POLYGON ((2675 1325, 2700 1325, 2700 1300, 267...",2678.12,1309.38,101
6537,69509,348,109,1,2712.5,1312.5,"POLYGON ((2700 1325, 2725 1325, 2725 1300, 270...",2709.38,1315.62,101
6538,69510,348,110,1,2737.5,1312.5,"POLYGON ((2725 1325, 2750 1325, 2750 1300, 272...",2721.88,1315.62,101
6539,69511,348,111,1,2762.5,1312.5,"POLYGON ((2750 1325, 2775 1325, 2775 1300, 275...",2746.88,1321.88,101


##### Read IES shapefiles and compare to truth capture

In [6]:
# Names of all sub-folders of the working directory
folders = [entry.name for entry in os.scandir(cwd) if entry.is_dir()]

# Keep only scenario folders, grouped by prefix in this fixed order:
# every 'R25*' folder first, then 'C25*', 'R100*' and finally 'C100*'.
scenario_prefixes = ('R25', 'C25', 'R100', 'C100')
scenario_folders = [
    name
    for prefix in scenario_prefixes
    for name in folders
    if name.startswith(prefix)
]
print(scenario_folders)

['R25_pp10_real10', 'R25_pp10_real100', 'R25_pp10_real1000', 'R25_pp10_real2000', 'R25_pp10_real25', 'R25_pp10_real250', 'R25_pp10_real50', 'R25_pp10_real500', 'R25_pp25_real10', 'R25_pp25_real100', 'R25_pp25_real1000', 'R25_pp25_real2000', 'R25_pp25_real25', 'R25_pp25_real250', 'R25_pp25_real50', 'R25_pp25_real500', 'R25_pp50_real10', 'R25_pp50_real100', 'R25_pp50_real1000', 'R25_pp50_real2000', 'R25_pp50_real25', 'R25_pp50_real250', 'R25_pp50_real50', 'R25_pp50_real500', 'C25_pp10_real10', 'C25_pp10_real100', 'C25_pp10_real1000', 'C25_pp10_real2000', 'C25_pp10_real25', 'C25_pp10_real250', 'C25_pp10_real50', 'C25_pp10_real500', 'C25_pp25_real10', 'C25_pp25_real100', 'C25_pp25_real1000', 'C25_pp25_real2000', 'C25_pp25_real25', 'C25_pp25_real250', 'C25_pp25_real50', 'C25_pp25_real500', 'C25_pp50_real10', 'C25_pp50_real100', 'C25_pp50_real1000', 'C25_pp50_real2000', 'C25_pp50_real25', 'C25_pp50_real250', 'C25_pp50_real50', 'C25_pp50_real500', 'R100_pp10_real10', 'R100_pp10_real100', 'R10

In [7]:
# For every scenario folder, compare each simulated capture-zone shapefile
# ('*rej.shp') with the true capture zone at the probability thresholds
# 0, 10, ..., 100 and write one comparison shapefile per (threshold, input)
# back into the scenario folder as 'comp_prob_<threshold>_<input name>'.
#
# Each node of the outer merge is labelled in this priority order:
#   TruePositive  - truth and simulated geometry compare equal
#   FalsePositive - simulated prob is larger than the truth prob (0 when the node is not in the truth)
#   FalseNegative - truth prob is larger than the simulated prob (0 when the node is not simulated)
# A node matching none of these keeps a missing 'Compare' value.
prob_vals = np.arange(0,110,10)   # e.g. np.array([10, 50, 90]) for a reduced set

for scenario in scenario_folders:
    # Only raw simulation outputs are inputs.  The comparison files written
    # below also end in 'rej.shp', so they are excluded explicitly; otherwise
    # re-running this cell would try to process its own output.
    capzoneshpfiles = [
        os.path.join(scenario, fname)
        for fname in os.listdir(scenario)
        if os.path.isfile(os.path.join(scenario, fname))
        and fname.endswith('rej.shp')
        and not fname.startswith('comp_prob_')
    ]
    print(capzoneshpfiles)

    for shp_path in capzoneshpfiles:
        basename = os.path.basename(shp_path)
        capzone_all = gpd.read_file(shp_path, engine='pyogrio')

        for val in prob_vals:
            # Always threshold the full layer so the result does not depend on
            # the order in which thresholds are visited.
            capzoneshp = capzone_all[capzone_all['prob'] >= val]

            compare_df = pd.merge(true_cap_to_base_grid, capzoneshp, on='node', how='outer')
            # A node missing from one side has no prob there; treat it as 0.
            truth_prob = compare_df['prob_x'].fillna(0)
            sim_prob = compare_df['prob_y'].fillna(0)

            # Element-wise geometry equality (False whenever one side is missing).
            same_geom = np.array(
                [bool(gx == gy) for gx, gy in zip(compare_df['geometry_x'], compare_df['geometry_y'])],
                dtype=bool,
            )

            # Later assignments override earlier ones, so TruePositive wins.
            labels = pd.Series(np.nan, index=compare_df.index, dtype=object)
            labels.loc[(truth_prob > sim_prob).to_numpy()] = 'FalseNegative'
            labels.loc[(truth_prob < sim_prob).to_numpy()] = 'FalsePositive'
            labels.loc[same_geom] = 'TruePositive'
            compare_df['Compare'] = labels

            # Fill geometry values into 1 geocolumn: truth geometry where
            # available, otherwise the simulated one.
            missing_truth_geom = compare_df['geometry_x'].isnull()
            compare_df.loc[missing_truth_geom, 'geometry_x'] = compare_df.loc[missing_truth_geom, 'geometry_y']
            compare_df['geometry'] = compare_df['geometry_x']

            # Filter out columns to only those below
            compare_df = compare_df[['node','Compare','geometry']]
            compare_df.to_file(os.path.join(scenario,f'comp_prob_{val}_{basename}'))

['R25_pp10_real10\\R25_pp10_real10_0_rej.shp', 'R25_pp10_real10\\R25_pp10_real10_1_rej.shp', 'R25_pp10_real10\\R25_pp10_real10_2_rej.shp', 'R25_pp10_real10\\R25_pp10_real10_3_rej.shp', 'R25_pp10_real10\\R25_pp10_real10_4_rej.shp']
['R25_pp10_real100\\R25_pp10_real100_0_rej.shp', 'R25_pp10_real100\\R25_pp10_real100_1_rej.shp', 'R25_pp10_real100\\R25_pp10_real100_2_rej.shp', 'R25_pp10_real100\\R25_pp10_real100_3_rej.shp', 'R25_pp10_real100\\R25_pp10_real100_4_rej.shp']
['R25_pp10_real1000\\R25_pp10_real1000_0_rej.shp', 'R25_pp10_real1000\\R25_pp10_real1000_1_rej.shp', 'R25_pp10_real1000\\R25_pp10_real1000_2_rej.shp', 'R25_pp10_real1000\\R25_pp10_real1000_3_rej.shp', 'R25_pp10_real1000\\R25_pp10_real1000_4_rej.shp']
['R25_pp10_real2000\\R25_pp10_real2000_0_rej.shp', 'R25_pp10_real2000\\R25_pp10_real2000_1_rej.shp', 'R25_pp10_real2000\\R25_pp10_real2000_2_rej.shp', 'R25_pp10_real2000\\R25_pp10_real2000_3_rej.shp', 'R25_pp10_real2000\\R25_pp10_real2000_4_rej.shp']
['R25_pp10_real25\\R25_pp1

##### Get a summary at each probability

In [8]:
# For every scenario, count the TruePositive / FalsePositive / FalseNegative
# nodes in each comparison shapefile and write one table per iteration
# ('iteration_<n>.csv' in the scenario folder) with one column per shapefile.
n_iterations = 5   # shapefile names end in '_<iteration>_rej.shp', iteration = 0..4
compare_labels = ['TruePositive','FalsePositive','FalseNegative']
comp_markers = [f'comp_prob_{val}_' for val in range(0, 110, 10)]

for scenario in scenario_folders:
    print(scenario)
    files = [f for f in os.listdir(scenario) if os.path.isfile(os.path.join(scenario,f))]
    capzoneshpfiles = [os.path.join(scenario, f) for f in files if f.endswith('rej.shp')]

    # Keep only the comparison shapefiles.  Their order is left exactly as the
    # directory listing returns it, so the summary columns come out in the
    # same order as before.
    comp_capzoneshpfiles = [
        path for path in capzoneshpfiles
        if any(marker in path for marker in comp_markers)
    ]
    print(comp_capzoneshpfiles)

    for iteration in range(n_iterations):
        iteration_files = [path for path in comp_capzoneshpfiles if f'_{iteration}_rej.shp' in path]
        if not iteration_files:
            # Nothing to summarise; say so instead of failing on an empty list.
            print(f'{scenario}: no comparison shapefiles for iteration {iteration}, skipping')
            continue

        counts_df = pd.DataFrame({"Compare": compare_labels})
        for shp_path in iteration_files:
            current_real = gpd.read_file(shp_path, engine='pyogrio')
            countsloop = current_real['Compare'].value_counts().reset_index()
            # Name the count column after the shapefile it came from.
            countsloop.columns = ['Compare', os.path.basename(shp_path)]
            # Left merge keeps all three labels; a label absent from the file stays NaN.
            counts_df = counts_df.merge(countsloop, on='Compare', how="left")
        counts_df.to_csv(os.path.join(scenario, f'iteration_{iteration}.csv'))

R25_pp10_real10
['R25_pp10_real10\\comp_prob_0_R25_pp10_real10_0_rej.shp', 'R25_pp10_real10\\comp_prob_0_R25_pp10_real10_1_rej.shp', 'R25_pp10_real10\\comp_prob_0_R25_pp10_real10_2_rej.shp', 'R25_pp10_real10\\comp_prob_0_R25_pp10_real10_3_rej.shp', 'R25_pp10_real10\\comp_prob_0_R25_pp10_real10_4_rej.shp', 'R25_pp10_real10\\comp_prob_100_R25_pp10_real10_0_rej.shp', 'R25_pp10_real10\\comp_prob_100_R25_pp10_real10_1_rej.shp', 'R25_pp10_real10\\comp_prob_100_R25_pp10_real10_2_rej.shp', 'R25_pp10_real10\\comp_prob_100_R25_pp10_real10_3_rej.shp', 'R25_pp10_real10\\comp_prob_100_R25_pp10_real10_4_rej.shp', 'R25_pp10_real10\\comp_prob_10_R25_pp10_real10_0_rej.shp', 'R25_pp10_real10\\comp_prob_10_R25_pp10_real10_1_rej.shp', 'R25_pp10_real10\\comp_prob_10_R25_pp10_real10_2_rej.shp', 'R25_pp10_real10\\comp_prob_10_R25_pp10_real10_3_rej.shp', 'R25_pp10_real10\\comp_prob_10_R25_pp10_real10_4_rej.shp', 'R25_pp10_real10\\comp_prob_20_R25_pp10_real10_0_rej.shp', 'R25_pp10_real10\\comp_prob_20_R25_pp10

In [9]:
# Find the largest column total (sum of the counts in one shapefile column)
# across every iteration summary of every scenario.  This value will be the
# basis for True Negatives, otherwise there are a LOT of true negatives.
per_scenario_max = []
for scenario in scenario_folders:
    print(scenario)
    iterfiles = [
        os.path.join(scenario, name)
        for name in os.listdir(scenario)
        if os.path.isfile(os.path.join(scenario, name)) and name.startswith('iteration_')
    ]
    column_total_maxima = []
    for csv_path in iterfiles:
        counts = pd.read_csv(csv_path).fillna(0)
        # Append the numeric column sums as an extra row (label '3', position 3).
        counts.loc['3'] = counts.sum(numeric_only=True)
        totals = counts.drop(['Unnamed: 0', 'Compare'], axis=1).iloc[3]
        column_total_maxima.append(totals.max())
    per_scenario_max.append(max(column_total_maxima))

# All scenario max value
scenario_max_val = max(per_scenario_max)
print(scenario_max_val)

R25_pp10_real10
R25_pp10_real100
R25_pp10_real1000
R25_pp10_real2000
R25_pp10_real25
R25_pp10_real250
R25_pp10_real50
R25_pp10_real500
R25_pp25_real10
R25_pp25_real100
R25_pp25_real1000
R25_pp25_real2000
R25_pp25_real25
R25_pp25_real250
R25_pp25_real50
R25_pp25_real500
R25_pp50_real10
R25_pp50_real100
R25_pp50_real1000
R25_pp50_real2000
R25_pp50_real25
R25_pp50_real250
R25_pp50_real50
R25_pp50_real500
C25_pp10_real10
C25_pp10_real100
C25_pp10_real1000
C25_pp10_real2000
C25_pp10_real25
C25_pp10_real250
C25_pp10_real50
C25_pp10_real500
C25_pp25_real10
C25_pp25_real100
C25_pp25_real1000
C25_pp25_real2000
C25_pp25_real25
C25_pp25_real250
C25_pp25_real50
C25_pp25_real500
C25_pp50_real10
C25_pp50_real100
C25_pp50_real1000
C25_pp50_real2000
C25_pp50_real25
C25_pp50_real250
C25_pp50_real50
C25_pp50_real500
R100_pp10_real10
R100_pp10_real100
R100_pp10_real1000
R100_pp10_real2000
R100_pp10_real25
R100_pp10_real250
R100_pp10_real50
R100_pp10_real500
R100_pp25_real10
R100_pp25_real100
R100_pp25_re

In [10]:
# Add a 'TrueNegative' row to every iteration summary: the all-scenario
# maximum minus each column's total.  The table is then transposed and
# written back over the same csv without a header line.
for scenario in scenario_folders:
    print(scenario)
    iterfiles = [
        os.path.join(scenario, name)
        for name in os.listdir(scenario)
        if os.path.isfile(os.path.join(scenario, name)) and name.startswith('iteration_')
    ]
    for csv_path in iterfiles:
        summary = pd.read_csv(csv_path).fillna(0)
        # Extra row (label '3', position 3) holding the numeric column sums
        summary.loc['3'] = summary.sum(numeric_only=True)
        summary = summary.drop('Unnamed: 0', axis=1)
        summary.iloc[3] = scenario_max_val - summary.iloc[3]
        # First column is 'Compare'; label the new row there
        summary.iloc[3, 0] = 'TrueNegative'
        # Transpose dataframe and save
        summary.T.to_csv(csv_path, header=None)

R25_pp10_real10
R25_pp10_real100
R25_pp10_real1000
R25_pp10_real2000
R25_pp10_real25
R25_pp10_real250
R25_pp10_real50
R25_pp10_real500
R25_pp25_real10
R25_pp25_real100
R25_pp25_real1000
R25_pp25_real2000
R25_pp25_real25
R25_pp25_real250
R25_pp25_real50
R25_pp25_real500
R25_pp50_real10
R25_pp50_real100
R25_pp50_real1000
R25_pp50_real2000
R25_pp50_real25
R25_pp50_real250
R25_pp50_real50
R25_pp50_real500
C25_pp10_real10
C25_pp10_real100
C25_pp10_real1000
C25_pp10_real2000
C25_pp10_real25
C25_pp10_real250
C25_pp10_real50
C25_pp10_real500
C25_pp25_real10
C25_pp25_real100
C25_pp25_real1000
C25_pp25_real2000
C25_pp25_real25
C25_pp25_real250
C25_pp25_real50
C25_pp25_real500
C25_pp50_real10
C25_pp50_real100
C25_pp50_real1000
C25_pp50_real2000
C25_pp50_real25
C25_pp50_real250
C25_pp50_real50
C25_pp50_real500
R100_pp10_real10
R100_pp10_real100
R100_pp10_real1000
R100_pp10_real2000
R100_pp10_real25
R100_pp10_real250
R100_pp10_real50
R100_pp10_real500
R100_pp25_real10
R100_pp25_real100
R100_pp25_re

In [11]:
# Do some stats on each iteration summary and write them back to the same csv.
#
# Added columns: TPR (True Positive Rate), FPR (False Positive Rate),
# PPV (Positive Predictive Value), FOR (False Omission Rate), F1 (F1-score),
# ACC (Accuracy), MCC (Matthews correlation coefficient), Prob and Group.

# Scenario-folder name fragment -> group number.  Checked in this order; a
# later match overrides an earlier one.
group_by_fragment = {
    'R25_pp10': 1, 'R100_pp10': 2, 'C25_pp10': 3, 'C100_pp10': 4,
    'R25_pp25': 5, 'R100_pp25': 6, 'C25_pp25': 7, 'C100_pp25': 8,
    'R25_pp50': 9, 'R100_pp50': 10, 'C25_pp50': 11, 'C100_pp50': 12,
}

for scenario in scenario_folders:
    print(scenario)
    iterfiles = [
        os.path.join(scenario, name)
        for name in os.listdir(scenario)
        if os.path.isfile(os.path.join(scenario, name)) and name.startswith('iteration_')
    ]
    for csv_path in iterfiles:
        stats = pd.read_csv(csv_path)

        # Each row is named 'comp_prob_<threshold>_...'.  Read the threshold
        # from the name and order the rows by it, rather than assuming the
        # directory listing put the prob-100 row in second position.
        threshold = stats['Compare'].str.split('_').str[2].astype(int)
        stats = stats.loc[threshold.sort_values(kind='mergesort').index].reset_index(drop=True)

        tp = stats['TruePositive']
        fp = stats['FalsePositive']
        fn = stats['FalseNegative']
        tn = stats['TrueNegative']

        stats['TPR'] = tp / (tp + fn)
        stats['FPR'] = fp / (fp + tn)
        stats['PPV'] = tp / (tp + fp)
        stats['FOR'] = fn / (tn + fn)
        # F1 = 2TP / (2TP + FP + FN); the denominator doubles TP only.
        stats['F1'] = (2*tp) / (2*tp + fp + fn)
        stats['ACC'] = (tp + tn) / (tp + fp + tn + fn)
        # MCC = sqrt(TPR*TNR*PPV*NPV) - sqrt(FNR*FPR*FOR*FDR)
        stats['MCC'] = (
            (stats['TPR']*(1-stats['FPR'])*stats['PPV']*(1-stats['FOR']))**0.5
            - ((1-stats['TPR'])*stats['FPR']*stats['FOR']*(1-stats['PPV']))**0.5
        )
        stats['Prob'] = stats['Compare'].str.split('_').str[2].astype(int)

        # Add Group number column (left out when no fragment matches)
        scenario_dir = os.path.dirname(csv_path)
        for fragment, group in group_by_fragment.items():
            if fragment in scenario_dir:
                stats['Group'] = group

        # Save back to csv
        stats.to_csv(csv_path, index=False)

R25_pp10_real10
R25_pp10_real100
R25_pp10_real1000
R25_pp10_real2000
R25_pp10_real25
R25_pp10_real250
R25_pp10_real50
R25_pp10_real500
R25_pp25_real10
R25_pp25_real100
R25_pp25_real1000
R25_pp25_real2000
R25_pp25_real25
R25_pp25_real250
R25_pp25_real50
R25_pp25_real500
R25_pp50_real10
R25_pp50_real100
R25_pp50_real1000
R25_pp50_real2000
R25_pp50_real25
R25_pp50_real250
R25_pp50_real50
R25_pp50_real500
C25_pp10_real10
C25_pp10_real100
C25_pp10_real1000
C25_pp10_real2000
C25_pp10_real25
C25_pp10_real250
C25_pp10_real50
C25_pp10_real500
C25_pp25_real10
C25_pp25_real100
C25_pp25_real1000
C25_pp25_real2000
C25_pp25_real25
C25_pp25_real250
C25_pp25_real50
C25_pp25_real500
C25_pp50_real10
C25_pp50_real100
C25_pp50_real1000
C25_pp50_real2000
C25_pp50_real25
C25_pp50_real250
C25_pp50_real50
C25_pp50_real500
R100_pp10_real10
R100_pp10_real100
R100_pp10_real1000
R100_pp10_real2000
R100_pp10_real25
R100_pp10_real250
R100_pp10_real50
R100_pp10_real500
R100_pp25_real10
R100_pp25_real100
R100_pp25_re

In [12]:
# Read every iteration summary of every scenario and stack them into one
# dataframe with a fresh 0..n-1 index.
list_of_df = []
for scenario in scenario_folders:
    print(scenario)
    iterfiles = [
        os.path.join(scenario, name)
        for name in os.listdir(scenario)
        if os.path.isfile(os.path.join(scenario, name)) and name.startswith('iteration_')
    ]
    list_of_df.extend(pd.read_csv(csv_path) for csv_path in iterfiles)

combined_df = pd.concat(list_of_df, ignore_index=True)

R25_pp10_real10
R25_pp10_real100
R25_pp10_real1000
R25_pp10_real2000
R25_pp10_real25
R25_pp10_real250
R25_pp10_real50
R25_pp10_real500
R25_pp25_real10
R25_pp25_real100
R25_pp25_real1000
R25_pp25_real2000
R25_pp25_real25
R25_pp25_real250
R25_pp25_real50
R25_pp25_real500
R25_pp50_real10
R25_pp50_real100
R25_pp50_real1000
R25_pp50_real2000
R25_pp50_real25
R25_pp50_real250
R25_pp50_real50
R25_pp50_real500
C25_pp10_real10
C25_pp10_real100
C25_pp10_real1000
C25_pp10_real2000
C25_pp10_real25
C25_pp10_real250
C25_pp10_real50
C25_pp10_real500
C25_pp25_real10
C25_pp25_real100
C25_pp25_real1000
C25_pp25_real2000
C25_pp25_real25
C25_pp25_real250
C25_pp25_real50
C25_pp25_real500
C25_pp50_real10
C25_pp50_real100
C25_pp50_real1000
C25_pp50_real2000
C25_pp50_real25
C25_pp50_real250
C25_pp50_real50
C25_pp50_real500
R100_pp10_real10
R100_pp10_real100
R100_pp10_real1000
R100_pp10_real2000
R100_pp10_real25
R100_pp10_real250
R100_pp10_real50
R100_pp10_real500
R100_pp25_real10
R100_pp25_real100
R100_pp25_re

In [13]:
# Break the name stored in 'Compare' into its eight underscore-separated parts
name_parts = combined_df['Compare'].str.split('_', expand=True)
name_parts.columns = ['comp','prob','prob1','obs','pp','reals','iter','rej']
# The first three parts are not needed
name_parts = name_parts.drop(columns=['comp','prob','prob1'])
# Strip the text from the iter and reals parts and store them as integers
name_parts['iter'] = name_parts['iter'].str.replace('_rej.shp','', regex=False).astype(int)
name_parts['reals'] = name_parts['reals'].str.replace('real','', regex=False).astype(int)
# Attach the remaining parts (obs, pp, reals, iter, rej) to the combined table
for part in name_parts.columns:
    combined_df[part] = name_parts[part]

# Save to summary stats
combined_df.to_csv('Summary_Stats.csv')