In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os 
import string
os.getcwd()

# Path to data file
# Note that you may need to add an extra slash ("\") in front of the existing slashes to avoid errors
data_path = Path("C:\\Users\\jsoto\\DOI\\BGC Projects (v3) - Documents\\Mapping Data Workflow\\Mapping Directory for Spatial Join and Statistics\\Toe Pulse 2018\\Data\\Spatial Join\\2018-10-04_fts_20secMed_spatialJoin.csv")

df_data = pd.read_csv(data_path)
df_data.head()

# First lets see what columns we have
df_data.columns.values

# Make a variable for the polygon id column name
# Change this to the name you identified in the list above
poly_id_col = "Poly_ID"

print(len(df_data))
df_data = df_data.dropna(subset=[poly_id_col])
print(len(df_data))

# Now lets make a list of the ones we don't want to include
# I am just copy/pasting non-constituent columns from above into this list
cols_to_drop = [
    'FTS Timestamp', 'Latitude', 'Longitude', 'index_right',
       'Location', 'Shape_Leng', 'Shape_Area'
]

df_data = df_data.drop(cols_to_drop, axis="columns")
df_data.columns.values


# Compute Stats
# First group the data by poly_id_col
df_grouped = df_data.groupby(poly_id_col)

# Now lets define the statistics we want to compute in a list that we can pass to the pandas aggregation function
# For more information on what can go into this list check out: 
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html
#KO: added std to stats list 5/19/2020

stats = [
    "min",
    "max",
    "mean",
    "median",
    "std"
]

# Compute the statistics defined above for each polygon
df_stats = df_grouped.agg(stats)

df_stats

# Flatten the hierarchical columns
df_stats.columns = [' '.join(col).strip() for col in df_stats.columns.values]

df_stats.head()

# run the loop desired number of times. final output file will have the stats based on the final loop. 
# adjust number of iterations to make sure values converge using printed stat below
num_iterations = 12
tmp_df_data = df_data.copy() #copy of original df_data to preserve data
tmp_df_stats = df_stats.copy() 
for num in range(num_iterations):
    
    # merge original data and stats dataframes on poly ID
    df_merged = df_data.merge(tmp_df_stats, how = 'outer', left_on = 'Poly_ID', right_on = 'Poly_ID', )
    
    convergence = 0
    for col in df_data.columns:
        # skip location, poly_id, lat, long columns when filtering data
        if col in ('Location','Poly_ID','Latitude','Longitude'):
            continue 

        # creates series to hold mean and standard deviation values
        col_mean = col + ' mean' 
        col_std = col + ' std' # standard deviation

        # calculates and creates a column for 2x std
        col_2xstd = 2*df_merged[col_std]

        # calculates the difference between the instantaneous value and the mean value of its corresponding polygon
        col_diff_mean = abs(df_merged[col] - df_merged[col_mean])

        # if the difference between the instantenous value and the mean is greater than 2xstd replace the value with a NAN
        remove_mask = col_diff_mean > col_2xstd
        df_merged[col][remove_mask] = np.nan   
        convergence += col_2xstd.sum()
    print("Sum of standard deviations converging on: ", convergence)

    # creates dataframe, df_data_filtered
    # dropping extra columns created during filtering
    # filtered data retained, and columns kept are based on columns that existed in df_data
    tmp_df_data = df_merged[df_data.columns]
    
    # Re-run stats on filtered data
    df_grouped_filtered = tmp_df_data.groupby(poly_id_col)

    # Compute the statistics defined above for each polygon
    tmp_df_stats = df_grouped_filtered.agg(stats)
    
    # Flatten the hierarchical columns
    tmp_df_stats.columns = [' '.join(col).strip() for col in tmp_df_stats.columns.values]

# change name of dataframe for clarity after processing
df_stats_filtered = tmp_df_stats

tmp_df_stats

# stats for each polygon based on final iteration of above loop
# Write output file
out_dir = "C:\\Users\\jsoto\\DOI\\BGC Projects (v3) - Documents\\Mapping Data Workflow\\Mapping Directory for Spatial Join and Statistics\\Toe Pulse 2018\\Data\\Statistics"
out_fname = data_path.name.split(".")[0] + "_stats.csv"
out_path = Path(out_dir, out_fname)

# Flatten the hierarchical columns
df_stats_filtered.columns = [''.join(col).strip() for col in df_stats_filtered.columns.values]


# Write the csv
df_stats_filtered.to_csv(out_path)

# use a left join to append lat, long values of delta polygon center points
# Path to center point file
# Note that you may need to add an extra slash ("\") in front of the existing slashes to avoid errors
point_path = Path("C:\\Users\\jsoto\\DOI\\BGC Projects (v3) - Documents\\Mapping Data Workflow\\Mapping Directory for Spatial Join and Statistics\\Center Points xlsx\\DeltaCenterPoints_01.xlsx")

df_point = pd.read_excel(point_path)
df_point.head()

merged_left = pd.merge(left=df_stats_filtered, right=df_point, how='left', left_on='Poly_ID', right_on='CL_ID')


# Remove punctuation
merged_left.columns = merged_left.columns.str.strip().str.replace('[^\w\s]', '')
merged_left

# stats for each polygon based on final iteration of above loop joined with CL_ID
# Write output file
out_dir2 = "C:\\Users\\jsoto\\DOI\\BGC Projects (v3) - Documents\\Mapping Data Workflow\Mapping Directory for Spatial Join and Statistics\\Toe Pulse 2018\\Data\\CL_ID Join"
out_fname2 = data_path.name.split(".")[0] + "_stats_join.csv"
out_path2 = Path(out_dir2, out_fname2)



# Flatten the hierarchical columns
merged_left.columns = [''.join(col).strip() for col in merged_left.columns.values]

# Write the csv
merged_left.to_csv(out_path2)






9432
9415
Sum of standard deviations converging on:  194169.70012091252
Sum of standard deviations converging on:  162807.56002930983


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Sum of standard deviations converging on:  154326.34823196928
Sum of standard deviations converging on:  149518.32289899004
Sum of standard deviations converging on:  146507.3594203793
Sum of standard deviations converging on:  144924.38801379502
Sum of standard deviations converging on:  144163.78425121625
Sum of standard deviations converging on:  143682.12211611963
Sum of standard deviations converging on:  143479.71012407992
Sum of standard deviations converging on:  143325.36335703358
Sum of standard deviations converging on:  143191.29028805584
Sum of standard deviations converging on:  142962.24485037566
