In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os 
# Show the current working directory (relative paths used later resolve against it)
os.getcwd()

# 1. Read the file produced by `assign_polys.py`

In [None]:
# Path to the CSV data file that is read below.
# Windows paths contain backslashes; inside a regular Python string each one
# must be escaped by doubling it ("\\"), as done here, to avoid errors.
data_path = Path("C:\\Users\\kodonnel\\Documents\\Projects\\ToePulse\\TP_testSpatialJoin.csv")

# Load the CSV into a dataframe and preview the first few rows
df_data = pd.read_csv(data_path)
df_data.head()

# 2. A little bit of clean up

It doesn't make sense to compute statistics for some columns in the dataset (like latitude and longitude), so let's identify the columns we don't want to include and drop them from the dataframe.

*Note: we could do this the other way - by identifying the columns we want to keep - but it is my hope that the columns we want to drop will be more or less consistent from dataset to dataset, more so than the constituent columns. So hopefully identifying the columns to drop will mean that little or no modification is needed from dataset to dataset. We will see...*

In [None]:
# First let's list every column name in the dataframe so we can pick out the ones to drop
df_data.columns.values

In [None]:
# Columns that should NOT take part in the statistics (timestamps, coordinates,
# and bookkeeping columns). The names were copy/pasted from the column listing
# above; adjust them if your dataset uses different names.
cols_to_drop = [
    'FTS Timestamp',
    'FTS Latitude',
    'FTS Longitude',
    'UCI Timestamp',
    'index_right',
    'Shape_Leng',
    'Shape_Area',
]

In [None]:
# Remove the unwanted columns, then list what is left as a sanity check
df_data = df_data.drop(columns=cols_to_drop)
df_data.columns.values

Cool now we just have columns that we are going to use in our statistics computation. Now we need to identify the column that contains the polygon id that was assigned by `assign_polys.py`.

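In this case, the code below uses **`'Poly_ID2'`**; depending on how the spatial join was run, the column may instead be named **`'CL_ID'`**.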

But keep in mind that this could be different in a different dataset. Look for something that looks like it means "centerline id" or "polygon id".

We can't do anything with rows that do not have a polygon id, so the next step will be to drop any rows with a missing polygon id

In [None]:
# Name of the column that holds the polygon id.
# Change this to the name you identified in the column list above.
poly_id_col = "Poly_ID2"

# Print the row count before and after discarding rows that have no polygon id
print(df_data.shape[0])
df_data = df_data.dropna(subset=[poly_id_col])
print(df_data.shape[0])

We can see that the length updated if there were any rows missing a polygon id. Now we are ready to compute statistics.

# 3. Compute statistics: Run 1

In [None]:
# First group the rows by polygon id (poly_id_col) so statistics can be computed per polygon
df_grouped = df_data.groupby(poly_id_col)

In [None]:
# Statistics to compute for every polygon, given as a list that can be handed
# straight to the pandas aggregation function.
# For what is allowed in this list see:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html
#KO: added std to stats list 5/19/2020
stats = ["min", "max", "mean", "median", "std"]

# Compute the statistics defined above for each polygon.
# The resulting columns are hierarchical: (original column, statistic).
df_stats = df_grouped.agg(stats)

In [None]:
# Collapse each hierarchical (column, statistic) label into one string such as
# "<column> <statistic>"
df_stats.columns = [
    ' '.join(label_parts).strip()
    for label_parts in df_stats.columns
]

In [None]:
# Preview the first few rows of the per-polygon statistics
df_stats.head()

# 4. Outlier removal and re-run stats

Merge `df_data` (the original input data) with `df_stats` on the polygon id.
<br>Might need to change Poly_ID2 to CL_ID, depending on how the spatial join code runs?

Filter out all field values that lie more than 2 times the standard deviation (2xstd) away from the mean of their polygon.

That is, a field value is treated as an outlier if the absolute difference between the field value and its corresponding polygon mean value is greater
than 2xstd of that polygon.

All field values considered outliers are replaced with NaN. 

This process is iterative - user determines how many times to run the loop, and thus how many times to re-filter data for outlier removal. Mean and standard deviation are re-computed with each loop and the 'new' values are used to determine outliers. 

Summation of all standard deviations is printed to track convergence and can be used as a guide for the number of times to loop.

Each loop overwrites the filtered data and statistics from the previous loop, so only the results of the final loop are written to the output file.

In [None]:
# Iteratively remove outliers and recompute the per-polygon statistics.
#
# On every pass each value is compared with the current mean and standard
# deviation of its polygon; values further than 2 x std from the mean are
# replaced with NaN and the statistics are recomputed from the filtered data.
# The statistics from the final pass end up in df_stats_filtered.
#
# Run the loop the desired number of times: adjust num_iterations until the
# value printed on each pass stops changing (i.e. it has converged).
num_iterations = 12

# Columns that are never filtered (identifiers / coordinates, not measurements).
# poly_id_col is used instead of a hard-coded name so that changing it in the
# clean-up step is enough.
skip_cols = {'Location', poly_id_col, 'Latitude', 'Longitude'}

tmp_df_data = df_data.copy()  # copy of original df_data to preserve data
tmp_df_stats = df_stats.copy()
for num in range(num_iterations):

    # Attach the current per-polygon stats to every row of the ORIGINAL data.
    # NOTE(review): each pass re-tests df_data (not the already-filtered
    # tmp_df_data) against the latest stats, so a value removed in an earlier
    # pass can be re-admitted if the polygon mean shifts - confirm this is intended.
    df_merged = df_data.merge(
        tmp_df_stats,
        how='outer',
        left_on=poly_id_col,
        right_on=poly_id_col,
    )

    convergence = 0
    for col in df_data.columns:
        # skip location, poly_id, lat, long columns when filtering data
        if col in skip_cols:
            continue

        # names of the merged columns holding this field's per-polygon stats
        col_mean = col + ' mean'
        col_std = col + ' std'  # standard deviation

        # outlier threshold: 2x the standard deviation of the row's polygon
        col_2xstd = 2 * df_merged[col_std]

        # absolute distance between each value and the mean of its polygon
        col_diff_mean = (df_merged[col] - df_merged[col_mean]).abs()

        # values further than 2xstd from the mean are outliers -> NaN.
        # Series.mask is used instead of chained assignment
        # (df_merged[col][mask] = ...), which may write to a temporary copy
        # and silently leave df_merged unchanged.
        remove_mask = col_diff_mean > col_2xstd
        df_merged[col] = df_merged[col].mask(remove_mask)

        # running total of the 2xstd thresholds over all rows and fields;
        # printed below to track convergence between passes
        convergence += col_2xstd.sum()
    print("Sum of standard deviations converging on: ", convergence)

    # keep only the columns that exist in df_data, dropping the stats columns
    # that were merged in for filtering
    tmp_df_data = df_merged[df_data.columns]

    # Re-run stats on filtered data: one group per polygon
    df_grouped_filtered = tmp_df_data.groupby(poly_id_col)

    # Compute the statistics defined above for each polygon
    tmp_df_stats = df_grouped_filtered.agg(stats)

    # Flatten the hierarchical columns
    tmp_df_stats.columns = [' '.join(col).strip() for col in tmp_df_stats.columns.values]

# change name of dataframe for clarity after processing
df_stats_filtered = tmp_df_stats


In [None]:
# Write the stats for each polygon, based on the final iteration of the loop above,
# to an Excel workbook (relative path, so it is created in the current working directory)

df_stats_filtered.to_excel('data_stats_filtered.xlsx')