#### Removing spatially autocorrelated training and validation points

This script removes points that are less than 30 m from another point that has the same val/training status and the same location-date.

The point data were prepared for this script with the Near tool, using val/training status and location-date as the match fields.

In [8]:

import pandas as pd

# Distance cutoff, in metres, used by the filtering cells further down
threshold = 30

# Every training point, together with its near distance and near feature id.
# NOTE: the name `all` shadows Python's built-in all() for the rest of the notebook.
all = pd.read_csv("data/AllTraining_080425prj.csv")

# Only the points that sit less than 30m from another point on the same date/location
# (clusters of more than two points are included as well).
# Joining this subset back to the full dataset didn't work well, so the full point
# file loaded above is the recommended input.
df = pd.read_csv("data/AllTraining_080425prj_closepts.csv")

# all

#### Pull in full dataset, remove points that are too close together
Preferentially remove points that are paired with more than one other point 

In [None]:
# Choose which point of every too-close pair to drop.
#
# For each row whose nearest feature lies closer than `threshold`:
#   * if either point of the pair is already marked for deletion, the pair is
#     considered resolved and nothing more is removed;
#   * otherwise the point that is also the NEAR_FID of some *other* row (i.e. it
#     is paired with more than one point) is removed;
#   * if both or neither point is referenced elsewhere, the higher id is removed.
# Rows are visited in file order, so the result depends on that order.
delete_ids = set()
processed_pairs = set()  # every close pair visited; recorded only, never consulted here

# Number of rows that name each feature id as their NEAR_FID (nulls are not counted).
# Built once so each pair costs two dictionary lookups instead of two full scans
# of the NEAR_FID column.
near_fid_counts = all['NEAR_FID'].value_counts().to_dict()

for oid, near_fid, near_dist in zip(all['Orig_FID'], all['NEAR_FID'], all['NEAR_DIST']):
    # No near feature recorded, or it is not closer than the threshold: nothing to do
    if pd.isnull(near_fid) or not (near_dist < threshold):
        continue

    oid_int, near_int = int(oid), int(near_fid)
    pair = tuple(sorted([oid_int, near_int]))
    processed_pairs.add(pair)

    # One member of this pair is already going away, so the conflict is remedied
    if oid_int in delete_ids or near_int in delete_ids:
        continue

    # "Appears elsewhere" means: is the NEAR_FID of at least one row other than
    # the current one. The current row's own NEAR_FID is subtracted from the
    # precomputed count wherever it matches the id being looked up.
    oid_in_near_fid = near_fid_counts.get(oid_int, 0) - int(near_fid == oid_int) > 0
    near_fid_in_near_fid = near_fid_counts.get(near_int, 0) - int(near_fid == near_int) > 0

    if oid_in_near_fid and not near_fid_in_near_fid:
        # Only oid is paired with additional points: remove it
        delete_ids.add(oid_int)
    elif near_fid_in_near_fid and not oid_in_near_fid:
        # Only near_fid is paired with additional points: remove it
        delete_ids.add(near_int)
    else:
        # Both or neither appear elsewhere: remove the point with the higher id
        delete_ids.add(max(pair))


# Keep every point that was not selected for deletion
filtered_df = all[~all['Orig_FID'].isin(delete_ids)]

# Save the cleaned data
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column
# by default; pass index=False if downstream readers should not receive it.
filtered_df.to_csv("data/AllTraining_080425prj_FilteredPts.csv")

print(f"Deleted {len(delete_ids)} points within {threshold} meters of another point.")
filtered_df


# save the points that were deleted as a csv
# deleted_df = df[df['Orig_FID'].isin(delete_ids)]
# deleted_df.to_csv("data/AllTraining_080425prj_DeletedPts.csv")

Deleted 744 points within 30 meters of another point.


Unnamed: 0,Location,Date,Type,Region,Latitude,Longitude,Split,Date_Loc,NEAR_FID,NEAR_DIST,Orig_FID
0,CA,5/26/2023,other,Morgan Hill,37.188061,-121.725901,1,2023-05-26 CA,,,1
1,CA,5/26/2023,other,Morgan Hill,37.189068,-121.724510,1,2023-05-26 CA,,,2
2,CA,5/26/2023,other,Morgan Hill,37.190223,-121.722561,1,2023-05-26 CA,,,3
3,CA,5/26/2023,other,Morgan Hill,37.187451,-121.719603,1,2023-05-26 CA,,,4
4,CA,5/26/2023,other,Morgan Hill,37.189402,-121.726455,1,2023-05-26 CA,,,5
...,...,...,...,...,...,...,...,...,...,...,...
4992,Santa Maria,2/6/2021,mulch,,34.990410,-120.497203,2,2021-02-06 Santa Maria,,,5899
4993,Santa Maria,2/6/2021,mulch,,34.990676,-120.501473,2,2021-02-06 Santa Maria,,,5900
4994,Santa Maria,2/6/2021,mulch,,34.989570,-120.501519,2,2021-02-06 Santa Maria,,,5901
4995,Santa Maria,2/6/2021,mulch,,34.990623,-120.502547,2,2021-02-06 Santa Maria,,,5902


In [None]:
# Baseline approach, kept for comparison: for every close pair, drop the point with
# the higher id. It does not favour removing points that belong to more than one
# pair, and it ignores whether a member of the pair is already marked for deletion.

delete_ids = set()        # ids selected for removal
processed_pairs = set()   # (lower id, higher id) pairs that have been handled

for oid, near_fid, near_dist in zip(df['Orig_FID'], df['NEAR_FID'], df['NEAR_DIST']):
    # Skip rows without a near feature or whose near feature is not under the threshold
    if pd.isnull(near_fid) or not (near_dist < threshold):
        continue

    pair = tuple(sorted([int(oid), int(near_fid)]))
    # Each unordered pair is handled once, whichever of its two rows comes first
    if pair in processed_pairs:
        continue

    processed_pairs.add(pair)
    delete_ids.add(max(pair))  # the higher id of the pair is the one removed

# Boolean mask of the rows chosen for deletion, shared by both selections below
marked_for_deletion = df['Orig_FID'].isin(delete_ids)

# Points that survive the filter
filtered_df = df[~marked_for_deletion]

print(f"Deleted {len(delete_ids)} points within {threshold} meters of another point.")

# Write the removed points to their own csv
# NOTE(review): the DataFrame index is written as an extra unnamed column by default.
deleted_df = df[marked_for_deletion]
deleted_df.to_csv("data/AllTraining_080425prj_DeletedPts2.csv")


Deleted 941 points within 30 meters of another point.
