### Determining optimal threshold
12.3.24

Testing all of the possible threshold values between 0 and 1.0 for hoop and mulch based on our training data

In [1]:
import pandas as pd
import numpy as np

In [8]:
# import datasets
# Santa Maria
sm_csv = 'N:/OCEANS_Program/Plastics/Agricultural_Plastics/AgPlastics_Pro/thresholding/SantaMaria_val_o_withPlasticProbs_p95.csv'
sm = pd.read_csv(sm_csv)

# Watsonville (training)
wv_csv = 'N:/OCEANS_Program/Plastics/Agricultural_Plastics/AgPlastics_Pro/thresholding/Watsonville_Points_BM_withPlasticProbs_p95_WY2022.csv'
wv = pd.read_csv(wv_csv)

# Oxnard 
ox_csv = 'N:/OCEANS_Program/Plastics/Agricultural_Plastics/AgPlastics_Pro/thresholding/Oxnard_Points_withPlasticProbs_p95_WY2022.csv'
ox = pd.read_csv(ox_csv)
# Convert 'blackmulch' to 'mulch' in the 'Type' column
ox['Type'] = ox['Type'].replace('blackmulch', 'mulch')

# Watsonville (random points)
wr_csv = 'N:/OCEANS_Program/Plastics/Agricultural_Plastics/AgPlastics_Pro/thresholding/Wats_RandomPoints_GE2021.csv'
wr = pd.read_csv(wr_csv)
# Convert name of column from Name to Type
wr.rename(columns={'Name': 'Type'}, inplace=True)

# Watsonville (training) WY 2021
wv_csv21 = 'N:/OCEANS_Program/Plastics/Agricultural_Plastics/AgPlastics_Pro/thresholding/Watsonville_Points_BM_withPlasticProbs_p95_WY2021.csv'
wv_21 = pd.read_csv(wv_csv21)
wv_21.drop(columns=['hoop_p95', 'mulch_p95'], inplace=True)
wv_21.rename(columns={'hoop_p95_WY21': 'hoop_p95', 'mulch_p95_WY21': 'mulch_p95'}, inplace=True)

# print("Unique values in 'Type' column for Santa Maria dataset:", sm['Type'].unique())
# print("Unique values in 'Type' column for Watsonville dataset:", wv['Type'].unique())
# print("Unique values in 'Type' column for Oxnard dataset:", ox['Type'].unique())
# print("Unique values in 'Type' column for Wats Random dataset:", wr['Type'].unique())

# Define which dataset to use
# data = ox
data = pd.concat([ox, wv, sm], ignore_index=True)
# data = wv_21
# data = pd.concat([ox, wv, sm, wr, wv_21], ignore_index=True)


In [9]:
print(f"Number of rows: {data.shape[0]}")

Number of rows: 1247


#### Comparing mulch and hoop confusion

If we take points that are classified as both mulch and hoop based on whatever thresholds we design, 
it will work most of the time. When a point is classified higher as mulch, it is mulch. 
There is some confusion:

- In Santa Maria, before thresholding, this approach would misclassify hoop as mulch 1.3% of the time, and misclassify mulch as hoop 7.4% of the time
- In Watsonville, before thresholding, this approach misclassifies hoop as mulch 0.6% of the time, and misclassifies mulch as hoop 0 times
- In Oxnard, no confusion
- Overall, 1% of hoop points would be misclassified as mulch and 2% of mulch points would be misclassified as hoop

In [10]:
hoop = data[data['Type'] == 'hoop']
hoop_count = hoop[hoop['hoop_p95'] > hoop['mulch_p95']].shape[0]
mulch_count = hoop[hoop['mulch_p95'] > hoop['hoop_p95']].shape[0]

if hoop_count == 0: # oxnard has no hoop points in 2022
    print('No points classified as "hoop":')
else:
    print('Of the points that are classified as "hoop"')
    print(f"Hoop count: {hoop_count}")
    print(f"Mulch count: {mulch_count}")
    print(f'Confusion rate: {(mulch_count / (hoop_count + mulch_count))*100:.2f}%')

mulch = data[data['Type'] == 'mulch']
hoop_count = mulch[mulch['hoop_p95'] > mulch['mulch_p95']].shape[0]
mulch_count = mulch[mulch['mulch_p95'] > mulch['hoop_p95']].shape[0]

print('Of the points that are classified as "mulch":')
print(f"Hoop count: {hoop_count}")
print(f"Mulch count: {mulch_count}")
print(f'Confusion rate: {(hoop_count / (hoop_count + mulch_count))*100:.2f}%')


Of the points that are classified as "hoop"
Hoop count: 296
Mulch count: 3
Confusion rate: 1.00%
Of the points that are classified as "mulch":
Hoop count: 11
Mulch count: 482
Confusion rate: 2.23%


In [23]:
# Function to calculate the accuracy for given thresholds based on our rules
def calculate_accuracy(thresholds, data):
    mulch_threshold, hoop_threshold = thresholds
    correct_classifications = 0
    other_weight = 0.7 # between 0 and 1
    
    for _, row in data.iterrows():
        # 
        if row['Type'] == 'mulch' and row['mulch_p95'] >= mulch_threshold and (
            row['hoop_p95'] < hoop_threshold or (row['hoop_p95'] - hoop_threshold) < (row['mulch_p95'] - mulch_threshold)):
            # row['hoop_p95'] < hoop_threshold or row['hoop_p95'] < row['mulch_p95']): # old way
            correct_classifications += 1
        elif row['Type'] == 'hoop' and row['hoop_p95'] >= hoop_threshold and (
            row['mulch_p95'] < mulch_threshold or (row['mulch_p95'] - mulch_threshold) < (row['hoop_p95'] - hoop_threshold)):
            # row['mulch_p95'] < mulch_threshold or row['mulch_p95'] < row['hoop_p95']): # old way
            correct_classifications += 1
        elif row['Type'] == 'other' and row['mulch_p95'] < mulch_threshold and row['hoop_p95'] < hoop_threshold:
            correct_classifications += 0.5
    other_count = data[data['Type'] == 'other'].shape[0]
    subtract = other_count * (1 - other_weight)
    return correct_classifications / (len(data) - subtract)

# Define the range of thresholds to test
thresholds_range = np.arange(0.0, 1.05, 0.05)

# Initialize variables to store the best thresholds and highest accuracy
best_thresholds = (0, 0)
highest_accuracy = 0
accuracy_table = pd.DataFrame(columns=['mulch_threshold', 'hoop_threshold', 'accuracy'])

# Iterate over all possible combinations of thresholds
for mulch_threshold in thresholds_range:
    for hoop_threshold in thresholds_range:
        accuracy = calculate_accuracy((mulch_threshold, hoop_threshold), data)
        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_thresholds = (mulch_threshold, hoop_threshold)
        accuracy_row = pd.DataFrame({'mulch_threshold': [mulch_threshold], 'hoop_threshold': [hoop_threshold], 'accuracy': [accuracy]})
        accuracy_table = pd.concat([accuracy_table, accuracy_row], ignore_index=True)

print(f"Best thresholds: Mulch: {best_thresholds[0]}, Hoop: {best_thresholds[1]:.2f}")
print(f"Highest accuracy: {highest_accuracy*100:.2f}%")
# Sort the accuracy table by highest accuracy first
accuracy_table = accuracy_table.sort_values(by='accuracy', ascending=False)
display(accuracy_table.head(10))

  accuracy_table = pd.concat([accuracy_table, accuracy_row], ignore_index=True)


Best thresholds: Mulch: 0.42, Hoop: 0.50
Highest accuracy: 80.65%


Unnamed: 0,mulch_threshold,hoop_threshold,accuracy
1098,0.42,0.54,0.806509
1096,0.42,0.5,0.806509
1097,0.42,0.52,0.805613
1094,0.42,0.46,0.804716
1101,0.42,0.6,0.804716
1100,0.42,0.58,0.804716
1099,0.42,0.56,0.804716
1102,0.42,0.62,0.80382
1095,0.42,0.48,0.80382
1093,0.42,0.44,0.802923


#### Resulting thresholds

**With the test for hoop/mulch confusion:**
- In Santa Maria, it's Mulch 0.5 and Hoop 0.75 (91% accurate)
- In Watsonville, it's Mulch 0.05 and Hoop 0.05 (81% accurate)
- In Oxnard, it's Mulch 0.45 and Hoop 0.35 (96% accurate)
- Watsonville random points, it's Mulch 0.7 and Hoop 0.65 (87% accurate)
- Watsonville 2021 - mulch 0.05, hoop 0.75 (83%)

Overall for training data from 2022, it's Mulch 0.45 and Hoop 0.75 (86% accurate)

Overall (training data 2022, wr, wv_21), it's mulch 0.35 and hoop 0.75 (79% accurate)

**Without the inter-plastic confusion test:**
- In Santa Maria, it's Mulch 0.5 and Hoop 0.5 (93% accurate)
- In Watsonville, it's the same - Mulch 0.05 and Hoop 0.05 (81% accurate)
- In Oxnard, it's the same -  Mulch 0.45 and Hoop 0.35 (96% accurate)
- Watsonville grid, it's the same
- Watsonville 2021 - mulch 0.05, hoop 0.05 (85%)

Overall for training data from 2022, it's Mulch 0.45 and Hoop 0.70 (86% accurate)

Overall (training data 2022, wr, wv_21), it's mulch 0.35 and hoop 0.50 (80% accurate)

Clearly something weird is going on in Watsonville with one of the classes
Could try again with 2021? 