In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from src.utils import visualize_metrics_plots

In [2]:
# Preview of the group 742 crop data.
# Only the first 2000 of the 9625 rows are read because the full file is too large.
data_742 = pd.read_csv(
    "data_anno/crops_742.csv",
    sep=",",
    low_memory=True,
    nrows=2000,
)

# The label sits in the first column for this file. Label 2
# ('human-park-motorcycle') has only 2 samples, so those rows are dropped.
label_column = data_742.columns[0]
keep_rows = data_742[label_column] != 2
data_742 = data_742[keep_rows]

# Feature/label split for this frame (currently disabled):
#features = data_742[data_742.columns[1:]]
#labels = data_742[data_742.columns[0]]

print(data_742)

      label  pixel_0  pixel_1  pixel_2  pixel_3  pixel_4  pixel_5  pixel_6  \
0         5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1         5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
2         5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
3         5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
4         8      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
...     ...      ...      ...      ...      ...      ...      ...      ...   
1995      8      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1996      5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1997      5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1998      5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1999      5      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

      pixel_7  pixel_8  ...  pixel_41640  pixel_41641  pixel_41

In [2]:
# ---------------------------------------------------------------------------
# Dataset selection
# Keep exactly one of the options below active; each one defines `features`
# and `labels` for the rest of the notebook.
# ---------------------------------------------------------------------------

# Option: group 741 coords data (label taken from the last column)
#data_coords_741 = pd.read_csv("data_anno/coords_741.csv", sep=",")
#features = data_coords_741.iloc[:, :-1]
#labels = data_coords_741.iloc[:, -1]

# Option: group 742 coords data (label taken from the last column)
#data_coords_742 = pd.read_csv("data_anno/coords_742.csv", sep=",")
#features = data_coords_742.iloc[:, :-1]
#labels = data_coords_742.iloc[:, -1]

# Option (ACTIVE): group 741 crop data (label taken from the last column)
data_741 = pd.read_csv("data_anno/crops_741.csv", sep=",")
features = data_741.iloc[:, :-1]
labels = data_741.iloc[:, -1]

# Option: group 742 crop data (label taken from the FIRST column).
# The file has 9625 rows and is too large to load fully, hence `nrows`.
#data_742 = pd.read_csv("data_anno/crops_742.csv", sep=",", low_memory=True, nrows=5000)
#data_742 = pd.read_csv("data_anno/crops_742.csv", sep=",", low_memory=True, nrows=2000)
#data_742 = data_742[data_742.iloc[:, 0] != 2]  # drop label 2 ['human-park-motorcycle'], only 2 samples
#features = data_742.iloc[:, 1:]
#labels = data_742.iloc[:, 0]

# ---------------------------------------------------------------------------
# Train/test split
# `stratify=labels` keeps the class distribution similar in both partitions.
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=42,
    stratify=labels,
)

# ---------------------------------------------------------------------------
# Oversampling
# SMOTE is fitted on the training partition only; the test partition is
# left untouched.
# ---------------------------------------------------------------------------
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Testing different parameters:
    Estimators = 1000
    seed = 0
    Depth = 5, 10, None
    Resampling_Strategy = "all", "auto" (equivalent to "not minority")
    plus a balanced RFC (BalancedRandomForestClassifier) trained on the SMOTE-oversampled dataset

In [None]:
# Compare three ways of dealing with class imbalance, each at several tree depths:
#   1. "Unbalanced"    - plain RandomForestClassifier on the original training split
#   2. "Semi-balanced" - BalancedRandomForestClassifier (resamples per tree) on the original split
#   3. "Balanced"      - plain RandomForestClassifier on the SMOTE-oversampled split
# Every model is evaluated on the same untouched test split (X_test, y_test).
#
# Reporting goes through `visualize_metrics_plots`, the only reporting helper this
# notebook imports from src.utils. The cell previously called `visualize_metrics`,
# which is never imported or defined, so it raised NameError on a fresh kernel.
# NOTE(review): the helper's exact output is defined in src.utils - confirm there.
ests = 1000            # number of trees per forest
seed = 0               # random_state shared by all forests
depth = [5, 10, None]  # max_depth values to try (None = no depth limit)
strat = ["all", "auto"]  # sampling_strategy values for the balanced forest

separator = "======================================================"

# 1) Plain random forest on the imbalanced training data.
for i in range(len(depth)):
    clf_unbalanced = RandomForestClassifier(max_depth=depth[i], random_state=seed, n_estimators=ests, n_jobs=-1)
    clf_unbalanced.fit(X_train, y_train)
    params = f"Unbalanced dataset {i}\ndepth = {depth[i]}"
    print(separator)
    print(params)
    visualize_metrics_plots(clf_unbalanced, X_test, y_test, params, f"Unbalanced dataset {i}")
    print(separator)

# 2) Balanced random forest on the imbalanced training data, for every
#    depth / sampling-strategy combination.
for i in range(len(depth)):
    for j in range(len(strat)):
        clf_semibalanced = BalancedRandomForestClassifier(
            sampling_strategy=strat[j],
            replacement=True,
            max_depth=depth[i],
            random_state=seed,
            n_estimators=ests,
            n_jobs=-1,
        )
        clf_semibalanced.fit(X_train, y_train)
        params = f"Semi-balanced dataset {i}-{strat[j]}\ndepth = {depth[i]}\nresampling_strategy = {strat[j]}"
        print(separator)
        print(params)
        visualize_metrics_plots(clf_semibalanced, X_test, y_test, params, f"Semi-balanced dataset {depth[i]}-{strat[j]}")
        print(separator)

# 3) Plain random forest on the SMOTE-oversampled training data.
for i in range(len(depth)):
    clf_balanced = RandomForestClassifier(max_depth=depth[i], random_state=seed, n_estimators=ests, n_jobs=-1)
    clf_balanced.fit(X_res, y_res)
    params = f"Balanced dataset {i}\ndepth = {depth[i]}"
    print(separator)
    print(params)
    visualize_metrics_plots(clf_balanced, X_test, y_test, params, f"Balanced dataset {i}")
    print(separator)
    



In [3]:
# "Double-balanced" experiment: a BalancedRandomForestClassifier fitted on the
# SMOTE-oversampled training data, for every depth / sampling-strategy pair.
# Run this cell on its own - it ran out of memory otherwise (original note:
# input lowered from 5000 rows to 4000).
ests = 1000
seed = 0
depth = [5, 10, None]
strat = ["all", "auto"]

for i, max_depth in enumerate(depth):
    for j, strategy in enumerate(strat):
        forest_options = dict(
            sampling_strategy=strategy,
            replacement=True,
            max_depth=max_depth,
            random_state=seed,
            n_estimators=ests,
            n_jobs=-1,
        )
        clf_doublebalanced = BalancedRandomForestClassifier(**forest_options)
        clf_doublebalanced.fit(X_res, y_res)

        # Human-readable description of this run, printed and passed to the helper.
        params = "\n".join(
            [
                f"Double-balanced dataset {i}-{strategy}",
                f"depth = {max_depth}",
                f"resampling_strategy = {strategy}",
            ]
        )
        plot_title = f"Double-balanced dataset {max_depth}-{strategy}"

        print("======================================================")
        print(params)
        visualize_metrics_plots(clf_doublebalanced, X_test, y_test, params, plot_title)
        print("======================================================")

Double-balanced dataset 0-all
depth = 5
resampling_strategy = all
Accuracy: 0.6612021857923497 
Precision: 0.6496666783724306 
Recall: 0.75618961352657 
F1 Score: 0.6486042692939245 
Classification Report:
               precision    recall  f1-score   support

           0       0.43      1.00      0.60         9
           1       0.95      0.66      0.78       276
           2       0.17      0.59      0.26        32
           3       0.70      0.53      0.60        36
           4       1.00      1.00      1.00        13

    accuracy                           0.66       366
   macro avg       0.65      0.76      0.65       366
weighted avg       0.84      0.66      0.72       366

Double-balanced dataset 0-auto
depth = 5
resampling_strategy = auto
Accuracy: 0.6775956284153005 
Precision: 0.6527658081938921 
Recall: 0.7605374396135265 
F1 Score: 0.652097930661155 
Classification Report:
               precision    recall  f1-score   support

           0       0.41      1.00      

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>