In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Path

In [2]:
# CSV inputs read by the loading cells below.
train_path, val_path = (
    '../data_round_2/best_model_train_data_yolo_annotations.csv',
    '../data_round_2/best_model_val_data_yolo_annotations.csv',
)

gbif_csv, ina_csv = (
    "../gbif-cropped/inaturalist.csv",
    "../data_round_2/inaturalist_anno.csv",
)

In [3]:
# Directory prefixes that get prepended to the img_fName column of the matching frames.
mosAlert_dir, gbif_dir = "../data_round_2/final/", "../gbif-cropped/"

In [4]:
# GBIF annotations, with img_fName turned into a path under gbif_dir.
gbif_df = pd.read_csv(gbif_csv).assign(
    img_fName=lambda frame: gbif_dir + frame['img_fName']
)

# Train + val annotations stacked into one frame, img_fName prefixed with mosAlert_dir.
df_train, df_val = pd.read_csv(train_path), pd.read_csv(val_path)
df = pd.concat([df_train, df_val])
df['img_fName'] = mosAlert_dir + df['img_fName']

# Keep only the first row seen for each image path.
new_df = df.drop_duplicates(subset=['img_fName'], keep='first')

# De-duplicated train/val rows followed by the GBIF rows.
new_lux_df = pd.concat([new_df, gbif_df])

In [None]:
# Rows per class in the combined frame.
new_lux_df['class_label'].value_counts()

In [None]:
# Load the annotations listed in ina_csv and preview the first five rows.
ina_df = pd.read_csv(ina_csv)
ina_df.head(5)

In [None]:
# Normalise labels in one pass: merge koreicus/japonicus into a single class
# and correct the 'anopheies' spelling.
label_fixes = {
    'koreicus': 'japonicus/koreicus',
    'japonicus': 'japonicus/koreicus',
    'anopheies': 'anopheles',
}
ina_df['class_label'] = ina_df['class_label'].replace(label_fixes)
ina_df.head()

In [None]:
# Append the ina_df rows to new_lux_df and show rows per class.
# Note: this rebinds `new_df` to the full combined frame.
new_df = pd.concat([new_lux_df, ina_df])
new_df['class_label'].value_counts()

In [9]:
# Down-sample the two classes named below to 3000 rows each; all other classes are kept whole.
capped_labels = ['culex', 'albopictus']
culex_df, albopictus_df = (
    new_df[new_df['class_label'] == label].sample(n=3000, random_state=42)
    for label in capped_labels
)

df_ = new_df[~new_df['class_label'].isin(capped_labels)]

# Stack the three parts, shuffle the rows with a fixed seed, and renumber the index.
df_ma_gbif_ina = (
    pd.concat([culex_df, albopictus_df, df_])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Rows per class after down-sampling.
df_ma_gbif_ina['class_label'].value_counts()

In [None]:
# (rows, columns) of the down-sampled, shuffled frame.
df_ma_gbif_ina.shape

In [13]:
# Persist the combined frame without the index column.
ma_gbif_ina_csv = '../data_round_2/ma_gbif_ina.csv'
df_ma_gbif_ina.to_csv(ma_gbif_ina_csv, index=False)

# Unknown Class

In [14]:
# Labels treated as "unknown" for the open-set partitions.
unknown_class = ['aegypti', 'anopheles']

# `mask` is True for unknown-class rows; the trailing-underscore frame keeps only the known rows.
mask = df_ma_gbif_ina['class_label'].isin(unknown_class)
df_ma_gbif_ina_ = df_ma_gbif_ina.loc[~mask]

# StratifiedKFold Open

In [6]:
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
import pandas as pd
import os

# Write five stratified train/val fold CSVs built from the annotation-2 train and val files.
# NOTE: those two input CSVs are the files written by the "Mask 2" cells further
# down, so that section has to have been run before this cell.
save_dir = '../data_round_2/Folds/'
os.makedirs(save_dir, exist_ok=True)

val_annotations_csv = "../data_round_2/mosAlert_new_annotation_2/val_annotation_2.csv"
train_annotations_csv = "../data_round_2/mosAlert_new_annotation_2/train_annotation_2.csv"
train_df = pd.read_csv(train_annotations_csv)
val_df = pd.read_csv(val_annotations_csv)

train_val_data = pd.concat([train_df, val_df])

# Feature columns and the stratification target, kept as separate objects for skf.split.
feature_cols = ['img_fName', 'img_w', 'img_h', 'bbx_xtl', 'bbx_ytl', 'bbx_xbr', 'bbx_ybr']
X_train_val = train_val_data[feature_cols]
y_train_val = train_val_data['class_label']

skf = StratifiedKFold(n_splits=5)

for fold, (train_index, val_index) in enumerate(skf.split(X_train_val, y_train_val)):
    # Positional selection of this fold's rows; features and label are glued back side by side.
    train_data = pd.concat(
        [X_train_val.iloc[train_index], y_train_val.iloc[train_index]], axis=1
    )
    val_data = pd.concat(
        [X_train_val.iloc[val_index], y_train_val.iloc[val_index]], axis=1
    )

    train_data.to_csv(f'{save_dir}train_fold_{fold}.csv', index=False)
    val_data.to_csv(f'{save_dir}val_fold_{fold}.csv', index=False)

# Partition

## Closed-set 6 classes (ma + gbif)

In [7]:
# Closed-set partition of new_lux_df: 30 % is held out, then halved into val and test
# (roughly 70/15/15). Both splits use random_state=42.
df_train, df_K_temp = train_test_split(new_lux_df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_K_temp, test_size=0.5, random_state=42)

path = '../data_round_2/closedSet/'
os.makedirs(path, exist_ok=True)

# `path` already ends with '/', so concatenating '/train.csv' produced
# 'closedSet//train.csv'. os.path.join yields a single separator.
# NOTE: train_path / val_path are rebound here from the input CSV paths to output paths.
train_path = os.path.join(path, 'train.csv')
val_path = os.path.join(path, 'val.csv')
test_path = os.path.join(path, 'test.csv')

df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)
df_val.to_csv(val_path, index=False)

## Anno 2.5: anopheles + jp/kr (full)

In [15]:
# Known classes: hold out 30 %, then halve the hold-out into val and known-test.
df_train, df_K_temp = train_test_split(df_ma_gbif_ina_, test_size=0.3, random_state=42)
df_val, df_K_test = train_test_split(df_K_temp, test_size=0.5, random_state=42)

# Unknown classes: rows selected by `mask`, 450 of them sampled, all relabelled 'mosquito'.
df_U = df_ma_gbif_ina[mask].copy().sample(n=450, random_state=42)
df_U['class_label'] = ['mosquito'] * len(df_U)

# Test set = known-class test rows followed by the relabelled unknown rows.
df_test = pd.concat([df_K_test, df_U])
df_test['class_label'].value_counts()

path = '../data_round_2/Anno2_5'
os.makedirs(path, exist_ok=True)

train_path, val_path, test_path = (
    path + '/train.csv',
    path + '/val.csv',
    path + '/test.csv',
)

# Written in the order train, test, val.
for frame, csv_path in ((df_train, train_path), (df_test, test_path), (df_val, val_path)):
    frame.to_csv(csv_path, index=False)

In [None]:
# Rows per class in the current test frame.
df_test['class_label'].value_counts()

## Anno 4: anopheles (full)

In [44]:
# Drop the merged japonicus/koreicus class from the known-class frame before splitting.
# NOTE(review): this rebinds df_ma_gbif_ina_, so every later cell that reads it
# also sees the frame without 'japonicus/koreicus' - confirm that is intended.
keep_rows = df_ma_gbif_ina_['class_label'].ne('japonicus/koreicus')
df_ma_gbif_ina_ = df_ma_gbif_ina_[keep_rows]

# Known classes: hold out 30 %, then halve the hold-out into val and known-test.
df_train, df_K_temp = train_test_split(df_ma_gbif_ina_, test_size=0.3, random_state=42)
df_val, df_K_test = train_test_split(df_K_temp, test_size=0.5, random_state=42)

# Unknown classes: rows selected by `mask`, 450 of them sampled, all relabelled 'mosquito'.
df_U = df_ma_gbif_ina[mask].copy().sample(n=450, random_state=42)
df_U['class_label'] = ['mosquito'] * len(df_U)

# Test set = known-class test rows followed by the relabelled unknown rows.
df_test = pd.concat([df_K_test, df_U])
df_test['class_label'].value_counts()

path = '../data_round_2/Anno4'
os.makedirs(path, exist_ok=True)

train_path, val_path, test_path = (
    path + '/train.csv',
    path + '/val.csv',
    path + '/test.csv',
)

# Written in the order train, test, val.
for frame, csv_path in ((df_train, train_path), (df_test, test_path), (df_val, val_path)):
    frame.to_csv(csv_path, index=False)

## Anno 3: japonicus/koreicus (lux full)

In [None]:
# Known classes: hold out 30 %, then halve the hold-out into val and known-test.
df_train, df_K_temp = train_test_split(df_ma_gbif_ina_, test_size=0.3, random_state=42)
df_val, df_K_test = train_test_split(df_K_temp, test_size=0.5, random_state=42)

# Unknown classes: rows selected by `mask`, 450 of them sampled, all relabelled 'mosquito'.
df_U = df_ma_gbif_ina[mask].copy().sample(n=450, random_state=42)
df_U['class_label'] = ['mosquito'] * len(df_U)

# Test set = known-class test rows followed by the relabelled unknown rows; show its class counts.
df_test = pd.concat([df_K_test, df_U])
df_test.class_label.value_counts()

In [30]:
# Write the current train/test/val frames under the Anno3 directory.
path = '../data_round_2/Anno3'
os.makedirs(path, exist_ok=True)

train_path, val_path, test_path = (
    path + '/train.csv',
    path + '/val.csv',
    path + '/test.csv',
)

# Written in the order train, test, val.
for frame, csv_path in ((df_train, train_path), (df_test, test_path), (df_val, val_path)):
    frame.to_csv(csv_path, index=False)

In [None]:
# Inspect the image paths stored in ina_df.
ina_df.img_fName

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# Display the image referenced by the first row of ina_df.
# .iloc[0] is positional, so it does not depend on the frame having an index label 0.
image_path = ina_df['img_fName'].iloc[0]  # Replace with your image file path

# Open inside a context manager so the file handle is released once the pixels
# have been decoded; convert() returns an independent RGB image that stays
# usable after the file is closed.
with Image.open(image_path) as image:
    image_rgb = image.convert('RGB')

# Plot the image using matplotlib
plt.imshow(image_rgb)
plt.axis('off')  # Hide axes
plt.show()


In [None]:
# Deliberate barrier: raising here aborts a "Run All" at this point, so the
# cells below only execute when run by hand (or after setting the flag to False).
should_stop = True

if should_stop:
    raise RuntimeError("Stop")

## Mask 1: Culiseta + Anopheles

In [None]:
# Mask 1: culiseta and anopheles are the unknown classes; df_K1 keeps the remaining rows.
unknown_labels_1 = ['culiseta', 'anopheles']
mask1 = new_lux_df['class_label'].isin(unknown_labels_1)
df_K1 = new_lux_df[~mask1]
df_K1.class_label.value_counts()

In [None]:
# Known classes: 30 % of df_K1 is held out, and that hold-out is split again
# with test_size=0.35 into val and known-test. Both splits use random_state=42.
df_train1, df_K_temp1 = train_test_split(df_K1, test_size=0.3, random_state=42)
df_val1, df_K_test1 = train_test_split(df_K_temp1, test_size=0.35, random_state=42)

# Unknown classes: every row selected by mask1, relabelled 'mosquito'.
# assign() builds a new frame, so the label is never written through a slice of
# new_lux_df (attribute assignment on the slice can raise SettingWithCopyWarning).
df_U1 = new_lux_df[mask1].assign(class_label='mosquito')

# Test set = known-class test rows followed by all unknown rows; show its class counts.
df_test1 = pd.concat([df_K_test1, df_U1])
df_test1['class_label'].value_counts()

In [46]:
# Write the Mask 1 train/test/val frames under the annotation_1 directory.
path1 = '../data_round_2/mosAlert_new_annotation_1'
os.makedirs(path1, exist_ok=True)

train_path1, val_path1, test_path1 = (
    path1 + '/train_annotation_1.csv',
    path1 + '/val_annotation_1.csv',
    path1 + '/test_annotation_1.csv',
)

# Written in the order train, test, val.
for frame, csv_path in ((df_train1, train_path1), (df_test1, test_path1), (df_val1, val_path1)):
    frame.to_csv(csv_path, index=False)

## Mask 0: Without Lux1

In [49]:
# Mask 0: same unknown labels as Mask 1, but computed on new_df.
# Note: this rebinds `mask`, which earlier cells used for the aegypti/anopheles selection.
unknown_labels_0 = ['culiseta', 'anopheles']
mask = new_df['class_label'].isin(unknown_labels_0)
df_K = new_df[~mask]

In [None]:
# Known classes: 30 % of df_K is held out, and that hold-out is split again
# with test_size=0.35 into val and known-test. Both splits use random_state=42.
df_train, df_K_temp = train_test_split(df_K, test_size=0.3, random_state=42)
df_val, df_K_test = train_test_split(df_K_temp, test_size=0.35, random_state=42)

# Unknown classes: every row selected by `mask`, relabelled 'mosquito'.
# assign() builds a new frame, so the label is never written through a slice of
# new_df (attribute assignment on the slice can raise SettingWithCopyWarning).
df_U = new_df[mask].assign(class_label='mosquito')

# Test set = known-class test rows followed by all unknown rows; show its class counts.
df_test = pd.concat([df_K_test, df_U])
df_test['class_label'].value_counts()

In [51]:
# Write the Mask 0 train/test/val frames under the annotation_0 directory.
path = '../data_round_2/mosAlert_new_annotation_0'
os.makedirs(path, exist_ok=True)

train_path, val_path, test_path = (
    path + '/train_annotation_0.csv',
    path + '/val_annotation_0.csv',
    path + '/test_annotation_0.csv',
)

# Written in the order train, test, val.
for frame, csv_path in ((df_train, train_path), (df_test, test_path), (df_val, val_path)):
    frame.to_csv(csv_path, index=False)

## Mask 2: Aegypti + Anopheles

In [53]:
# Mask 2: aegypti and anopheles are the unknown classes; df_K2 keeps the remaining rows.
unknown_labels_2 = ['aegypti', 'anopheles']
mask2 = new_lux_df['class_label'].isin(unknown_labels_2)
df_K2 = new_lux_df[~mask2]

In [None]:
# Known classes: 30 % of df_K2 is held out, and that hold-out is split again
# with test_size=0.35 into val and known-test. Both splits use random_state=42.
df_train2, df_K_temp2 = train_test_split(df_K2, test_size=0.3, random_state=42)
df_val2, df_K_test2 = train_test_split(df_K_temp2, test_size=0.35, random_state=42)

# Unknown classes: every row selected by mask2, relabelled 'mosquito'.
# assign() builds a new frame, so the label is never written through a slice of
# new_lux_df (attribute assignment on the slice can raise SettingWithCopyWarning).
df_U2 = new_lux_df[mask2].assign(class_label='mosquito')

# Test set = known-class test rows followed by all unknown rows; show its class counts.
df_test2 = pd.concat([df_K_test2, df_U2])
df_test2['class_label'].value_counts()

In [58]:
# Write the Mask 2 train/test/val frames under the annotation_2 directory.
path2 = '../data_round_2/mosAlert_new_annotation_2'
os.makedirs(path2, exist_ok=True)

train_path2, val_path2, test_path2 = (
    path2 + '/train_annotation_2.csv',
    path2 + '/val_annotation_2.csv',
    path2 + '/test_annotation_2.csv',
)

# Written in the order train, test, val.
for frame, csv_path in ((df_train2, train_path2), (df_test2, test_path2), (df_val2, val_path2)):
    frame.to_csv(csv_path, index=False)

## YOLO train-test

In [2]:
# Re-load the three annotation_2 CSVs from disk (train, val, test - in that order).
anno_train = '../data_round_2/mosAlert_new_annotation_2/train_annotation_2.csv'
anno_val = '../data_round_2/mosAlert_new_annotation_2/val_annotation_2.csv'
anno_test = '../data_round_2/mosAlert_new_annotation_2/test_annotation_2.csv'

df_tr, df_v, df_t = (
    pd.read_csv(csv_path) for csv_path in (anno_train, anno_val, anno_test)
)

In [3]:
# Drop every row whose image path starts with the gbif-cropped prefix, in all three splits.
df_tr, df_v, df_t = (
    frame[~frame['img_fName'].str.startswith('../gbif-cropped')]
    for frame in (df_tr, df_v, df_t)
)

In [4]:
# Save the filtered splits into the yolo directory (order: test, train, val).
pathyolo = '../data_round_2/yolo'
os.makedirs(pathyolo, exist_ok=True)

for frame, file_name in (
    (df_t, '/test_yolo_ma.csv'),
    (df_tr, '/train_yolo_ma.csv'),
    (df_v, '/val_yolo_ma.csv'),
):
    frame.to_csv(pathyolo + file_name, index=False)

In [None]:
# Keep every row of `df` whose label is not 'mosquito' and show rows per class.
# NOTE(review): `df` is the train+val concat from before drop_duplicates (new_df
# is the de-duplicated one) - confirm this is the intended input.
df_K = df[df['class_label'].ne('mosquito')]
df_K['class_label'].value_counts()

In [None]:
# Hold out 10 % of df_K as the YOLO test set (seeded) and show its class counts.
yolo_split = train_test_split(df_K, test_size=0.1, random_state=42)
yolo_train_val, yolo_test = yolo_split
yolo_test['class_label'].value_counts()

In [10]:
# Output directory for the YOLO CSVs; exist_ok=True makes this a no-op if it already exists.
pathyolo = '../data_round_2/yolo'
os.makedirs(pathyolo, exist_ok=True)

In [12]:
# Save the YOLO hold-out first, then the remaining train+val rows.
for frame, file_name in ((yolo_test, '/test_yolo.csv'), (yolo_train_val, '/train_val_yolo.csv')):
    frame.to_csv(pathyolo + file_name, index=False)

# Double check

In [None]:
# Rows per class in df_train.
df_train.class_label.value_counts()

In [None]:
# Rows per class in df_train1.
df_train1.class_label.value_counts()

In [None]:
# Rows per class in df_train2.
df_train2.class_label.value_counts()

In [None]:
# Rows per class in df_val.
df_val.class_label.value_counts()

In [None]:
# Rows per class in df_val1.
df_val1.class_label.value_counts()

In [None]:
# Rows per class in df_val2.
df_val2.class_label.value_counts()

In [None]:
# Rows per class in df_test.
df_test.class_label.value_counts()

In [None]:
# Rows per class in df_test1.
df_test1.class_label.value_counts()

In [None]:
# Rows per class in df_test2.
df_test2.class_label.value_counts()

In [None]:
# Display the combined frame built from the de-duplicated train/val rows plus the GBIF rows.
new_lux_df