In [16]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import random
import math
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re


In [17]:
# Get the list of all files and directories
path = "./M3_Database/"
dir_list = os.listdir(path)
 
print("Files and directories in '", path, "' :")
 
# prints all files
print(dir_list)

Files and directories in ' ./M3_Database/ ' :
['aedes_aegypti_f_NEWORLEANS_image0072.jpg', 'aedes_aegypti_f_NEWORLEANS_image0007.jpg', 'anopheles_atroparvus_f_EBRO_image0020.jpg', 'aedes_aegypti_f_NEWORLEANS_image0143.jpg', 'anopheles_arabiensis_m_DONGOLA_image0049.jpg', 'anopheles_coluzzi_f_MOPTI_image0012.jpg', 'anopheles_albimanus_f_STECLA_image0009.jpg', 'anopheles_arabiensis_m_DONGOLA_image0009.jpg', 'aedes_aegypti_m_NEWORLEANS_image0049.jpg', 'anopheles_arabiensis_f_DONGOLA_image0067.jpg', 'aedes_aegypti_f_NEWORLEANS_image0049.jpg', 'anopheles_arabiensis_f_DONGOLA_image0050.jpg', 'anopheles_arabiensis_f_DONGOLA_image0076.jpg', 'anopheles_atroparvus_f_EBRO_image0023.jpg', 'anopheles_coluzzi_f_MOPTI_image0050.jpg', 'anopheles_arabiensis_f_DONGOLA_image0106.jpg', 'aedes_aegypti_f_NEWORLEANS_image0090.jpg', 'aedes_aegypti_f_NEWORLEANS_image0148.jpg', 'anopheles_farauti_m_FAR1_image0008.jpg', 'anopheles_arabiensis_f_DONGOLA_image0065.jpg', 'anopheles_arabiensis_f_DONGOLA_image0064.jpg

In [18]:
image_names = []

for i in dir_list:
    image_names.append(i[:-4])

print(image_names)

['aedes_aegypti_f_NEWORLEANS_image0072', 'aedes_aegypti_f_NEWORLEANS_image0007', 'anopheles_atroparvus_f_EBRO_image0020', 'aedes_aegypti_f_NEWORLEANS_image0143', 'anopheles_arabiensis_m_DONGOLA_image0049', 'anopheles_coluzzi_f_MOPTI_image0012', 'anopheles_albimanus_f_STECLA_image0009', 'anopheles_arabiensis_m_DONGOLA_image0009', 'aedes_aegypti_m_NEWORLEANS_image0049', 'anopheles_arabiensis_f_DONGOLA_image0067', 'aedes_aegypti_f_NEWORLEANS_image0049', 'anopheles_arabiensis_f_DONGOLA_image0050', 'anopheles_arabiensis_f_DONGOLA_image0076', 'anopheles_atroparvus_f_EBRO_image0023', 'anopheles_coluzzi_f_MOPTI_image0050', 'anopheles_arabiensis_f_DONGOLA_image0106', 'aedes_aegypti_f_NEWORLEANS_image0090', 'aedes_aegypti_f_NEWORLEANS_image0148', 'anopheles_farauti_m_FAR1_image0008', 'anopheles_arabiensis_f_DONGOLA_image0065', 'anopheles_arabiensis_f_DONGOLA_image0064', 'anopheles_coluzzi_f_MOPTI_image0024', 'aedes_aegypti_f_NEWORLEANS_image0013', 'anopheles_arabiensis_f_DONGOLA_image0060', 'ano

In [19]:
def check_if_substring(string, substr):
    if substr in string:
        return True
    else:
        return False

male_mosquitos = []
female_mosquitos = []
male_mosquito_names = []
female_mosquito_names = []

# counter is used to pick 33% of female mosquiros. 
# this to get around 150 female mosquitos
count = 0

for i in dir_list:
    if check_if_substring(i, "_m_"):
        male_mosquitos.append("M3_Database/" + i)
        male_mosquito_names.append(i)
    if check_if_substring(i, "_f_"):
        if count %2 == 0:
            female_mosquitos.append("M3_Database/" + i)
            female_mosquito_names.append(i)
        count += 1

# print(male_mosquitos)
print("Number of Male Mosquitos: ", len(male_mosquitos))
print("Number of female mosquitos: ", len(female_mosquitos))

Number of Male Mosquitos:  280
Number of female mosquitos:  230


In [20]:
all_mosquitos = male_mosquitos + female_mosquitos
print(len(all_mosquitos))

510


In [21]:
male_female_df = pd.DataFrame()
male_female_df['path'] = all_mosquitos

# create corresponding labels for all the male images
male_female_label = ["M"] * len(male_mosquitos) + ["F"] * len(female_mosquitos)
male_female_df['label'] = male_female_label

# male_df = male_df.set_index('path')


In [22]:
male_female_df

Unnamed: 0,path,label
0,M3_Database/anopheles_arabiensis_m_DONGOLA_ima...,M
1,M3_Database/anopheles_arabiensis_m_DONGOLA_ima...,M
2,M3_Database/aedes_aegypti_m_NEWORLEANS_image00...,M
3,M3_Database/anopheles_farauti_m_FAR1_image0008...,M
4,M3_Database/anopheles_coluzzi_m_MOPTI_image001...,M
...,...,...
505,M3_Database/aedes_aegypti_f_NEWORLEANS_image00...,F
506,M3_Database/anopheles_freeborni_f_F1_image0021...,F
507,M3_Database/anopheles_coluzzi_f_MOPTI_image000...,F
508,M3_Database/anopheles_albimanus_f_STECLA_image...,F


In [23]:
# selecting rows based on condition
male_df1 = male_female_df.loc[(male_female_df["label"] == "M")]
female_df1 = male_female_df.loc[(male_female_df["label"] == "F")]

print(male_df1.shape)
print(female_df1.shape)

(280, 2)
(230, 2)


In [24]:
# one train and test split
from sklearn.model_selection import train_test_split

train_data1, test_data1 = train_test_split(male_df1, test_size=0.15)
train_data2, test_data2 = train_test_split(female_df1, test_size=0.15)


train_data = pd.concat([train_data1, train_data2], ignore_index=True)
test_data = pd.concat([test_data1, test_data2], ignore_index=True)


train_data = train_data.sample(frac=1).reset_index(drop=True)
test_data = test_data.sample(frac=1).reset_index(drop=True)

print(train_data.shape)
print(test_data.shape)

(433, 2)
(77, 2)


In [25]:
train_data

Unnamed: 0,path,label
0,M3_Database/anopheles_arabiensis_m_DONGOLA_ima...,M
1,M3_Database/anopheles_atroparvus_f_EBRO_image0...,F
2,M3_Database/anopheles_freeborni_f_F1_image0036...,F
3,M3_Database/anopheles_arabiensis_f_DONGOLA_ima...,F
4,M3_Database/anopheles_arabiensis_m_DONGOLA_ima...,M
...,...,...
428,M3_Database/aedes_aegypti_f_NEWORLEANS_image01...,F
429,M3_Database/aedes_aegypti_f_NEWORLEANS_image00...,F
430,M3_Database/anopheles_coluzzi_m_MOPTI_image001...,M
431,M3_Database/anopheles_freeborni_f_F1_image0015...,F


In [26]:
from torchvision import transforms
mosquito_transforms = transforms.Compose([
    #transforms.ToPILImage(),
    transforms.Resize([299,299]),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])


In [27]:
from PIL import Image, ImageOps
def make_M3_data(d):
    data_path = "M3_Database"
    data = []
    label = []
    paths = []
    sex = []
    F_count = 0
    G_count = 0
    male_count = 0 
    female_count = 0
    df = pd.DataFrame()
    for i in range(d.shape[0]):
        specimen = d.loc[i, 'path']
        paths.append(specimen)
        image = Image.open(specimen)
        image = mosquito_transforms(image)
        data.append(image)
        if d.loc[i,'label'] == "F":
            sex.append(0)
            female_count += 1

        if d.loc[i,'label'] == "M":
            sex.append(1)
            male_count += 1
                    
    stacked_data = torch.stack(data)
    df['path'] = paths
    df['gender'] = sex 
    print("female_count: ", female_count)
    print("male_count: ", male_count)
    d1={'gender':sex}
                
    return stacked_data, d1, df

In [28]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle = True)
target = train_data.loc[:,'label']
count = 1
for train_index, valid_index in skf.split(train_data, target):
    print(f"Fold{count}: ")
    
    train = train_data.loc[train_index,:]
    valid = train_data.loc[valid_index,:]
    train = train.reset_index(drop=True)
    valid = valid.reset_index(drop=True)
    print("Validation: ")
    d, dic1, path_df = make_M3_data(valid)
    torch.save(d,f"data/sex/CV_1_M3/val_data_fold{count}.pt")
    np.save(f"data/sex/CV_1_M3/val_label_fold{count}.npy", dic1)
    path_df.to_csv(f"data/sex/CV_1_M3/val_datapath_fold{count}.csv", index = False)
    print("Train: ")
    d2, dic2, path_df2 = make_M3_data(train)
    torch.save(d2,f"data/sex/CV_1_M3/train_data_fold{count}.pt")
    np.save(f"data/sex/CV_1_M3/train_label_fold{count}.npy", dic2)
    path_df2.to_csv(f"data/sex/CV_1_M3/train_datapath_fold{count}.csv", index = False)
    count += 1


Fold1: 
Validation: 
female_count:  39
male_count:  48
Train: 
female_count:  156
male_count:  190
Fold2: 
Validation: 
female_count:  39
male_count:  48
Train: 
female_count:  156
male_count:  190
Fold3: 
Validation: 
female_count:  39
male_count:  48
Train: 
female_count:  156
male_count:  190
Fold4: 
Validation: 
female_count:  39
male_count:  47
Train: 
female_count:  156
male_count:  191
Fold5: 
Validation: 
female_count:  39
male_count:  47
Train: 
female_count:  156
male_count:  191


In [29]:
d, dic, path_df = make_M3_data(test_data)

female_count:  35
male_count:  42


In [30]:
path_df.to_csv('data/sex/CV_1_M3/test_datapath.csv', index = False)
torch.save(d,"data/sex/CV_1_M3/test_data.pt")
np.save("data/sex/CV_1_M3/test_label.npy", dic)