# Bee Data Processing
Get data from: https://www.kaggle.com/jenny18/honey-bee-annotated-images/discussion

In [1]:
import os
import glob
import numpy as np
import pandas as pd 

In [2]:
data_path = "archive"
data = pd.read_csv(os.path.join(data_path,"bee_data.csv"))

In [3]:
data.head()

Unnamed: 0,file,date,time,location,zip code,subspecies,health,pollen_carrying,caste
0,041_066.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
1,041_072.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
2,041_073.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
3,041_067.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
4,041_059.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker


How many samples do not have subspecies information?

In [4]:
print(("Number of samples without species: {}").format(len(data[data["subspecies"] == "-1"])))

Number of samples without species: 428


Remove samples with subspecies of values of -1

In [5]:
data.drop(data[data['subspecies'] == "-1"].index, inplace = True) 
data = data.reset_index(drop=True)

In [6]:
data.head()

Unnamed: 0,file,date,time,location,zip code,subspecies,health,pollen_carrying,caste
0,017_029.png,8/6/18,13:21,"Saratoga, CA, USA",95070,Italian honey bee,healthy,False,worker
1,017_015.png,8/6/18,13:21,"Saratoga, CA, USA",95070,Italian honey bee,healthy,False,worker
2,017_001.png,8/6/18,13:21,"Saratoga, CA, USA",95070,Italian honey bee,healthy,False,worker
3,017_000.png,8/6/18,13:21,"Saratoga, CA, USA",95070,Italian honey bee,healthy,False,worker
4,017_014.png,8/6/18,13:21,"Saratoga, CA, USA",95070,Italian honey bee,healthy,False,worker


Rename samples and give dataframe new column for new image names

In [7]:
pic_names_keep = data["file"].values
print(pic_names_keep[:5])

['017_029.png' '017_015.png' '017_001.png' '017_000.png' '017_014.png']


In [8]:
# Delete pics not in pic_names
pic_names = glob.glob(data_path+"/bee_imgs/bee_imgs/*.png")
pic_names = [name.split("/")[-1] for name in pic_names]

In [9]:
print(pic_names[:5])
print(len(pic_names))

['030_204.png', '031_017.png', '038_362.png', '038_404.png', '021_003.png']
5172


In [10]:
pic_names_remove = set(pic_names) - set(pic_names_keep)
print(len(pic_names_remove))

428


In [11]:
# Remove non species labeled images
[os.remove(data_path+"/bee_imgs/bee_imgs/"+name) for name in pic_names_remove]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [12]:
# Check new size
print(len(glob.glob(data_path+"/bee_imgs/bee_imgs/*.png")))

4744


In [13]:
data["id"] = range(0,len(data))
print(data.head())

          file    date   time           location  zip code         subspecies  \
0  017_029.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
1  017_015.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
2  017_001.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
3  017_000.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
4  017_014.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   

    health  pollen_carrying   caste  id  
0  healthy            False  worker   0  
1  healthy            False  worker   1  
2  healthy            False  worker   2  
3  healthy            False  worker   3  
4  healthy            False  worker   4  


In [14]:
# Rename files 
path_img = data_path+"/bee_imgs/bee_imgs/"
for i, name in enumerate(pic_names_keep):
    os.rename(path_img+name,path_img+str(i)+".png")

Examine classes of bees 

In [15]:
data['subspecies'].value_counts(ascending=True)

Western honey bee          37
VSH Italian honey bee     199
1 Mixed local stock 2     472
Carniolan honey bee       501
Russian honey bee         527
Italian honey bee        3008
Name: subspecies, dtype: int64

Add int value to represent types of bees

In [16]:
bee_types = data['subspecies'].values
values = []
for bee in bee_types:
    if bee == "Western honey bee":
        values.append(0)
    elif bee == "VSH Italian honey bee":
        values.append(1)
    elif bee == "1 Mixed local stock 2":
        values.append(2)
    elif bee == "Carniolan honey bee":
        values.append(3)
    elif bee == "Russian honey bee":
        values.append(4)
    elif bee == "Italian honey bee":
        values.append(5)

In [17]:
data['class_id'] = values
print(data.head())

          file    date   time           location  zip code         subspecies  \
0  017_029.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
1  017_015.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
2  017_001.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
3  017_000.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   
4  017_014.png  8/6/18  13:21  Saratoga, CA, USA     95070  Italian honey bee   

    health  pollen_carrying   caste  id  class_id  
0  healthy            False  worker   0         5  
1  healthy            False  worker   1         5  
2  healthy            False  worker   2         5  
3  healthy            False  worker   3         5  
4  healthy            False  worker   4         5  


In [18]:
data.to_csv(data_path+"/bee_data.csv")