<a href="https://colab.research.google.com/github/wildlifeai/pepeketua_zooniverse/blob/main/frog_zooniverse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the scripts to upload photos of Archey's frogs to a Zooniverse project and download labels of the landmarks of the frogs to train ML algorithms.

# Requirements

We use the "panoptes_client" package to communicate with Zooniverse. If you don't have it installed, run the command below.

In [None]:
!pip install panoptes_client

Load generic libraries

In [None]:
import getpass
import io
import json
import zipfile
from datetime import date

import numpy as np
import pandas as pd

from google.colab import drive
from panoptes_client import (
    SubjectSet,
    Subject,
    Project,
    Panoptes,
)

# Download frog photos

### Add shortcuts to the compressed photos

To download the photos of the frogs into this Google Colab you first need to add shortcuts in your Google drive to the [five zipped folders](https://drive.google.com/file/d/1XXSrATFX1l-J0CUE4m6UfoOBp9zv3XOr/view?usp=sharing) with the photos. 

To add the shortcuts:
* go to the "Shared with me" section in your Google drive,
* find the five zipped folders,
* click on "Add shortcut to Drive" and
* save the shortcuts (we created a folder called "frog_photos" and saved them there).

*Specify* the folder in your Google drive where you saved the shortcuts to the photos (in our case "frog_photos").

In [4]:
# Folder in the mounted Google Drive that holds the shortcuts to the zipped photos.
# Keep the trailing "/": the zip file names are appended to this string directly.
dir_shortcuts = "/content/drive/My Drive/frog_photos/"

**Note:** If you can't access the five zipped folders, please [email Victor](mailto:victor@wildlife.ai).

### Download the compressed photos

To download the five zip folders with the photos you will need to grant access to the Google file stream. 



In [None]:
# Mount the drive in colab
drive.mount('/content/drive/')

# Names (without extension) of the five zipped folders stored in `dir_shortcuts`
zip_names = ["whareorino_a", "whareorino_b", "whareorino_c", "whareorino_d", "pukeokahu"]

# Open the five zipped files. They are left open on purpose: the upload
# section extracts individual photos from these same handles.
zips = [zipfile.ZipFile(dir_shortcuts + name + ".zip", 'r') for name in zip_names]

# Keep every archive reachable under its own name as well
whareorino_a, whareorino_b, whareorino_c, whareorino_d, pukeokahu = zips

# Extract the filepath of the photos of individual frogs, skipping folder
# entries (names ending in '/') and files whose name ends in '.db' or 'Store'
pdList = [
    pd.DataFrame(
        [
            x
            for x in zip_file.namelist()
            if 'Individual Frogs' in x and not x.endswith(('.db', '/', 'Store'))
        ]
    )
    for zip_file in zips
]

# Combine the file paths of the five grids into a single data frame.
# ignore_index=True gives every photo its own row label instead of repeating
# the 0..n labels of each zip.
frog_df = pd.concat(pdList, ignore_index=True)


# Create a data frame with frog information

Create a data frame to keep track of the photos uploaded to Zooniverse

### Prepare information related to the photos

In [6]:
# Rename the column of df
frog_df = frog_df.rename(columns={0: "zip_path"})

# Split every zip path into its directory levels (one column per level)
directories = frog_df['zip_path'].str.split("/", n=4, expand=True)

# Add the grid, frog_id and filename cols from the path levels
frog_df["grid"] = directories[0]
frog_df["frog_id"] = directories[2]
frog_df["filename"] = directories[3]

# The capture is the text after the last "-" or "_" of the filename, once the
# extension is removed. `n` is passed by keyword because recent pandas
# versions no longer accept it positionally.
filename_stems = frog_df["filename"].str.split(".", n=1, expand=True)[0]
frog_df["capture"] = (
    filename_stems
    .str.replace('_', '-', regex=False)
    .str.rsplit("-", n=1, expand=True)[1]
)
                                

### Prepare information related to Zooniverse subjects

You need to specify your Zooniverse username and password. Uploading and downloading information from Zooniverse is only available to users with access to the project.

In [7]:
# Ask for the Zooniverse credentials at run time so they are never saved in
# the notebook; getpass also hides the password while it is typed.
zoo_user = input("Zooniverse username: ")
zoo_pass = getpass.getpass("Zooniverse password: ")

In [8]:
class AuthenticationError(Exception):
    """Raised when the Zooniverse login did not succeed."""


# Connect to Zooniverse with your username and password
auth = Panoptes.connect(username=zoo_user, password=zoo_pass)

if not auth.logged_in:
    raise AuthenticationError("Your credentials are invalid. Please try again.")

# Connect to the Zooniverse project (our frog project # is 13355)
project = Project(13355)

# Get info of subjects uploaded to the project
export = project.get_export("subjects")

# Save the subjects info as pandas data frame
subjects_df = pd.read_csv(
    io.StringIO(export.content.decode("utf-8")),
    usecols=[
        "subject_id",
        "metadata",
    ],
)

# Use a clean 0..n index so the flattened metadata lines up row by row
subj_df = subjects_df.reset_index(drop=True)

# Flatten the metadata (a JSON string per subject) into one column per key
meta_df = pd.json_normalize(subj_df.metadata.apply(json.loads))

# Replace the raw metadata column with the flattened columns
subj_df = pd.concat([subj_df.drop(columns=["metadata"]), meta_df], axis=1)

# Add the subject_id of photos already uploaded to Zooniverse.
# Columns left behind by a previous run of this cell are dropped first, so
# re-running it does not create "_x"/"_y" duplicates.
merge_keys = ["grid", "capture", "frog_id", "filename"]
stale_cols = [
    col for col in subj_df.columns
    if col not in merge_keys and col in frog_df.columns
]
frog_df = pd.merge(frog_df.drop(columns=stale_cols), subj_df,
                   how="left", on=merge_keys)


# Upload new photos to Zooniverse

### Temporarily download photos to colab

In [None]:
# Specify the directory in colab to temporarily save the photos
tmp_dir = 'photos_ulpoad/'

# Specify the number of photos to upload
n_photos = 200

# Photos without a subject_id have not been uploaded to Zooniverse yet
pending_photos = frog_df[frog_df['subject_id'].isnull()]

# Randomly select up to n photos (fewer when not enough are left, instead of
# failing). The copy makes the selection independent from frog_df, so adding
# a column to it is safe.
photos_upload = pending_photos.sample(min(n_photos, len(pending_photos))).copy()

# Placeholder (object dtype) for the colab path of every extracted photo
photos_upload["photo_path"] = None

# Paths inside the zips of the selected photos
paths_to_extract = set(photos_upload['zip_path'])

for zip_file in zips:
    # Only the selected photos that live in this zip
    for fileName in paths_to_extract.intersection(zip_file.namelist()):
        # Extract a single file from zip
        zip_file.extract(fileName, tmp_dir)
        # Include the colab path of the photo in the df
        photos_upload.loc[photos_upload['zip_path'].eq(fileName), 'photo_path'] = tmp_dir + fileName

# Report the photos that were actually extracted
n_downloaded = int(photos_upload['photo_path'].notna().sum())
print(n_downloaded, "photos have been temporarily downloaded to", tmp_dir)

Check metadata info makes sense before uploading the photos

In [None]:
# Display the selected photos and their metadata for a visual check
photos_upload

### Upload photos to Zooniverse

In [None]:
# Keep only the photos that were extracted (they have a photo_path) and
# select the photo_path and other columns that will be used as metadata
photos_upload = photos_upload.dropna(subset=["photo_path"])[
    [
        "photo_path",
        "filename",
        "capture",
        "frog_id",
        "grid",
    ]
]

# Save the df as the subject metadata: {photo_path: {column: value}}
subject_metadata = photos_upload.set_index('photo_path').to_dict('index')

# Create a subject set in Zooniverse to host the photos. The name carries the
# number of photos in this batch and today's date.
subject_set = SubjectSet()

subject_set.links.project = project
subject_set.display_name = (
    "training_" + str(len(subject_metadata)) + date.today().strftime("_%d_%m_%Y")
)

subject_set.save()

print("Zooniverse subject set created")


# Upload the photos to Zooniverse (with metadata)
new_subjects = []

for photo_path, metadata in subject_metadata.items():
    subject = Subject()

    subject.links.project = project
    subject.add_location(photo_path)

    subject.metadata.update(metadata)

    subject.save()
    new_subjects.append(subject)

# Link the saved subjects to the subject set
subject_set.add(new_subjects)

print("Subjects uploaded to Zooniverse")


# Download Zooniverse annotations

In [10]:
# Get the export classifications
export = project.get_export("classifications")

# Save the response as pandas data frame
classifications = pd.read_csv(
    io.StringIO(export.content.decode("utf-8")),
    usecols=[
        "user_name",
        "subject_ids",
        "subject_data",
        "classification_id",
        #"workflow_id",
        #"workflow_version",
        "annotations",
    ],
)

# Convert JSON strings into Python dictionaries, providing access to key-value pairs.
classifications['annotations'] = classifications['annotations'].apply(json.loads)

# Flatten annotations: one record (x, y, label, classification_id) per landmark
records = []

for class_id, tasks in zip(classifications['classification_id'],
                           classifications['annotations']):
    for t in tasks:
        # Select survey Task = T0
        if t['task'] != 'T0':
            continue

        if t['value']:
            for mark in t['value']:
                records.append((mark['x'], mark['y'], mark['tool_label'], class_id))
        else:
            # Keep a row for classifications without landmarks. Missing values
            # (not empty strings) keep the x and y columns numeric.
            records.append((np.nan, np.nan, None, class_id))

# Combine all the annotations into a data frame
annotations = pd.DataFrame(records, columns=['x', 'y', 'label', 'classification_id'])

# The raw annotations are no longer needed once flattened
classifications = classifications.drop(columns=["annotations"])

# Add metadata information based on the classification id
flat_anotations = pd.merge(annotations, classifications,
                           how="left", on=["classification_id"])


flat_anotations

Unnamed: 0,x,y,label,classification_id,user_name,subject_data,subject_ids
0,629.945068,877.188782,Tip of snout,278017541,victorav,"{""50445441"":{""retired"":null,""grid"":""Grid A"",""c...",50445441
1,1227.169678,910.180786,Vent,278017541,victorav,"{""50445441"":{""retired"":null,""grid"":""Grid A"",""c...",50445441
2,733.764771,775.934021,Right eye,278017541,victorav,"{""50445441"":{""retired"":null,""grid"":""Grid A"",""c...",50445441
3,738.509338,991.895691,Left eye,278017541,victorav,"{""50445441"":{""retired"":null,""grid"":""Grid A"",""c...",50445441
4,912.628296,750.835022,Right front leg,278017541,victorav,"{""50445441"":{""retired"":null,""grid"":""Grid A"",""c...",50445441
...,...,...,...,...,...,...,...
169,855.117126,768.186340,Vent,278610494,not-logged-in-01bb5723fed659cb81db,"{""50445442"":{""retired"":null,""grid"":""Grid D"",""c...",50445442
170,462.959106,673.481506,Right eye,278610494,not-logged-in-01bb5723fed659cb81db,"{""50445442"":{""retired"":null,""grid"":""Grid D"",""c...",50445442
171,445.618744,861.557312,Left eye,278610494,not-logged-in-01bb5723fed659cb81db,"{""50445442"":{""retired"":null,""grid"":""Grid D"",""c...",50445442
172,587.009094,678.817078,Right front leg,278610494,not-logged-in-01bb5723fed659cb81db,"{""50445442"":{""retired"":null,""grid"":""Grid D"",""c...",50445442


Compare the accuracy difference between three different users

In [48]:
# Keep the landmarks that were annotated exactly three times for the same photo
group_keys = ['subject_ids', 'label']
duplicated_annotations = flat_anotations.groupby(group_keys).filter(
    lambda landmark_rows: len(landmark_rows) == 3
)

# Possible follow-up: spread (max/min) of the coordinates per photo and landmark
#max_min_duplicates=duplicated_annotations.groupby(['subject_ids','label']).agg({'x':['max','min'],'y':['max','min']})

# Show the coordinates given by each user next to each other, rounded to one decimal
(
    duplicated_annotations
    .sort_values(by=['label', 'subject_ids'])
    .loc[:, ['x', 'y', 'label', 'user_name', 'subject_ids']]
    .round({'x': 1, 'y': 1})
)


Unnamed: 0,x,y,label,user_name,subject_ids
33,1354.6,988.9,Left eye,victorav,50445427
123,1350.6,993.3,Left eye,not-logged-in-9b11fa685869df9202f1,50445427
153,1348.0,989.7,Left eye,not-logged-in-56f43c6d74e436d31d10,50445427
117,455.2,854.5,Left eye,victorav,50445442
129,441.4,852.9,Left eye,not-logged-in-3722cd2cf454b732ca5e,50445442
171,445.6,861.6,Left eye,not-logged-in-01bb5723fed659cb81db,50445442
35,1457.4,990.9,Left front leg,victorav,50445427
125,1460.2,993.3,Left front leg,not-logged-in-9b11fa685869df9202f1,50445427
155,1460.8,979.1,Left front leg,not-logged-in-56f43c6d74e436d31d10,50445427
119,583.4,863.7,Left front leg,victorav,50445442


Code leftovers

In [None]:
import os 
import pandas as pd

def _capture_from_filename(doc):
    """Return the text after the last "-" or "_" of the filename (extension
    removed), or None when the filename has no such separator."""
    stem = doc.split(".", 1)[0].replace('_', '-')
    parts = stem.rsplit("-", 1)
    return parts[1] if len(parts) == 2 else None


def _subdirectories(parent_path, must_contain=''):
    """Yield (name, path) of the folders inside parent_path whose name contains
    must_contain and does not end in 'db'. Plain files are skipped."""
    for name in os.listdir(parent_path):
        path = parent_path + "/" + name
        if must_contain in name and not name.endswith('db') and os.path.isdir(path):
            yield name, path


# Create a df of the photos found in the tmp folder
# ('..' + '/tmp' keeps the same path strings as '../tmp/' + name)
data = []
# Loop through each 'Grid' folder in the tmp directory
for grid, grid_path in _subdirectories('../tmp', 'Grid'):
    # Loop through each 'Individual' subfolder in the 'Grid' directories
    for subfolder, subfolder_path in _subdirectories(grid_path, 'Individual'):
        # Loop through each individual frog in the "individual frog" directory
        for ind, ind_path in _subdirectories(subfolder_path):
            # Loop through each photo of the "individual" frog
            for doc in os.listdir(ind_path):
                # Save information about the photo and the frog
                if not doc.endswith('db'):
                    fpath = ind_path + "/" + doc
                    data.append((doc, fpath, _capture_from_filename(doc), ind, grid))

df = pd.DataFrame(data, columns=['filename', 'file_path', 'capture', 'frog_id', 'grid'])