# Koster data to excel

The following scripts retrieve the annotations, comments and tags from the Koster seafloor observatory project on Zooniverse and convert them into an Excel-friendly (csv) format.

# Requirements

### Install required packages

We use the "panoptes_client" package to communicate with Zooniverse. If you don't have it installed, run the command below.

In [1]:
!pip install panoptes_client

Collecting panoptes_client
  Downloading https://files.pythonhosted.org/packages/55/6d/09aee478aedcbdc87825eb39bb8593392dc1743b3066d25ba9ec35aa75b0/panoptes_client-1.3.0.tar.gz
Collecting python-magic<0.5,>=0.4
  Downloading https://files.pythonhosted.org/packages/59/77/c76dc35249df428ce2c38a3196e2b2e8f9d2f847a8ca1d4d7a3973c28601/python_magic-0.4.18-py2.py3-none-any.whl
Collecting redo>=1.7
  Downloading https://files.pythonhosted.org/packages/f0/df/6eaeece84b3b6a51663075ae25089ec9b49e90b687ddca6f1fe0f93ab091/redo-2.0.4.tar.gz
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
    Preparing wheel metadata ... done
Building wheels for collected packages: redo
  Building wheel for redo (PEP 517) ... done
  Created wheel for redo: filename=redo-2.0.4-cp36-none-any.whl size=11931 sha256=85c19a343973632767be3c204c02b06092e1887ac85976452b2ec6e647c6b2c6
  Stored in directory: /root/.cache/pip/wheels/7e/ca/39/

### Load required libraries

In [79]:
import io
import zipfile
import json
import gzip
import pandas as pd
import numpy as np

from google.colab import drive
from datetime import date
from panoptes_client import (
    SubjectSet,
    Subject,
    Project,
    Panoptes,
) 

### Connect to Zooniverse

In [80]:
# Placeholder credentials: replace them with your own Zooniverse username and
# password before running (and avoid sharing the notebook with real values)
zoo_user = "user"
zoo_pass = "pass"

# Connect to Zooniverse with your username and password
auth = Panoptes.connect(username=zoo_user, password=zoo_pass)

# Stop early if the login did not succeed. A built-in exception is raised
# because no "AuthenticationError" is defined or imported in this notebook,
# so referring to that name would itself fail with a NameError.
if not auth.logged_in:
    raise ValueError("Your credentials are invalid. Please try again.")

# Connect to the Zooniverse project (our project # is 9747)
project = Project(9747)

# Download Zooniverse subjects information

In [81]:
# Columns of the subjects export that are used further down
subject_columns = [
    "subject_id",
    "metadata",
    "created_at",
    "workflow_id",
    "subject_set_id",
    "classifications_count",
    "retired_at",
    "retirement_reason",
]

# Request the subjects export of the project
export = project.get_export("subjects")

# Read the csv text of the export into a pandas data frame
subjects_csv = io.StringIO(export.content.decode("utf-8"))
subjects_df = pd.read_csv(subjects_csv, usecols=subject_columns)

## Format subject information

### Define project-specific functions

Function to extract the metadata from subjects

In [82]:
def extract_metadata(subj_df):
    """Split a subjects data frame into its plain columns and its metadata.

    Args:
        subj_df: data frame with a "metadata" column holding JSON strings.

    Returns:
        A tuple ``(subj_df, meta_df)``. ``subj_df`` is the input renumbered
        from 0 and without the "metadata" column (nor an "index" column, if
        one was present). ``meta_df`` holds the flattened metadata, one row
        per subject in the same order, so both frames can be concatenated
        side by side.
    """
    # Renumber the rows from 0 so they line up with the metadata rows
    subj_df = subj_df.reset_index(drop=True)

    # Flatten the metadata information (nested keys become "outer.inner")
    meta_df = pd.json_normalize(subj_df["metadata"].apply(json.loads).tolist())

    # Drop the metadata column, plus a leftover "index" column when the caller
    # had already reset the index; tolerating its absence avoids the stray
    # "level_0" column a second reset_index() would otherwise create
    subj_df = subj_df.drop(columns=["metadata", "index"], errors="ignore")

    return subj_df, meta_df

### Format subjects uploaded automatically

In [83]:
# Date from which subjects were uploaded automatically
first_auto_upload = "2020-05-29 00:00:00 UTC"

# Keep the subjects created after that date (the creation time is compared
# against the date string above)
uploaded_automatically = subjects_df["created_at"] > first_auto_upload
auto_subjects_df = subjects_df[uploaded_automatically]

# Split the selection into its plain columns and its flattened metadata
auto_subjects_df, auto_subjects_meta = extract_metadata(auto_subjects_df)

# Put both parts side by side and keep only the relevant columns
auto_subjects_df = pd.concat(
    [auto_subjects_df, auto_subjects_meta], axis=1
)[["subject_id", "retired_at", "subject_type"]]

### Format subjects uploaded manually

In [84]:
# Specify the starting date when clips were manually uploaded
first_manual_upload = "2019-11-17 00:00:00 UTC"

# Select subjects uploaded manually: their metadata mentions an .mp4 file and
# they were created between the first manual and the first automatic upload.
# regex=False makes ".mp4" a literal match; read as a regular expression the
# dot would match any character (e.g. "xmp4").
is_mp4 = subjects_df["metadata"].str.contains(".mp4", regex=False)
in_manual_period = subjects_df["created_at"].between(
    first_manual_upload, first_auto_upload
)

# Renumber the rows without keeping the old index as an extra column
man_clips_df = subjects_df[is_mp4 & in_manual_period].reset_index(drop=True)

# Specify the type of subject
man_clips_df["subject_type"] = "clip"

# Extract metadata from manually uploaded clips
man_clips_df, man_clips_meta = extract_metadata(man_clips_df)

# Combine metadata info with the subjects df
man_clips_df = pd.concat([man_clips_df, man_clips_meta], axis=1)

# Select only relevant columns
man_clips_df = man_clips_df[["subject_id", "retired_at", "subject_type"]]

# Combine all uploaded subjects (outer merge on the three shared columns)
subjects = pd.merge(man_clips_df, auto_subjects_df, how="outer")

# Download Zooniverse classifications information

In [85]:
# Columns of the classifications export that are used further down
classification_columns = [
    "subject_ids",
    "classification_id",
    "workflow_id",
    "workflow_version",
    "annotations",
    "created_at",
    "user_name",
]

# Request the classifications export of the project
export = project.get_export("classifications")

# Read the csv text of the export into a pandas data frame
classifications_csv = io.StringIO(export.content.decode("utf-8"))
class_df = pd.read_csv(classifications_csv, usecols=classification_columns)

## Specify the video and frame workflows

In [86]:
# Clip (video) workflow: id and lowest workflow version to include
workflow_clip, workflow_clip_version = 11767, 227

# Frame workflow: id and lowest workflow version to include
# Should this be 21.43? (the exploratory cells at the end of the notebook
# filter with 21.43)
workflow_frame, workflow_frame_version = 12852, 21.85

### Format video annotations

In [87]:
def _flatten_clip_choices(classification_id, annotations):
    """Flatten the species answers (task "T4") of one clip classification.

    Args:
        classification_id: id stored with every row that is produced.
        annotations: decoded "annotations" value of the classification,
            i.e. a list of dicts with "task" and "value" keys.

    Returns:
        A list with one dict per choice made in task "T4", with the keys
        "classification_id", "label", "first_seen" and "how_many".
    """
    rows = []
    for ann_i in annotations:
        # Only the species identification task is of interest
        if ann_i["task"] != "T4":
            continue

        for value_i in ann_i["value"]:
            # Start every choice from blank follow-up answers so a missing
            # answer never inherits the value of the previous choice
            f_time = ""
            inds = ""

            # 'Nothing here' has no follow-up answers; species choices do
            if value_i["choice"] != "NOTHINGHERE":
                for key, answer in value_i["answers"].items():
                    if "FIRSTTIME" in key:
                        f_time = answer.replace("S", "")
                    if "INDIVIDUAL" in key:
                        inds = answer

            rows.append(
                {
                    "classification_id": classification_id,
                    "label": value_i["choice"],
                    "first_seen": f_time,
                    "how_many": inds,
                }
            )
    return rows


# Filter clip classifications
# NOTE(review): workflow ids are matched with ">=", so classifications of any
# workflow with a larger id that also meets the version threshold are kept;
# confirm that "==" is not what was intended.
class_clip = class_df[
    (class_df.workflow_id >= workflow_clip)
    & (class_df.workflow_version >= workflow_clip_version)
].reset_index()

# Flatten the annotations of each classification submitted by the users
rows_list = []
for class_id, raw_annotations in zip(
    class_clip["classification_id"], class_clip["annotations"]
):
    rows_list.extend(
        _flatten_clip_choices(class_id, json.loads(raw_annotations))
    )

# Create a data frame with annotations as rows
class_clips_df = pd.DataFrame(
    rows_list, columns=["classification_id", "label", "first_seen", "how_many"]
)

# Specify the type of columns of the df
class_clips_df["how_many"] = pd.to_numeric(class_clips_df["how_many"])
class_clips_df["first_seen"] = pd.to_numeric(class_clips_df["first_seen"])

# Add subject id (and the other classification columns) to each annotation
class_clips_df = pd.merge(
    class_clips_df,
    class_clip.drop(columns=["annotations"]),
    how="left",
    on="classification_id",
)

## Format frame annotations

In [88]:
# Keep the classifications that pass the frame workflow id/version thresholds
is_frame_classification = (class_df.workflow_id >= workflow_frame) & (
    class_df.workflow_version >= workflow_frame_version
)
class_frame = class_df[is_frame_classification].reset_index()

# One row per annotation of every classification submitted by the users:
# an annotation with an empty value is labelled "no_coral", otherwise "coral"
rows_list = []
for _, classification in class_frame.iterrows():
    for ann_i in json.loads(classification["annotations"]):
        rows_list.append(
            {
                "classification_id": classification["classification_id"],
                "label": "coral" if ann_i["value"] else "no_coral",
            }
        )

# Turn the collected rows into a data frame
class_frame_df = pd.DataFrame(rows_list, columns=["classification_id", "label"])

# Attach the remaining classification columns (including the subject id)
class_frame_df = class_frame_df.merge(
    class_frame.drop(columns=["annotations"]),
    how="left",
    on="classification_id",
)

## Combine classifications and subject information

In [89]:
# Put clip and frame annotations into a single data frame (outer merge on
# the columns both frames share)
annot_df = pd.merge(class_clips_df, class_frame_df, how="outer")

# Remove the workflow columns and rename subject_ids to subject_id
annot_df = annot_df.drop(columns=["workflow_id", "workflow_version"]).rename(
    columns={"subject_ids": "subject_id"}
)

# Attach the subject information to every annotation
annot_df = annot_df.merge(subjects, how="left", on="subject_id")

## Save classifications as csv file

In [90]:
# Write the combined annotations (data frame index included) to a csv file
annot_df.to_csv("annotations_data.csv")

# Download Zooniverse comments

In [91]:
# Get comments from Zooniverse and decompress the gzip payload
# NOTE(review): the recorded run of this cell ended in an IndexError;
# presumably no talk_comments export was available yet - confirm that one
# has been generated for the project.
export = project.get_export("talk_comments")
export = gzip.decompress(export.content)

# Decode the payload once and keep only the JSON array it contains, i.e.
# everything from the first "[" up to and including the last "]"
export_text = export.decode("utf-8")
json_start = export_text.find("[")
json_end = export_text.rfind("]") + 1
if json_start == -1 or json_end == 0:
    raise ValueError("No JSON array found in the talk_comments export.")

# Save the response as pandas data frame
data = json.loads(export_text[json_start:json_end])
comment_df = pd.DataFrame(data)[
    [
        "board_title",
        "comment_body",
        "comment_focus_id",
        "comment_id",
        "discussion_title",
        "comment_created_at",
        "comment_user_login",
    ]
]

IndexError: ignored

## Combine comments and subject information

In [92]:
# Rename comment_focus_id to subject_id, then attach the subject
# information to every comment
comment_df = comment_df.rename(
    columns={"comment_focus_id": "subject_id"}
).merge(subjects, how="left", on="subject_id")

NameError: ignored

In [None]:
# Remove comments from the Zooniverse team (i.e. non-user comments)
comment_df = comment_df.dropna(subset=['subject_id'])

# Download Zooniverse tags

In [None]:
# Get tags from Zooniverse and decompress the gzip payload
export = project.get_export("talk_tags")
export = gzip.decompress(export.content)

# Decode the payload once and keep only the JSON array it contains, i.e.
# everything from the first "[" up to and including the last "]"
export_text = export.decode("utf-8")
json_start = export_text.find("[")
json_end = export_text.rfind("]") + 1
if json_start == -1 or json_end == 0:
    raise ValueError("No JSON array found in the talk_tags export.")

# Save the response as pandas data frame
data = json.loads(export_text[json_start:json_end])
tag_df = pd.DataFrame(data)[["name", "comment_id"]]

## Combine tags and comments information

In [None]:
# Attach the tags to the comments they belong to (matched on comment_id;
# comments without a tag are kept)
comment_df = comment_df.merge(tag_df, how="left", on="comment_id")

## Save comments as csv file

In [None]:
# Write the comments with their tags (data frame index included) to a csv file
comment_df.to_csv("comments_data.csv")

Find out the period when the clip and frame workflows were active 


In [107]:
# Annotations made on subjects whose type is "frame"
annot_frames = annot_df[annot_df["subject_type"] == "frame"]

# Earliest creation time among the frame annotations
first_day = annot_frames["created_at"].min()

# Latest retirement time among the annotated frames
last_day = annot_frames["retired_at"].max()

In [108]:
# May 16-19 classifications (old subject set)

In [109]:
# Earliest classification created before 2020-05-20 that passes the frame
# workflow id/version thresholds used for the old subject set
class_df.loc[
    (class_df.created_at < "2020-05-20")
    & (class_df.workflow_id >= 12852)
    & (class_df.workflow_version >= 21.43),
    "created_at",
].min()

'2020-05-16 21:33:04 UTC'

In [110]:
# Latest classification created before 2020-05-20 that passes the frame
# workflow id/version thresholds used for the old subject set
class_df.loc[
    (class_df.created_at < "2020-05-20")
    & (class_df.workflow_id >= 12852)
    & (class_df.workflow_version >= 21.43),
    "created_at",
].max()

'2020-05-19 22:34:12 UTC'

In [111]:
# New subject set

In [112]:
# Display the creation time of the first frame annotation
first_day

'2020-05-29 07:39:06 UTC'

In [113]:
# Display the latest retirement time among the annotated frames
last_day

'2020-06-01 15:08:30 UTC'

In [None]:
# END