# Process Zooniverse classifications of clips
Script to retrieve clip classification data from Zooniverse and update the Koster lab database.

## Download classifications from Zooniverse
Download the most up-to-date classifications provided by Zooniverse users to the Koster lab project (#9747) using the [Python SDK for Panoptes!](https://github.com/zooniverse/panoptes-python-client).
Note, only Zooniverse project collaborators can retrieve classifications from the Koster lab Zooniverse project.

### Import required packages

In [1]:
import os
import json
import numpy
import pandas as pd
import io
from panoptes_client import Project, Panoptes

### Specify project-specific info

In [2]:
# Connect to Zooniverse with your username and password.
# The credentials are read from the environment so that they never have to be
# typed into (and accidentally committed with) the notebook. When a variable is
# not set the value falls back to an empty string, exactly as before.
# NOTE(review): the variable names mirror the PANOPTES_* names the Panoptes
# client appears to use itself - confirm against the client documentation.
Panoptes.connect(username=os.environ.get('PANOPTES_USERNAME', ''),
                 password=os.environ.get('PANOPTES_PASSWORD', ''))

# Specify the project number of the koster lab
project = Project(9747)

# Specify the workflow of interest and its version
# (used further down to select the classifications to process)
workflow_1 = 11767
workflow_1_version = 227

### Download the Zooniverse classifications

In [3]:
# Request the project's classifications export from Zooniverse
export = project.get_export('classifications')

# Only these columns of the export are needed by the rest of the notebook
export_columns = ['subject_ids', 'classification_id', 'workflow_id',
                  'workflow_version', 'annotations']

# The response body is raw bytes: decode it and hand it to pandas as an
# in-memory text file so the CSV is parsed straight into a data frame
export_text = export.content.decode('utf-8')
rawdata = pd.read_csv(io.StringIO(export_text), usecols=export_columns)

### Select workflow 1 classifications

In [4]:
# Filter w1 classifications.
# The workflow id has to match exactly: comparing it with `>=` would also keep
# the classifications of every other workflow that has a larger id. The
# version stays a lower bound, so classifications made with the specified
# version or any later one are kept.
# reset_index() renumbers the rows and keeps the previous labels in an
# 'index' column.
w1_data = rawdata[(rawdata.workflow_id == workflow_1) &
                  (rawdata.workflow_version >= workflow_1_version)].reset_index()

# Drop workflow columns (no longer needed once the rows are selected)
w1_data = w1_data.drop(columns=['workflow_id', 'workflow_version'])

### Flatten the classifications

In [5]:
# Flatten the JSON annotations into one row per species annotation.
#
# Resulting columns:
#   classification_id          - id of the classification the row comes from
#   single_ann_clip_choice     - the species' 'choice' value
#   single_ann_clip_first_time - answer whose key contains 'FIRSTTIME', with
#                                the "S" characters removed ("" if missing)
#   single_ann_clip_how_many   - answer whose key contains 'INDIVIDUAL'
#                                ("" if missing)
flat_columns = ['classification_id', 'single_ann_clip_choice',
                'single_ann_clip_first_time', 'single_ann_clip_how_many']

# Rows are collected in a plain list and turned into a data frame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
flat_rows = []

for class_id, raw_annotations in zip(w1_data['classification_id'],
                                     w1_data['annotations']):
    #load annotations as json format
    annotations = json.loads(raw_annotations)

    #select the information from the species identification task
    for task_i in annotations:
        if task_i.get('task') != 'T4':
            continue

        #select each species annotation and flatten the relevant answers
        for species in task_i.get('value') or []:
            # Entries without a choice or without answers are skipped,
            # as they were before
            if 'choice' not in species or 'answers' not in species:
                continue

            # Start from empty values for every species so that a missing
            # answer never inherits the value of the previous annotation
            f_time = ""
            inds = ""

            #loop through the answers and add them to the row
            for key, answer in species['answers'].items():
                if 'FIRSTTIME' in key:
                    f_time = answer.replace("S", "")
                if 'INDIVIDUAL' in key:
                    inds = answer

            #include a new row with the species of choice and classification id
            flat_rows.append({'classification_id': class_id,
                              'single_ann_clip_choice': species['choice'],
                              'single_ann_clip_first_time': f_time,
                              'single_ann_clip_how_many': inds})

# Passing the column list keeps the expected layout even when no row was found
flat_data = pd.DataFrame(flat_rows, columns=flat_columns)

## Aggregate Zooniverse classifications
Combine the classifications of multiple users and aggregate them to have one classification per clip. For example, clip_001 contains "lobster" and "sponge" at seconds 1 and 6, respectively.

In [7]:
# Aggregate the flattened annotations into one row per subject and choice.

#Specify the type of columns (empty strings become NaN)
flat_data['single_ann_clip_how_many'] = pd.to_numeric(flat_data['single_ann_clip_how_many'])
flat_data['single_ann_clip_first_time'] = pd.to_numeric(flat_data['single_ann_clip_first_time'])

#Add the subject_ids to the dataframe
class_data = pd.merge(flat_data, w1_data.drop(columns=['annotations']), how='left', on='classification_id')

#Calculate the number of different classifications per subject
class_data["class_subject"] = class_data.groupby('subject_ids')['classification_id'].transform('nunique')

#Select subjects with at least 4 different classifications
# (.copy() so the columns added below are written to an independent frame
# rather than to a slice of the merged one)
class_data = class_data[class_data.class_subject > 3].copy()

#Calculate the proportion of users that agreed on their classifications
# Distinct classifications are counted (not rows) so that the same choice
# appearing twice within one classification is not counted as two votes and
# the proportion is consistent with class_subject.
class_data["class_n"] = class_data.groupby(['subject_ids','single_ann_clip_choice'])['classification_id'].transform('nunique')
class_data["class_prop"] = class_data.class_n/class_data.class_subject

#Select subjects where at least 80% of the users agree in their classification
# (inclusive bound: e.g. 4 out of 5 agreeing classifications are kept)
class_data = class_data[class_data.class_prop >= .8]

#extract the median of the second where the animal/object is and the number of animals
class_data = (
    class_data
    .groupby(['subject_ids','single_ann_clip_choice'], as_index=False)
    [['single_ann_clip_how_many', 'single_ann_clip_first_time']]
    .median()
)

#class_data.head()
#save csv file of the classified subjects
#class_data.to_csv(agg_w1_class, index = False, header=True)


Unnamed: 0,subject_ids,single_ann_clip_choice,single_ann_clip_how_many,single_ann_clip_first_time
0,39384186,NOTHINGHERE,,
1,39384253,DEEPSEAKINGCRAB,1.0,0.0
2,39384267,FISHANYSPECIES,1.0,1.0
3,39384291,NOTHINGHERE,,
4,39384292,NORWAYLOBSTER,2.0,0.0


### Update the koster lab database with aggregated classifications