# Process Zooniverse classifications of clips
Script to retrieve clip classification data from Zooniverse and update the Koster lab database.

## Download classifications from Zooniverse
Download the most up-to-date classifications provided by Zooniverse users to the Koster lab project (#9747) using the [Python SDK for Panoptes!](https://github.com/zooniverse/panoptes-python-client).
Note, only Zooniverse project collaborators can retrieve classifications from the Koster lab Zooniverse project.

### Import required packages

In [1]:
import os
import csv
import json
import sys
import operator
import requests
import pandas as pd
from datetime import datetime
from panoptes_client import Project, Panoptes

csv.field_size_limit(sys.maxsize)

131072

### Specify project-specific info

In [2]:
# Connect to Zooniverse with your username and password
Panoptes.connect(username='', password='')

# Specify the project number of the koster lab
project = Project(9747)

# Specify the workflow of interest and its version
workflow_1 = 11767
workflow_1_version = 227

# Specify the location to write the csv files                
all_class = '../all_classifications.csv'
w1_class = '../workflow1_classifications.csv'
out_w1_class = '../flatten_class_w1.csv' 
agg_w1_class = '../agg_class_w1.csv' 

### Define the functions to download the classifications

In [3]:
def download_file(url, dstn):
    request = requests.get(url, stream=True)
    with open(dstn, 'wb') as dstn_f:
        for chunk in request.iter_content(chunk_size=4096):
            dstn_f.write(chunk)
    return dstn


def download_exports(projt, dstn_cl):
    # replace path and filename strings for where you want the exports saved in the next two lines:
    try:
        meta_class = projt.describe_export('classifications')
        generated = meta_class['media'][0]['updated_at'][0:19]
        tdelta = (datetime.now() - datetime.strptime(generated, '%Y-%m-%dT%H:%M:%S')).total_seconds()
        age = (300 + int(tdelta / 60))
        print(str(datetime.now())[0:19] + '  Classifications export', age, ' hours old')
        url_class = meta_class['media'][0]['src']
        file_class = download_file(url_class, dstn_cl)
        print(str(datetime.now())[0:19] + '  ' + file_class + ' downloaded')
    except:
        print(str(datetime.now())[0:19] + '  Classifications download did not complete')
        return False
    return True


def include_class(class_record):
    #  define a function that returns True or False based on whether the argument record is to be included or not in
    #  the output file based on the conditional clauses.
    #  many other conditions could be set up to determine if a record is to be processed and the flattened data
    #  written to the output file. Any or all of these conditional tests that are not needed can be deleted or
    # commented out with '#' placed in front of the line(s)of code that are not required.

    if int(class_record['workflow_id']) == workflow_1:
        pass  # replace'!= 0000' with '== xxxx' where xxxx is the workflow to include.  This is also useful to
        # exclude a specific workflow as well.
    else:
        return False
    if float(class_record['workflow_version']) >= workflow_1_version:
        pass  # replace '001.01' with first version of the workflow to include.
    else:
        return False
    if '2100-00-10 00:00:00 UTC' >= class_record['created_at'] >= '2000-00-10 00:00:00 UTC':
        pass  # replace earliest and latest created_at date and times to select records commenced in a
        #  specific time period
    else:
        return False
    # otherwise :
    return True


def slice_exports(dstn_cl, out_location_cl):
    with open(out_location_cl, 'w', newline='') as file:
        fieldnames = ['classification_id',
                      'user_name', 'user_id',
                      'user_ip', 'workflow_id',
                      'workflow_name',
                      'workflow_version',
                      'created_at',
                      'gold_standard',
                      'expert',
                      'metadata',
                      'annotations',
                      'subject_data',
                      'subject_ids']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # this area for initializing counters, status lists and loading pick lists into memory:
        i = 0
        j = 0

        #  open the zooniverse data file using dictreader
        with open(dstn_cl) as f:
            r = csv.DictReader(f)
            for row in r:
                i += 1
                if include_class(row):
                    j += 1
                    # This set up the writer to match the field names above and the variable names of their values:
                    writer.writerow({'classification_id': row['classification_id'],
                                     'user_name': row['user_name'],
                                     'user_id': row['user_id'],
                                     'user_ip': row['user_ip'],
                                     'workflow_id': row['workflow_id'],
                                     'workflow_name': row['workflow_name'],
                                     'workflow_version': row['workflow_version'],
                                     'created_at': row['created_at'],
                                     'gold_standard': row['gold_standard'],
                                     'expert': row['expert'],
                                     'metadata': row['metadata'],
                                     'annotations': row['annotations'],
                                     'subject_data': row['subject_data'],
                                     'subject_ids': row['subject_ids']})

    # This area prints some basic process info and status
    print(str(datetime.now())[0:19] + '  Classification file:' +
          ' ' + str(i) + ' lines read and inspected' + ' ' + str(j) + ' records selected and copied')

    k = 0
    m = 0
    return True

### Download the classifications from Zooniverse

In [4]:
if __name__ == '__main__':
    print(download_exports(project, all_class))
    print(slice_exports(all_class, w1_class))

2020-03-25 12:15:07  Classifications export 2652  hours old
2020-03-25 12:15:09  ../all_classifications.csv downloaded
True
2020-03-25 12:15:09  Classification file: 2031 lines read and inspected 1850 records selected and copied
True


## Aggregrate Zooniverse classifications
Combine the classifications of multiple users and aggregrate them to have one classification per clip. For example, clip_001 contains "lobster" and "sponge" at second 1 and 6, respectively)

### Define the functions to aggregate the classifications

In [5]:
def include(class_record):
    #  define a function that returns True or False based on whether the argument record is to be
    #  included or not in the output file based on the conditional clauses.
    #  many other conditions could be set up to determine if a record is to be processed and the
    #  flattened data written to the output file (see flatten_class_frame for more options).

    if int(class_record['workflow_id']) == workflow_1 :
        pass  # this one selects the workflow to include.
    else:
        return False
    if float(class_record['workflow_version']) >= workflow_1_version:
        pass  # this one selects the first version of the workflow to include. Note the workflows
        #  must be compatible with the structure (task numbers) choices, and questions (they could
        #  differ in confusions, characteristics or other wording differences.)
    else:
        return False
    # otherwise :
    return True

def empty(ques, resp):
    blank = []
    for q1 in range(0, len(ques)):
        blank.append([0 for r1 in resp[q1]])
    return blank



### Set up output file structure with desired fields
The list of field names must include each field required in the output. The names, and order must be exactly the same here as in the writer statement near the end of the program. The names and order are arbitrary - your choice, as long as they are the same in both locations.
Additional fields from the classification file can be added or removed as required.  The other flatten_class blocks could be added to this demo similarly as they are added to flatten-class_frame either the general utility blocks or any other blocks if the workflow has more that the one survey task in it. These blocks should be added before the first survey task immediately after "for task in annotations'. As code blocks are added to flatten the annotations JSON, columns need to be added to contain each newly split out group of data. Add each one using the format "' new_field_name'," .  Similarly fields can be removed from both places to reduce the file size if the information is not needed for the current purpose.

In [6]:
with open(out_w1_class, 'w', newline='') as ou_file:
    fieldnames = ['classification_id',
                  'subject_ids',
                  'created_at',
                  'user_name',
                  'user_ip',
                  'single_ann_clip_choice',
                  'single_ann_clip_how_many',
                  'single_ann_clip_first_time',
                  'single_ann_clip_trash'
                  ]
    writer = csv.DictWriter(ou_file, fieldnames=fieldnames)
    writer.writeheader()

    # this area for initializing counters, status lists and loading pick lists into memory:
    rc2 = 0
    rc1 = 0
    wc1 = 0
     
    #  create a dictionary with the same question and response labels as the project builder.
    questions = ['INDIVIDUAL','FIRSTTIME','TYPEOFOBJECT']
    responses = [['1','2','3','4'], ['0S','1S','2S','3S','4S','5S','6S','7S','8S','9S'], ['FISHING', 'MATERIALLITTER']]

    
    #  open the zooniverse data file using dictreader, and load the more complex json strings
    #  as python objects using json.loads()
    with open(w1_class) as class_file:
        classifications = csv.DictReader(class_file)
        for row in classifications:
            rc2 += 1
            # useful for debugging - set the number of record to process at a low number ~1000
            if rc2 == 150000:  # one more than the last line of zooniverse file to read if not EOF
                break
            if include(row) is True:
                rc1 += 1
                annotations = json.loads(row['annotations'])

                # reset field variables for the survey task for each new row
                choice = ''
                answer = ['' for q4 in questions]

                for task in annotations:
                    # If the workflow has additional tasks or you want to add other general utilities
                    # blocks, put them here before the survey task, so the writer block will have the
                    # all the data it needs prior to the end of the survey task block.

                    # The survey task block:
                    try:
                        #  main survey task recognized by project specific task number - in this case 'T0'
                        #  you need this to match your own project - it may be different!
                        if task['task'] == 'T4':
                            try:
                                for species in task['value']:
                                    choice = species['choice']
                                    answer_vector = empty(questions, responses)
                                    
                        
                                    for q in range(0, len(questions)):
                                        try:
                                            answer[q] = [val for key, val in species['answers'].items() if questions[q] in key]    
                                        except KeyError:
                                            continue
                                            
                                    # This sets up the writer to match the field names above and the
                                    # variable names of their values. Note we write one line per
                                    # subject_choices:
                                    wc1 += 1
                                    writer.writerow({'classification_id': row['classification_id'],
                                                     'subject_ids': row['subject_ids'],
                                                     'created_at': row['created_at'],
                                                     'user_name': row['user_name'],
                                                     'user_ip': row['user_ip'],
                                                     'single_ann_clip_choice': choice,
                                                     'single_ann_clip_how_many': ''.join(map(str, answer[0])),
                                                     'single_ann_clip_first_time': ''.join(map(str, answer[1])).replace("S",""),
                                                     'single_ann_clip_trash': ''.join(map(str, answer[2]))
                                                     })
                            except KeyError:
                                continue
                    except KeyError:
                        continue

# This area prints some basic process info and status
print(rc2, 'lines read and inspected.', rc1, 'records processed and', wc1, 'lines written.')

1850 lines read and inspected. 1850 records processed and 2543 lines written.


## Aggregrate the classifications

In [7]:
#Load the csv as df to handle with pandas
w1_data = pd.read_csv(out_w1_class)

#Calculate the number of different classifications per subject 
w1_data["class_subject"] = w1_data.groupby('subject_ids')['classification_id'].transform('nunique')

#Select subjects with at least 4 different classifications
w1_data = w1_data[w1_data.class_subject > 3]

#Calculate the proportion of users that agreed on their classifications
w1_data["class_n"] = w1_data.groupby(['subject_ids','single_ann_clip_choice'])['classification_id'].transform('count')
w1_data["class_prop"] = w1_data.class_n/w1_data.class_subject

#Select subjects where at least 80% of the users agree in their classification
w1_data = w1_data[w1_data.class_prop > .8]

#extract the median of the second where the animal/object is and the number of animals
w1_data = w1_data.groupby(['subject_ids','single_ann_clip_choice'], as_index=False)
w1_data = pd.DataFrame(w1_data[['single_ann_clip_how_many', 'single_ann_clip_first_time']].median())

#save csv file of the classified subjects
w1_data.to_csv (agg_w1_class, index = False, header=True)


### Update the koster lab database with aggregated classifications