# Process Zooniverse classifications of frames
Script to retrieve frame classification data from Zooniverse and update the Koster lab database.

## Download classifications from Zooniverse
To download the most up-to-date classifications provided by Zooniverse users we need to request the classifications from the Koster lab project (#9747) using the [Python SDK for Panoptes!](https://github.com/zooniverse/panoptes-python-client).

In [1]:
# Import required packages
import os
import csv
import requests
from datetime import datetime
from panoptes_client import Project, Panoptes

In [4]:
# Connect to Zooniverse with your username and password
Panoptes.connect(username='', password='')

# Specify the project number of the koster lab
project = Project(9747)

# Specify the workflow of interest
workflow_2 = 12852

# Specifiy the location to write the csv files
dstn_class = '../all_classifications.csv'
out_location_class = '../workflow2_classifications.csv'


In [5]:
def download_file(url, dstn):
    request = requests.get(url, stream=True)
    with open(dstn, 'wb') as dstn_f:
        for chunk in request.iter_content(chunk_size=4096):
            dstn_f.write(chunk)
    return dstn


def download_exports(projt, dstn_cl):
    # replace path and filename strings for where you want the exports saved in the next two lines:
    try:
        meta_class = projt.describe_export('classifications')
        generated = meta_class['media'][0]['updated_at'][0:19]
        tdelta = (datetime.now() - datetime.strptime(generated, '%Y-%m-%dT%H:%M:%S')).total_seconds()
        age = (300 + int(tdelta / 60))
        print(str(datetime.now())[0:19] + '  Classifications export', age, ' hours old')
        url_class = meta_class['media'][0]['src']
        file_class = download_file(url_class, dstn_cl)
        print(str(datetime.now())[0:19] + '  ' + file_class + ' downloaded')
    except:
        print(str(datetime.now())[0:19] + '  Classifications download did not complete')
        return False

    try:
        meta_subj = projt.describe_export('subjects')
        generated = meta_subj['media'][0]['updated_at'][0:19]
        tdelta = (datetime.now() - datetime.strptime(generated, '%Y-%m-%dT%H:%M:%S')).total_seconds()
        age = (300 + int(tdelta / 60))
        print(str(datetime.now())[0:19] + '  Subject export', age, ' hours old')
    except:
        print(str(datetime.now())[0:19] + '  Subjects download did not complete')
        return False
    return True


def include_class(class_record):
    #  define a function that returns True or False based on whether the argument record is to be included or not in
    #  the output file based on the conditional clauses.
    #  many other conditions could be set up to determine if a record is to be processed and the flattened data
    #  written to the output file. Any or all of these conditional tests that are not needed can be deleted or
    # commented out with '#' placed in front of the line(s)of code that are not required.

    if int(class_record['workflow_id']) == workflow_2:
        pass  # replace'!= 0000' with '== xxxx' where xxxx is the workflow to include.  This is also useful to
        # exclude a specific workflow as well.
    else:
        return False
    if float(class_record['workflow_version']) >= 001.01:
        pass  # replace '001.01' with first version of the workflow to include.
    else:
        return False
    if 100000000 >= int(class_record['subject_ids']) >= 0000:
        pass  # replace upper and lower subject_ids to include only a specified range of subjects - This is
        # a very useful slice since subjects are selected together and can still be aggregated.
    else:
        return False
    if not class_record['gold_standard'] and not class_record['expert']:
        pass  # this excludes gold standard and expert classifications - remove the "not" to select only
        # the gold standard or expert classifications
    else:
        return False
    if '2100-00-10 00:00:00 UTC' >= class_record['created_at'] >= '2000-00-10 00:00:00 UTC':
        pass  # replace earliest and latest created_at date and times to select records commenced in a
        #  specific time period
    else:
        return False
    # otherwise :
    return True


def slice_exports(dstn_cl, out_location_cl):
    with open(out_location_cl, 'w', newline='') as file:
        fieldnames = ['classification_id',
                      'user_name', 'user_id',
                      'user_ip', 'workflow_id',
                      'workflow_name',
                      'workflow_version',
                      'created_at',
                      'gold_standard',
                      'expert',
                      'metadata',
                      'annotations',
                      'subject_data',
                      'subject_ids']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # this area for initializing counters, status lists and loading pick lists into memory:
        i = 0
        j = 0

        #  open the zooniverse data file using dictreader
        with open(dstn_cl) as f:
            r = csv.DictReader(f)
            for row in r:
                i += 1
                if include_class(row):
                    j += 1
                    # This set up the writer to match the field names above and the variable names of their values:
                    writer.writerow({'classification_id': row['classification_id'],
                                     'user_name': row['user_name'],
                                     'user_id': row['user_id'],
                                     'user_ip': row['user_ip'],
                                     'workflow_id': row['workflow_id'],
                                     'workflow_name': row['workflow_name'],
                                     'workflow_version': row['workflow_version'],
                                     'created_at': row['created_at'],
                                     'gold_standard': row['gold_standard'],
                                     'expert': row['expert'],
                                     'metadata': row['metadata'],
                                     'annotations': row['annotations'],
                                     'subject_data': row['subject_data'],
                                     'subject_ids': row['subject_ids']})

    # This area prints some basic process info and status
    print(str(datetime.now())[0:19] + '  Classification file:' +
          ' ' + str(i) + ' lines read and inspected' + ' ' + str(j) + ' records selected and copied')

    k = 0
    m = 0
    return True


if __name__ == '__main__':
    print(download_exports(project, dstn_class))
    print(slice_exports(dstn_class, out_location_class))

2020-02-28 17:02:58  Classifications export 1168  hours old
2020-02-28 17:02:59  ../all_classifications.csv downloaded
2020-02-28 17:02:59  Subject export 37482  hours old
True
2020-02-28 17:02:59  Classification file: 1675 lines read and inspected 13 records selected and copied
True


In [None]:
## Aggregrate the classifications and update the koster sql database

