# DMACP data processing

Load zipped data file and unpack:

In [4]:
import os.path

data_path = "../dmacp_data"

## Merge content files

In [5]:
import json
import re

content_list_filepath = data_path+'/article_list.json'

projects_data_filepath = data_path+'/article_list.json'

merged_content_data = {
    'contribution': {
        'filepath': data_path+'/contributions.json',
        'data': {}
    },
    'field_note': {
        'filepath': data_path+'/field_notes.json',
        'data': {}
    },
    'project': {
        'filepath': data_path+'/projects.json',
        'data': {}
    }
}

# Helper functions

def get_text_from_html(raw_html):
    
    # from https://stackoverflow.com/a/12982689
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    
    cleantext = cleantext.replace('&nbsp;', ' ')
    cleantext = cleantext.replace('\n\n', ' ')
    cleantext = cleantext.replace('\r\n', ' ')
    cleantext = cleantext.replace('\n', ' ')    
    cleantext = cleantext.strip()
    
    return cleantext

def get_embed_content_entry(entry_data):
    if entry_data['title'] or entry_data['caption']:
        return {
            'type': entry_data['acf_fc_layout'],
            'title': entry_data['title'],
            'text': entry_data['caption'],
        }
    
    return {}

def get_text_content_entry(entry_data):
    text = get_text_from_html(entry_data['text'])
    
    if text:
        return {
            'type': entry_data['acf_fc_layout'],
            'text': text,
        }

    return {}

def get_image_content_entry(entry_data):    
    text_elements = []
    for image_data in entry_data['image']:
        if image_data['caption']:
            text_elements.append(image_data['caption'].strip())
        if image_data['description']:
            text_elements.append(image_data['description'].strip())
    
    text = ' '.join(text_elements)
    
    if text:
        return {
            'type': entry_data['acf_fc_layout'],
            'text': get_text_from_html(text)
        }
    
    return {}
    

def get_essentialize_contribution_data(raw_data):
    return get_essentialize_project_data(raw_data)

def get_essentialize_field_note_data(raw_data):
    data = {}
    
    data['link'] = 'https://notes.anthropocene-curriculum.org/id/%s' % raw_data['id']
    
    data['text'] = get_text_from_html(raw_data['acf']['text'])
    
    return data

def get_essentialize_project_data(raw_data):
    data = {}
    
    # title
    data['title'] = raw_data['title']['rendered']
    
    # subtitle
    data['subtitle'] = raw_data['acf']['subtitle']
    
    # abstract
    data['abstract'] = get_text_from_html(raw_data['acf']['abstract'])
    
    # link
    data['link'] = raw_data['link'].replace('content.', 'www.')
        
    # content
    data['content'] = []
    if raw_data['acf']['_content']['content']:
        for entry in raw_data['acf']['_content']['content']:
            
            ignored_types = ['iframe', 'featured', 'featured-children', 'reading-list', 'pdf']
        
            # Get content based on layout type
            content = {}
            if entry['acf_fc_layout'] == 'embed':
                content = get_embed_content_entry(entry)
            elif entry['acf_fc_layout'] == 'text':
                content = get_text_content_entry(entry)
            elif entry['acf_fc_layout'] == 'image':
                content = get_image_content_entry(entry)
                
            elif entry['acf_fc_layout'] in ignored_types:
                # doesn't hold any content. skipped for now
                continue

            else:
                print('Warning: Unhandled content type %s.' % entry['acf_fc_layout'])
            
            # Add content if not empty
            if content:
                data['content'].append(content)

    return data

# Merge content files

with open(content_list_filepath) as content_list_file:
    data = json.load(content_list_file)
    
    for entry in data:
        filepath = 'dmacp_data/%s/%s.json' % (entry['type'], entry['slug'])
        
        with open(filepath) as json_file:
            json_data = json.load(json_file)
        
            essentialized_data = {}
            if entry['type'] == 'contribution':
                essentialized_data = get_essentialize_contribution_data(json_data[0])
            elif entry['type'] == 'field_note':
                essentialized_data = get_essentialize_field_note_data(json_data[0])
            elif entry['type'] == 'project':
                essentialized_data = get_essentialize_project_data(json_data[0])
        
            merged_content_data[entry['type']]['data'][entry['slug']] = essentialized_data

for category in merged_content_data:
    filepath = merged_content_data[category]['filepath']
    data = merged_content_data[category]['data']
    
    with open(filepath, 'w') as outfile:
        json.dump(data, outfile, sort_keys=True, indent=4)

num_contributions = len(merged_content_data['contribution']['data'])
num_field_notes = len(merged_content_data['field_note']['data'])
num_projects = len(merged_content_data['project']['data'])
print('Merged %d contributions, %d field_notes, and %d projects.' % (num_contributions, num_field_notes, num_projects))

Merged 264 contributions, 288 field_notes, and 103 projects.


## Run text analysis

In [6]:
# pip install spacy
# python -m spacy download en_core_web_sm

import json
import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

# compressed content files
contributions_file = data_path+'/contributions.json'
field_notes_file = data_path+'/field_notes.json'
projects_file = data_path+'/projects.json'

# Some helper functions

def get_merged_contribution_text(contribution_data):
    texts = [contribution_data['abstract']]
    for entry in contribution_data['content']:
        texts.append(entry['text'])
    return ' '.join(texts)

def get_merged_field_note_text(field_note_data):
    return field_note_data['text']

def get_merged_project_text(project_data):
    texts = [project_data['abstract']]
    for entry in project_data['content']:
        texts.append(entry['text'])
    return ' '.join(texts)

# --- Your code here: ---


# Some field note example code …     
with open(field_notes_file) as f:
    field_notes = json.load(f)
    
    field_note = field_notes['1566946414604-johnwkim']
    
    print(field_note['link'])

    text = get_merged_field_note_text(field_note)
    doc = nlp(text)
    
    # Find named entities, phrases and concepts
    for entity in doc.ents:
        print(entity.text, entity.label_)
        

# Some project example code …
with open(projects_file) as f:
    projects = json.load(f)
    
    project = projects['essay-series']
    
    print(project['title'])
    print(project['link'])

    text = get_merged_project_text(project)
    doc = nlp(text)
    
    # Find named entities, phrases and concepts
    for entity in doc.ents:
        print(entity.text, entity.label_)
        

# Some contribution example code …

with open(contributions_file) as f:
    contributions = json.load(f)
    
    contribution = contributions['navigating-the-anthropocene-river']
    
    print(contribution['title'])
    print(contribution['link'])

    text = get_merged_contribution_text(contribution)
    doc = nlp(text)
    
    # Find named entities, phrases and concepts
    for entity in doc.ents:
        print(entity.text, entity.label_)

https://notes.anthropocene-curriculum.org/id/5650
Frank Bibeau PERSON
Ojibwe PERSON
Manoomin PERSON
2018 DATE
White Earth LOC
Ojibwe GPE
Anishinaabe LOC
White Earth LOC
1855 DATE
Treaty Authority ORG
Manoomin PERSON
Chippewa ORG
After the Industrial River: Essay Collection
https://www.anthropocene-curriculum.org/project/mississippi/field-stations/field-station-1-sediment-settlement-sentiment/essay-series/
the Mississippi River LOC
three CARDINAL
Field Station 1’s ORG
Morgan Adamson ORG
Bruce Braun PERSON
Roopali Phadke PERSON
St. Anthony Falls PERSON
1863 DATE
Benjamin Franklin Upton PERSON
the Twin Cities ORG
the Upper Mississippi ORG
the twenty-first century DATE
the Twin Cities ORG
the Upper St. Anthony Falls Lock ORG
2015 DATE
the National Park Service ORG
NPS ORG
reposition the Lock WORK_OF_ART
the Central Riverfront LOC
Friends of the Lock &amp ORG
Dam ORG
The Falls Initiative WORK_OF_ART
Falls PERSON
October 22, 2019 DATE
the US Army Corps of Engineers ORG
the Upper St. Anthony 