## Taxonomies distribution across data entries

Entries are duplicated because they are split across the different taxonomies present in the collection. This is a preparatory analysis in case we want to keep working on time curves. Time curves will traverse one taxonomy by visualizing dataset entries according to the temporal entities they include.

In [1]:
import json
import re

# Index file read by the merge step: a JSON list whose entries provide the
# 'type' and 'slug' used to locate each per-article JSON file.
content_list_filepath = '/Users/francescamorini/projects/dmacp/dmacp_data/article_list.json'

# NOTE(review): same path as content_list_filepath and not referenced anywhere
# in the visible notebook -- confirm whether a different file was intended.
projects_data_filepath = '/Users/francescamorini/projects/dmacp/dmacp_data/article_list.json'

# One bucket per content type: 'filepath' is where the merged JSON is written,
# 'data' is filled by the merge step with slug -> essentialized record.
merged_content_data = {
    'contribution': {
        'filepath': '/Users/francescamorini/projects/dmacp/dmacp_data/contributions.json',
        'data': {}
    },
    'field_note': {
        'filepath': '/Users/francescamorini/projects/dmacp/dmacp_data/field_notes.json',
        'data': {}
    },
    'project': {
        'filepath': '/Users/francescamorini/projects/dmacp/dmacp_data/projects.json',
        'data': {}
    }
}

# Helper functions

def get_text_from_html(raw_html):
    """Strip HTML tags from *raw_html* and flatten it to one line of text.

    Tags are removed with a non-greedy ``<...>`` pattern, ``&nbsp;`` entities
    and line breaks are turned into spaces, and surrounding whitespace is
    trimmed.

    Args:
        raw_html: HTML fragment as a string. Falsy values (``None``, ``''``)
            are treated as "no text".

    Returns:
        The cleaned text, or ``''`` when there is nothing to clean.
    """
    # Empty fields have no text; return early instead of failing in re.sub
    if not raw_html:
        return ''

    # from https://stackoverflow.com/a/12982689
    # re.DOTALL lets '.' match newlines, so a tag whose attributes are wrapped
    # over several lines is removed as well instead of leaking into the text.
    cleanr = re.compile(r'<.*?>', re.DOTALL)
    cleantext = cleanr.sub('', raw_html)

    # Order matters: blank lines collapse to a single space before the
    # remaining individual line breaks are replaced.
    cleantext = cleantext.replace('&nbsp;', ' ')
    cleantext = cleantext.replace('\n\n', ' ')
    cleantext = cleantext.replace('\r\n', ' ')
    cleantext = cleantext.replace('\n', ' ')

    return cleantext.strip()

def get_embed_content_entry(entry_data):
    """Summarise an embed layout entry as a type/title/text dict.

    The caption is stored under 'text'. An empty dict is returned when both
    the title and the caption are falsy.
    """
    has_title_or_caption = entry_data['title'] or entry_data['caption']
    if not has_title_or_caption:
        return {}

    summary = {}
    summary['type'] = entry_data['acf_fc_layout']
    summary['title'] = entry_data['title']
    summary['text'] = entry_data['caption']
    return summary

def get_text_content_entry(entry_data):
    """Convert a text layout entry into a type/text dict with HTML removed.

    Returns an empty dict when nothing is left after cleaning the HTML.
    """
    plain_text = get_text_from_html(entry_data['text'])

    # Nothing readable in this entry
    if not plain_text:
        return {}

    return {'type': entry_data['acf_fc_layout'], 'text': plain_text}

def get_image_content_entry(entry_data):
    """Collect the captions and descriptions of an image layout entry.

    For every image the stripped caption and then the stripped description
    are gathered (falsy values are skipped) and joined with single spaces.
    Returns a type/text dict with HTML removed, or an empty dict when the
    joined text is empty.
    """
    fragments = []
    for image in entry_data['image']:
        for field in ('caption', 'description'):
            value = image[field]
            if value:
                fragments.append(value.strip())

    joined = ' '.join(fragments)
    if not joined:
        return {}

    return {
        'type': entry_data['acf_fc_layout'],
        'text': get_text_from_html(joined)
    }
    

def get_essentialize_contribution_data(raw_data):
    """Essentialize a raw contribution record.

    Contributions are handled exactly like projects: the record is passed to
    get_essentialize_project_data and its result is returned unchanged.
    """
    essentialized = get_essentialize_project_data(raw_data)
    return essentialized

def get_essentialize_field_note_data(raw_data):
    """Reduce a raw field note record to the fields used by the analysis.

    Returns a dict with the note id, a link built from that id, the note text
    with HTML removed, the GMT date and the three taxonomy term lists.
    """
    note_id = raw_data['id']

    return {
        'id': note_id,
        'link': 'https://notes.anthropocene-curriculum.org/id/%s' % note_id,
        'text': get_text_from_html(raw_data['acf']['text']),
        # Meta datetime to have some kind of time for each case
        'meta_datetime': raw_data['date_gmt'],
        # Keywords, folksonomies and methods, kept to later build
        # correlations with time
        'keywords': raw_data['keyword'],
        'folksonomy': raw_data['folksonomy'],
        'methods': raw_data['method'],
    }

def get_essentialize_project_data(raw_data):
    """Reduce a raw project record to the fields used by the analysis.

    Returns a dict with id, title, subtitle, abstract (HTML removed), link,
    GMT date, the three taxonomy term lists and 'content': a list of the
    non-empty embed/text/image entries found in the record's content blocks.
    """
    essentials = {
        'id': raw_data['id'],
        'title': raw_data['title']['rendered'],
        'subtitle': raw_data['acf']['subtitle'],
        'abstract': get_text_from_html(raw_data['acf']['abstract']),
        # Point the link at the "www." host instead of the "content." one
        'link': raw_data['link'].replace('content.', 'www.'),
        # Meta datetime to have some kind of time for each case
        'meta_datetime': raw_data['date_gmt'],
        # Taxonomies
        'keywords': raw_data['keyword'],
        'folksonomy': raw_data['folksonomy'],
        'methods': raw_data['method'],
        'content': [],
    }

    # Layout types that don't hold any content; skipped for now
    skipped_layouts = ('iframe', 'featured', 'featured-children', 'reading-list', 'pdf')

    # A falsy content value means there are no blocks to walk
    for block in raw_data['acf']['_content']['content'] or []:
        layout = block['acf_fc_layout']

        if layout in skipped_layouts:
            continue

        # Get content based on layout type
        if layout == 'embed':
            extracted = get_embed_content_entry(block)
        elif layout == 'text':
            extracted = get_text_content_entry(block)
        elif layout == 'image':
            extracted = get_image_content_entry(block)
        else:
            print('Warning: Unhandled content type %s.' % layout)
            extracted = {}

        # Add content if not empty
        if extracted:
            essentials['content'].append(extracted)

    return essentials

# Merge content files

# Function that essentializes a raw record, per content type
essentializers = {
    'contribution': get_essentialize_contribution_data,
    'field_note': get_essentialize_field_note_data,
    'project': get_essentialize_project_data,
}

with open(content_list_filepath) as content_list_file:
    content_list = json.load(content_list_file)

for item in content_list:
    item_type, item_slug = item['type'], item['slug']
    source_path = '/Users/francescamorini/projects/dmacp/dmacp_data/%s/%s.json' % (item_type, item_slug)

    with open(source_path) as json_file:
        raw_records = json.load(json_file)

    # Only the first record of each file is used; unknown types store {}
    essentialize = essentializers.get(item_type)
    essentialized_data = essentialize(raw_records[0]) if essentialize else {}

    merged_content_data[item_type]['data'][item_slug] = essentialized_data

# Write one merged JSON file per content type
for category, bucket in merged_content_data.items():
    with open(bucket['filepath'], 'w') as outfile:
        json.dump(bucket['data'], outfile, sort_keys=True, indent=4)

num_contributions, num_field_notes, num_projects = (
    len(merged_content_data[name]['data'])
    for name in ('contribution', 'field_note', 'project')
)
print('Merged %d contributions, %d field_notes, and %d projects.' % (num_contributions, num_field_notes, num_projects))

Merged 261 contributions, 288 field_notes, and 104 projects.


In [2]:
import json
import spacy
import pandas as pd

# Load English tokenizer, tagger, parser, NER and word vectors
# (the entity labels produced by this pipeline are what the temporal analysis filters on)
nlp = spacy.load("en_core_web_sm")

# compressed content files
# These three paths are the same files the merge cell writes
# (see the 'filepath' values in merged_content_data).
contributions_file = '/Users/francescamorini/projects/dmacp/dmacp_data/contributions.json'
field_notes_file = '/Users/francescamorini/projects/dmacp/dmacp_data/field_notes.json'
projects_file = '/Users/francescamorini/projects/dmacp/dmacp_data/projects.json'
# JSON indexed by taxonomy label; each label holds terms with 'term_id' and 'name'
taxonomy_file = '/Users/francescamorini/projects/dmacp/dmacp_data/taxonomy_list.json'

# Some helper functions

def get_merged_contribution_text(contribution_data):
    """Join the abstract and the text of every content entry with single spaces."""
    pieces = [contribution_data['abstract']]
    pieces.extend(entry['text'] for entry in contribution_data['content'])
    return ' '.join(pieces)

def get_merged_field_note_text(field_note_data):
    """Return the 'text' value of a field note record unchanged."""
    note_text = field_note_data['text']
    return note_text

def get_merged_project_text(project_data):
    """Return the project's abstract followed by all content texts, space-separated."""
    abstract = project_data['abstract']
    body_texts = [entry['text'] for entry in project_data['content']]
    return ' '.join([abstract] + body_texts)

In [3]:
import os.path
# Taxonomy labels, in the order they appear in the taxonomy file
taxonomyLabels = []
with open(taxonomy_file) as taxonomy:
    taxonomy_list = json.load(taxonomy)
    # Iterating the parsed JSON yields its top-level entries
    taxonomyLabels.extend(taxonomy_list)

print(taxonomyLabels)

['methods', 'keywords', 'folksonomy']


In [4]:
#Helper function to check taxonomy content for each entry

# Parsed taxonomy files, by path. labelMethod is called once per taxonomy term
# of every entry, so the file is read and parsed once instead of on every call.
_taxonomy_list_cache = {}


def _load_taxonomy_list(path):
    """Return the parsed taxonomy JSON stored at *path*, reading it only once."""
    if path not in _taxonomy_list_cache:
        with open(path) as taxonomy_handle:
            _taxonomy_list_cache[path] = json.load(taxonomy_handle)
    return _taxonomy_list_cache[path]


def labelMethod(entryValue, currentTaxonomy):
    """Return the name of the term whose 'term_id' equals *entryValue*.

    Args:
        entryValue: term id as stored on an entry.
        currentTaxonomy: taxonomy label used to select the term list in the
            taxonomy file (the global ``taxonomy_file``).

    Returns:
        The matching term's 'name', or None when no term has that id.

    Raises:
        KeyError: if *currentTaxonomy* is not a label of the taxonomy file.
    """
    for term in _load_taxonomy_list(taxonomy_file)[currentTaxonomy]:
        if term['term_id'] == entryValue:
            return term['name']

    return None

In [41]:
def runTextAnalysis(currentLabel, single_entry):
    """Return the text of the first temporal entity found in the entry's content.

    Nothing is analysed (None is returned) unless *currentLabel* is a key of
    *single_entry*. Otherwise the entry is printed and each non-empty content
    text is run through the global ``nlp`` pipeline; the first entity whose
    label is in the global ``temporalLabels`` is returned. None is returned
    when no such entity exists.

    NOTE(review): currentLabel is only used for the membership check -- the
    scan always reads single_entry['content'], and only one entity is
    returned. Confirm this is intended before re-enabling callers.
    """
    if currentLabel not in single_entry:
        return None

    print(single_entry)
    for element in single_entry['content']:
        text = element['text']
        if not text:
            continue
        for entity in nlp(text).ents:
            if entity.label_ in temporalLabels:
                return entity.text

    return None

In [43]:
allFiles = [contributions_file, field_notes_file, projects_file]
# Entry type label for each file, in the same order as allFiles
typeLabel = ['contribution', 'fieldnote', 'project']
# spaCy entity labels treated as temporal
temporalLabels = ['DATE', 'TIME']

timedMethods = []
timedKeywords = []
timedFolksonomies = []
# NOTE: datasets[i] receives the rows of taxonomyLabels[i], so this order has
# to match the order of taxonomyLabels (methods, keywords, folksonomy).
datasets = [timedMethods, timedKeywords, timedFolksonomies]


def extractTemporalEntities(single_entry):
    """Return the text of every temporal entity in the entry's abstract and content.

    The abstract (when present and non-empty) is analysed first, followed by
    each non-empty content text, so the entities keep their reading order.
    """
    texts = []
    if 'abstract' in single_entry and single_entry['abstract']:
        texts.append(single_entry['abstract'])
    if 'content' in single_entry:
        for element in single_entry['content']:
            if element['text']:
                texts.append(element['text'])

    found = []
    for text in texts:
        for entity in nlp(text).ents:
            if entity.label_ in temporalLabels:
                found.append(entity.text)
    return found


# Entities per (file, entry key). The NER pass depends only on the entry's
# text, so it is run once per entry instead of once per taxonomy term in
# every taxonomy pass. Filled lazily: entries without terms are never analysed.
temporalEntitiesCache = {}

#Initial loops through types of taxonomy types
for index, taxonomy in enumerate(taxonomyLabels):

    print('started', taxonomy)
    #Opening files
    for fileIndex, file in enumerate(allFiles):
        with open(file) as f:
            entries = json.load(f)
        print('open', file)

        #Loop through entries
        for entry, single_entry in entries.items():
            entry_id = single_entry['id']
            metatime = single_entry['meta_datetime']
            if 'title' in single_entry:
                title = single_entry['title']
            else:
                title = typeLabel[fileIndex] + ' with no title'

            # One output row per taxonomy term attached to the entry
            for tax in single_entry[taxonomy]:
                taxonomy_id = tax
                taxonomy_name = labelMethod(tax, taxonomy)

                cacheKey = (file, entry)
                if cacheKey not in temporalEntitiesCache:
                    temporalEntitiesCache[cacheKey] = extractTemporalEntities(single_entry)
                # Copy so every row owns its list, as before
                temporalEntities = list(temporalEntitiesCache[cacheKey])

                datasets[index].append(
                    [
                        title,
                        entry_id,
                        metatime,
                        taxonomy_name,
                        typeLabel[fileIndex],
                        taxonomy,
                        temporalEntities,
                        len(temporalEntities)
                    ])

# Runs once, after the last taxonomy pass
print('done', taxonomy)

started methods
open /Users/francescamorini/projects/dmacp/dmacp_data/contributions.json
open /Users/francescamorini/projects/dmacp/dmacp_data/field_notes.json
open /Users/francescamorini/projects/dmacp/dmacp_data/projects.json
started keywords
open /Users/francescamorini/projects/dmacp/dmacp_data/contributions.json
open /Users/francescamorini/projects/dmacp/dmacp_data/field_notes.json
open /Users/francescamorini/projects/dmacp/dmacp_data/projects.json
started folksonomy
open /Users/francescamorini/projects/dmacp/dmacp_data/contributions.json
open /Users/francescamorini/projects/dmacp/dmacp_data/field_notes.json
open /Users/francescamorini/projects/dmacp/dmacp_data/projects.json
done folksonomy


In [44]:
# Column layout shared by the three tables; it mirrors the order of the
# values in each row appended to datasets.
resultColumns = [
    'title',
    'id',
    'time',
    'taxonomy_name',
    'type_of_entry',
    'type_of_taxonomy',
    'temporalEntities',
    'entitiesNumber',
]

methods = pd.DataFrame(datasets[0], columns=resultColumns)
keywords = pd.DataFrame(datasets[1], columns=resultColumns)
folksonomies = pd.DataFrame(datasets[2], columns=resultColumns)

In [45]:
# Number of distinct entry ids that appear in the keywords table
# (an entry only gets rows there if it has at least one keyword term)
keywords['id'].nunique()

510

In [46]:
# Inspect the methods table ordered by title (one row per entry and method term)
methods.sort_values('title')

Unnamed: 0,title,id,time,taxonomy_name,type_of_entry,type_of_taxonomy,temporalEntities,entitiesNumber
75,#BlackBoyJoy On the River with Eugene B. Redmond,20440,2020-02-24T13:46:45,Conversation,contribution,methods,[],0
3,30 Days On/30 Days Off with Paul Perkinson,20478,2020-02-24T14:23:37,Conversation,contribution,methods,[his twenties],1
4,A Brief History of Geoengineering,22468,2014-11-23T15:13:02,Film,contribution,methods,[],0
5,A Caribbean Taste of Technology: Creolization ...,28250,2017-04-15T14:21:22,Case Study,contribution,methods,[],0
6,A Caribbean Taste of Technology: Creolization ...,28250,2017-04-15T14:21:22,Reflection,contribution,methods,[],0
...,...,...,...,...,...,...,...,...
562,fieldnote with no title,15221,2019-11-10T04:44:05,Sensing,fieldnote,methods,[],0
254,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Conversation,contribution,methods,"[2017, October 2019, Today, today, Halloween, ...",32
255,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Engagement,contribution,methods,"[2017, October 2019, Today, today, Halloween, ...",32
256,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Field Work,contribution,methods,"[2017, October 2019, Today, today, Halloween, ...",32


In [47]:
# Inspect the keywords table ordered by title (one row per entry and keyword term)
keywords.sort_values('title')

Unnamed: 0,title,id,time,taxonomy_name,type_of_entry,type_of_taxonomy,temporalEntities,entitiesNumber
172,#BlackBoyJoy On the River with Eugene B. Redmond,20440,2020-02-24T13:46:45,Biodiversity,contribution,keywords,[],0
173,#BlackBoyJoy On the River with Eugene B. Redmond,20440,2020-02-24T13:46:45,History,contribution,keywords,[],0
174,#BlackBoyJoy On the River with Eugene B. Redmond,20440,2020-02-24T13:46:45,Pollution,contribution,keywords,[],0
175,#BlackBoyJoy On the River with Eugene B. Redmond,20440,2020-02-24T13:46:45,Water,contribution,keywords,[],0
17,30 Days On/30 Days Off with Paul Perkinson,20478,2020-02-24T14:23:37,Transportation,contribution,keywords,[his twenties],1
...,...,...,...,...,...,...,...,...
554,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,History,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
555,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Inequality,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
556,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Race,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
558,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Violence,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32


In [11]:
# Write one CSV per taxonomy table, without the index column
csvTargets = [
    (methods, 'data/methods_and_entities.csv'),
    (keywords, 'data/keywords_and_entities.csv'),
    (folksonomies, 'data/folks_and_entities.csv'),
]
for frame, csvPath in csvTargets:
    frame.to_csv(csvPath, index=False, sep=",")

In [48]:
#creating a unique dataset
# Stacks the three tables; each keeps its own row labels, so index values
# repeat across taxonomies in the combined frame.
allTaxonomies = pd.concat([methods, keywords,folksonomies])

In [13]:
#allTaxonomies.to_csv('data/alltaxonomies_and_entities.csv', index=False, sep=",")

## Planned exercises:
- Design one glyph by isolating and plotting one unique entry
- Finish metaviews on datasets
- Another notebook with entries not repeating based on the taxonomy?

## Design Exercise 1: Extracting one unique entry

In [53]:
# Rank all rows by number of temporal entities, highest first
allTaxonomies.sort_values('entitiesNumber', ascending=False)

Unnamed: 0,title,id,time,taxonomy_name,type_of_entry,type_of_taxonomy,temporalEntities,entitiesNumber
189,Check My Pulse,27404,2020-08-28T13:30:29,History,contribution,keywords,"[1989, 1959, 1959, 4, 2016, 2015, present-day,...",141
78,Check My Pulse,27404,2020-08-28T13:30:29,Field Work,contribution,methods,"[1989, 1959, 1959, 4, 2016, 2015, present-day,...",141
79,Check My Pulse,27404,2020-08-28T13:30:29,Reflection,contribution,methods,"[1989, 1959, 1959, 4, 2016, 2015, present-day,...",141
191,Check My Pulse,27404,2020-08-28T13:30:29,Water,contribution,keywords,"[1989, 1959, 1959, 4, 2016, 2015, present-day,...",141
190,Check My Pulse,27404,2020-08-28T13:30:29,Settler Colonialism,contribution,keywords,"[1989, 1959, 1959, 4, 2016, 2015, present-day,...",141
...,...,...,...,...,...,...,...,...
868,fieldnote with no title,13368,2019-09-23T14:09:06,Commodities,fieldnote,keywords,[],0
867,fieldnote with no title,13364,2019-09-23T20:47:26,Engineering,fieldnote,keywords,[],0
866,fieldnote with no title,13364,2019-09-23T20:47:26,Affect,fieldnote,keywords,[],0
865,fieldnote with no title,13180,2019-09-26T18:36:00,Spatial,fieldnote,keywords,[],0


In [54]:
# Mean entity count per row. Entries are repeated once per taxonomy term,
# so entries with many terms weigh more in this average.
allTaxonomies['entitiesNumber'].mean()

5.3495055624227446

In [73]:
#Isolating all rows of entry 27568 (32 temporal entities per row)
# NOTE(review): this was described as the entry with the highest number of
# entities, but the sorted table shows entry 27404 with 141 -- confirm which
# entry was meant.
entry27568 = allTaxonomies.loc[allTaxonomies['id'] == 27568]
# Keep a single row (the second one) to read the entry's entity list from
only1row = entry27568.iloc[1]

In [74]:
# One row per extracted entity string, in a single 'raw_entity' column
onlyEntities = pd.DataFrame(only1row['temporalEntities'], columns=['raw_entity'])

In [69]:
# Export every taxonomy row of entry 27568.
# This cell used to reference `entry27404`, which is not defined anywhere in
# the visible notebook and fails with a NameError on a top-to-bottom run.
# NOTE(review): 27568 matches both the defined frame and the file name --
# confirm that entry 27404 was not the intended export.
entry27568.to_csv('exploded_entry27568.csv', index=False)

In [75]:
# Save the raw entity strings of the isolated entry (the index column is kept)
onlyEntities.to_csv('data/PlantingaSeed_raw.csv')

In [76]:
# Display all rows of the isolated entry, one per taxonomy term
entry27568

Unnamed: 0,title,id,time,taxonomy_name,type_of_entry,type_of_taxonomy,temporalEntities,entitiesNumber
254,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Conversation,contribution,methods,"[2017, October 2019, Today, today, Halloween, ...",32
255,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Engagement,contribution,methods,"[2017, October 2019, Today, today, Halloween, ...",32
256,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Field Work,contribution,methods,"[2017, October 2019, Today, today, Halloween, ...",32
257,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Teaching,contribution,methods,"[2017, October 2019, Today, today, Halloween, ...",32
551,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Engagement,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
552,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Environmental Justice,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
553,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Extraction,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
554,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,History,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
555,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Inequality,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
556,“Planting a Seed is a Revolutionary Act&#8221;,27568,2020-07-31T09:43:51,Race,contribution,keywords,"[2017, October 2019, Today, today, Halloween, ...",32
