# DMACP data processing

## Scraping

Download the content files from the Anthropocene website:

In [1]:
from lib.anthropocene.Scraper import Scraper

# Build the project scraper, passing '../dmacp_data'
# (presumably the directory the downloaded files are stored in — the
# preprocessing cells read from this same path; confirm against Scraper).
anthropocene_scraper = Scraper('../dmacp_data')
# Run the download; what exactly is fetched is defined in lib.anthropocene.Scraper.
anthropocene_scraper.scrape_content()

## Preprocessing

Reduce content files and merge them into one json file for each type (contribution, project, field note):

In [7]:
import json
import os
from lib.anthropocene.Preprocessor import Preprocessor

# ---

# Directory that is handed to the preprocessor as its input.
source_dir = '../dmacp_data'
# Directory that receives one merged JSON file per article type.
output_dir = './data/preprocess/reduced'

# Output filename for each article-type key of the merged contents.
merged_data_filenames = {
    'contribution': 'contributions.json',
    'field_note': 'field_notes.json',
    'project': 'projects.json',
}

# ---

# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safely re-runnable.
os.makedirs(output_dir, exist_ok=True)

# ---

anthropocene_preprocessor = Preprocessor(source_dir)
merged_contents = anthropocene_preprocessor.get_merged_article_contents_by_type()

# The loop variable must not be called `type`: at notebook scope that would
# rebind the builtin `type` for every cell executed afterwards.
for (article_type, data) in merged_contents.items():
    # A type without an entry in merged_data_filenames raises KeyError here.
    filepath = '%s/%s' % (output_dir, merged_data_filenames[article_type])

    with open(filepath, 'w') as outfile:
        json.dump(data, outfile, sort_keys=True, indent=4)

Create json file containing all contents:

In [13]:
import json
import os
from pathlib import Path

# ---

# Directory that is handed to the preprocessor as its input.
source_dir = '../dmacp_data'
# Single JSON file that receives the fully merged article contents.
output_file = './data/preprocess/merged/fully_merged_contents.json'

# ---

# Make sure the parent directory of the output file exists.
# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safely re-runnable.
output_dir = Path(output_file).parent
os.makedirs(output_dir, exist_ok=True)

# ---

# NOTE: `Preprocessor` is not imported in this cell; it relies on the import
# executed by the preceding preprocessing cell.
anthropocene_preprocessor = Preprocessor(source_dir)
fully_merged_content = anthropocene_preprocessor.get_fully_merged_article_contents()

with open(output_file, 'w') as outfile:
    json.dump(fully_merged_content, outfile, sort_keys=True, indent=4)

## Analysis

### Article Similarities

Compare each article with all articles that follow it.
(Because the similarity measure is symmetric, the reversed comparisons can be skipped to save time.)

In [19]:
import json
import os
import shutil
from lib.anthropocene.Analyzer import Analyzer

# ---

# Fully merged article contents; loaded as a mapping keyed by article id.
source_filepath = './data/preprocess/merged/fully_merged_contents.json'
# Receives one (deliberately incomplete) similarity file per article.
output_dir = './data/analysis/article_similarities/raw'

# ---

# Start from an empty output directory so results of an earlier run are
# never mixed with the new ones.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

# ---

# Load the contents up front so the source file is closed again before the
# long-running comparison loop starts.
with open(source_filepath) as json_file:
    contents = json.load(json_file)

anthropocene_analyzer = Analyzer()

# Fixing the id order once lets each iteration select "this article and all
# following ones" with a slice instead of filtering the whole mapping against
# a growing list of already processed ids.
article_ids = list(contents)
for position, article_id in enumerate(article_ids):
    # The current article itself is part of the selection; earlier articles
    # are left out because their files already cover those pairs.
    unprocessed_contents = {key: contents[key] for key in article_ids[position:]}

    similarities = anthropocene_analyzer.get_article_similarities(article_id, unprocessed_contents)
    # Highest similarity first; sort_keys=False below preserves this order.
    sorted_similarities = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True))

    filepath = '%s/%s.json' % (output_dir, article_id)
    with open(filepath, 'w') as outfile:
        json.dump(sorted_similarities, outfile, sort_keys=False, indent=4)

12646 <-> 12724
12646 <-> 12854
12646 <-> 12856
12646 <-> 13164
12646 <-> 13167
12646 <-> 13170
12646 <-> 13177
12646 <-> 13180
12646 <-> 13183
12646 <-> 13364
12646 <-> 13368
12646 <-> 13371
12646 <-> 14258
12646 <-> 1566669833833-underhil
12646 <-> 1566670143264-underhil
12646 <-> 1566855154819-underhil
12646 <-> 1566877164374-johnwkim
12646 <-> 1566877523518-johnwkim
12646 <-> 1566877596229-johnwkim
12646 <-> 1566920087935-underhil
12646 <-> 1566922144169-johnwkim
12646 <-> 1566946414604-johnwkim
12646 <-> 1567000427783-underhil
12646 <-> 1567005100424-temporarycontinent
12646 <-> 1567008776790-johnwkim
12646 <-> 1567009364750-temporarycontinent
12646 <-> 1567009466132-temporarycontinent
12646 <-> 1567010303341-temporarycontinent
12646 <-> 1567046668967-temporarycontinent
12646 <-> 1567161018172-underhil
12646 <-> 1567161360053-underhil
12646 <-> 1567165201716-underhil
12646 <-> 1567172683304-temporarycontinent
12646 <-> 1567173318487-temporarycontinent
12646 <-> 1567512349489-johnw

12646 <-> 1575639851751-bsteininger
12646 <-> 1575640348687-bsteininger
12646 <-> 1575926121165-audrey
12646 <-> 1575927775252-audrey
12646 <-> 1575992123243-johnwkim
12646 <-> 1575992254429-johnwkim
12646 <-> 1579261513388-simon-turner
12646 <-> 1579852797747-alexanderwschindler
12646 <-> 1590919178472-tturnbull
12646 <-> 16879
12646 <-> 16883
12646 <-> 16887
12646 <-> 17011
12646 <-> 17014
12646 <-> 17017
12646 <-> 17346
12646 <-> 18114
12646 <-> 18117
12646 <-> 18121
12646 <-> 18124
12646 <-> 18127
12646 <-> 18130
12646 <-> 18339
12646 <-> 21810
12646 <-> 22785-2
12646 <-> 30-days-on-30-days-off-paul-perko
12646 <-> a-brief-history-of-geoengineering
12646 <-> a-caribbean-taste-of-technology-creolization-and-the-ways-of-making-of-the-dancehall-sound-system
12646 <-> a-curriculum-for-the-anthropocene
12646 <-> a-legacy-of-the-technosphere
12646 <-> a-n-t-h-r-o-p-o-z-i-n-e-0
12646 <-> a-recipe-for-crafting-color-the-revival-of-natural-dyeing-in-south-india
12646 <-> a-river-indicts
126

12646 <-> meeker-dam
12646 <-> memories-of-the-yagan-the-chilean-automobile-for-the-people
12646 <-> meskonsing-indian-lake-convergence-1
12646 <-> meskonsing-kansan
12646 <-> meskonsing-kansan-an-introduction
12646 <-> midway-meeting-st-louis
12646 <-> mikado-sol-2014
12646 <-> mississippi
12646 <-> mobile-hedges
12646 <-> monsanto-town
12646 <-> museum-library
12646 <-> natchez-etiologies-of-athropogenic-emergence
12646 <-> navigating-the-anthropocene-river
12646 <-> new-orleans-anthropocene-field-campus
12646 <-> nigeria-oil-pipes
12646 <-> nordic-fauna-seen-in-nature
12646 <-> ny-alesund-svalbard-norway
12646 <-> of-forests-of-rivers-and-of-meals
12646 <-> on-anthropotechnics-and-physical-practice
12646 <-> on-consuming-slow-media
12646 <-> on-land-and-lakes-colonizing-the-north
12646 <-> on-the-operation-of-border-regimes
12646 <-> on-the-recuperative-mismanagement-of-a-cosmopolitan-fish
12646 <-> on-the-use-of-the-word-code-by-the-kogi-translator
12646 <-> one-place-many-names
12

12724 <-> 1567161018172-underhil
12724 <-> 1567161360053-underhil
12724 <-> 1567165201716-underhil
12724 <-> 1567172683304-temporarycontinent
12724 <-> 1567173318487-temporarycontinent
12724 <-> 1567512349489-johnwkim
12724 <-> 1567773578662-johnwkim
12724 <-> 1567773810830-johnwkim
12724 <-> 1567813529190-neli-wagner
12724 <-> 1568628955437-temporarycontinent
12724 <-> 1568639196798-underhil
12724 <-> 1568639979586-johnwkim
12724 <-> 1568659030393-johnwkim
12724 <-> 1568737949503-temporarycontinent
12724 <-> 1568738597822-temporarycontinent
12724 <-> 1568772426391-underhil
12724 <-> 1568806082665-underhil
12724 <-> 1568815306498-johnwkim
12724 <-> 1568834765240-andrea
12724 <-> 1568838842480-andrea
12724 <-> 1568839037654-andrea
12724 <-> 1568841623817-andrea
12724 <-> 1568920800239-temporarycontinent
12724 <-> 1569007858411-temporarycontinent
12724 <-> 1569009208738-johnwkim
12724 <-> 1569087902404-johnwkim
12724 <-> 1569148411524-temporarycontinent
12724 <-> 1569172914075-johnwkim
1

12724 <-> a-river-semester
12724 <-> a-seed-a-sound
12724 <-> a-slobjects-exercise-whats-in-our-pockets
12724 <-> a-suspended-archive
12724 <-> a-trace-a-breath
12724 <-> ac-event-2016
12724 <-> acknowledging-indigenous-land-and-a-performance-of-idle-no-more-bonus-episode
12724 <-> acquiring-and-optimizing-sustainable-relationships-for-good-solid-cash-flow-streams-or-speaking-with-plants
12724 <-> adaptive-modeling
12724 <-> after-extraction-a-partical-political-ecology-of-central-illinois
12724 <-> agricultural-revolution-vs-the-industrial-revolution
12724 <-> alternatives-to-global-challenges
12724 <-> ambient-infrastructures-generator-life-in-nigeria
12724 <-> amongst-relatives
12724 <-> an-anthropocene-challenge-creating-models-for-landscape-scale-change
12724 <-> an-anthropocene-in-two-parts
12724 <-> anthropocene-archeology-of-the-present
12724 <-> anthropocene-campus-melbourne
12724 <-> anthropocene-campus-melbourne-2018-a-report
12724 <-> anthropocene-campus-philadelphia
12724 

12724 <-> one-place-many-names
12724 <-> one-world-solid-and-cracked
12724 <-> open-seminars
12724 <-> orbital-geopolitics
12724 <-> outrunning-the-anthropocene
12724 <-> over-the-levee-under-the-plow
12724 <-> oysters-selective-pressures-and-antibiotic-resistance-in-the-mississippi-delta
12724 <-> paddle-waves-and-sparkling-light
12724 <-> panel-talk-beween-patricia-piccinini-and-claudia-vickers
12724 <-> parallax-lisboa
12724 <-> patagonia
12724 <-> performing-the-anthropocene
12724 <-> peter-schlemihl-exploring-anthropocenic-landscapes
12724 <-> pharmocracy-access-and-care
12724 <-> phenomenal-machines-2
12724 <-> place-and-space
12724 <-> plant-domestication-and-dispersal
12724 <-> planting-a-seed-is-revolutionary-act
12724 <-> plastic-and-surrogacy
12724 <-> pockets-reflections-on-the-anthropocene-campus-melbourne
12724 <-> praying-for-the-water-with-saundi-mcclain-kloeckner
12724 <-> producing-the-anthropocene-producing-the-future
12724 <-> project-launch-minneapolis
12724 <-> pr

12854 <-> 1569173216724-johnwkim
12854 <-> 1569205099993-temporarycontinent
12854 <-> 1569206419929-temporarycontinent
12854 <-> 1569269908704-emily-sekine
12854 <-> 1569295654021-underhil
12854 <-> 1569300049721-temporarycontinent
12854 <-> 1569434224893-andrea
12854 <-> 1569460031529-sallydonovan1
12854 <-> 1569508021189-smallmythologies
12854 <-> 1569536291150-temporarycontinent
12854 <-> 1569604286805-temporarycontinent
12854 <-> 1569604564103-temporarycontinent
12854 <-> 1569618088611-underhil
12854 <-> 1569632409044-temporarycontinent
12854 <-> 1569635109935-temporarycontinent
12854 <-> 1569694184485-johnwkim
12854 <-> 1569695150061-johnwkim
12854 <-> 1569785205345-temporarycontinent
12854 <-> 1569786670515-temporarycontinent
12854 <-> 1569868503866-johnwkim
12854 <-> 1569891674801-temporarycontinent
12854 <-> 1569939440894-johnwkim
12854 <-> 1570020384463-s-kanouse
12854 <-> 1570020385379-s-kanouse
12854 <-> 1570057470897-temporarycontinent
12854 <-> 1570124673780-emily-sekine
1

12854 <-> an-anthropocene-in-two-parts
12854 <-> anthropocene-archeology-of-the-present
12854 <-> anthropocene-campus-melbourne
12854 <-> anthropocene-campus-melbourne-2018-a-report
12854 <-> anthropocene-campus-philadelphia
12854 <-> anthropocene-campus-venice-2021
12854 <-> anthropocene-east-asia
12854 <-> anthropocene-india
12854 <-> anthropocene-lecture-john-mcneill
12854 <-> anthropocene-lecture-julia-adeney-thomas
12854 <-> anthropocene-lecture-karen-litfin
12854 <-> anthropocene-lecture-philippe-descola
12854 <-> anthropocene-lecture-prasannan-parthasarathi
12854 <-> anthropocene-lecture-sheila-jasanoff
12854 <-> anthropocene-reinsurance-corporation-anthrore
12854 <-> anthropocene-river-campus-opening-plenary
12854 <-> anthropocene-river-campus-plenary-i
12854 <-> anthropocene-river-campus-seminar-clashing-temporalities
12854 <-> anthropocene-river-campus-seminar-commodity-flows
12854 <-> anthropocene-river-campus-seminar-exhaustion-and-imagination
12854 <-> anthropocene-river-c

12854 <-> praying-for-the-water-with-saundi-mcclain-kloeckner
12854 <-> producing-the-anthropocene-producing-the-future
12854 <-> project-launch-minneapolis
12854 <-> prologue-the-fence-concept
12854 <-> re-patterning-with-kudzu
12854 <-> real-estate-river
12854 <-> redeeming-urban-spaces-the-ambivalence-of-building-a-pentecostal-city-in-lagos-nigeria
12854 <-> remnant-formations
12854 <-> reshaping-the-shape-embodiment-ecology-and-culture-of-a-postnatural-carp
12854 <-> resisting-the-oblivion-of-eco-colonialism
12854 <-> resonance-vegetale
12854 <-> risk-as-immaterial-raw-material
12854 <-> risk-equity-in-the-louisiana-anthropocene
12854 <-> river-memory
12854 <-> river-semester
12854 <-> river-semester-project
12854 <-> rogue-elements-of-the-upper-mississippi
12854 <-> sacrifice-zones-and-portable-climate
12854 <-> seminar-algorithmic-intermediation-and-smartness
12854 <-> seminar-anthropogenic-landscapes
12854 <-> seminar-archiving
12854 <-> seminar-claims-property
12854 <-> seminar

12856 <-> 1569939440894-johnwkim
12856 <-> 1570020384463-s-kanouse
12856 <-> 1570020385379-s-kanouse
12856 <-> 1570057470897-temporarycontinent
12856 <-> 1570124673780-emily-sekine
12856 <-> 1570125525982-emily-sekine
12856 <-> 1570138958427-temporarycontinent
12856 <-> 1570211207571-temporarycontinent
12856 <-> 1570290161599-temporarycontinent
12856 <-> 1570290670030-temporarycontinent
12856 <-> 1570290751317-temporarycontinent
12856 <-> 1570309350997-temporarycontinent
12856 <-> 1570395312304-johnwkim
12856 <-> 1570462163821-johnwkim
12856 <-> 1570490442173-temporarycontinent
12856 <-> 1570508120454-temporarycontinent
12856 <-> 1570543912990-johnwkim
12856 <-> 1570554719993-temporarycontinent
12856 <-> 1570559932884-johnwkim
12856 <-> 1570661514721-tanadim
12856 <-> 1570810469419-temporarycontinent
12856 <-> 1570863408307-emily-sekine
12856 <-> 1570902829366-johnwkim
12856 <-> 1570906898155-temporarycontinent
12856 <-> 1570911751254-johnwkim
12856 <-> 1570913240650-johnwkim
12856 <->

12856 <-> approaching-a-waterway
12856 <-> arctic-change
12856 <-> art-air-and-ideas-in-the-anthropocene-2
12856 <-> asbestos-in-ambler
12856 <-> axiomatic-earth
12856 <-> bastar-diary
12856 <-> before-i-know-you
12856 <-> between-spaces-between-lines
12856 <-> beyond-property
12856 <-> biometric-capitalism
12856 <-> biosphere-2-the-mars-project
12856 <-> blackboyjoy-on-the-river-eugene-b-redmond-81
12856 <-> blackhawk-park-is-indigenous-land-beyond-acknowledgment
12856 <-> born-secret
12856 <-> bound-with-bright-beautiful-things
12856 <-> broadcasting-live-from-field-station-5-the-culture-of-resistance-by-land-and-sea-in-the-anthropocene
12856 <-> build-your-own-fence
12856 <-> cadillac-ranch-2004
12856 <-> campus-2014
12856 <-> campus-2016
12856 <-> check-my-pulse
12856 <-> chinas-blue-territory-and-the-technosphere-in-maritime-east-asia
12856 <-> citizenship-and-technologies-of-bordering
12856 <-> closing-discussion-with-akeel-bilgrami-and-bernd-scherer
12856 <-> comics-and-graphic-

12856 <-> seminar-film-commodity-flows
12856 <-> seminar-film-exhaustion-and-imagination
12856 <-> seminar-film-un-bounded-engineering-and-evolutionary-stability
12856 <-> seminar-filtering-the-anthropocene
12856 <-> seminar-governing-the-technosphere
12856 <-> seminar-imaging-the-anthropocene
12856 <-> seminar-modelling-wicked-problems
12856 <-> seminar-reflection-2
12856 <-> seminar-reflections-algorithmic-intermediation-and-smartness
12856 <-> seminar-report-anthropogenic-landscapes
12856 <-> seminar-report-filtering-the-anthropocene
12856 <-> seminar-report-geo-politics
12856 <-> seminar-report-slow-media
12856 <-> seminar-risk-equity
12856 <-> seminar-romancing-the-anthropocene
12856 <-> seminar-sensing
12856 <-> seminar-sensing-the-insensible
12856 <-> seminar-slow-media
12856 <-> seminar-techno-metabolism
12856 <-> seminar-un-bounded-engineering-and-evolutionary-stability
12856 <-> seminar-valuing-nature
12856 <-> sever-pre-emptive-strategy-for-an-open-arctic
12856 <-> shadowing

KeyboardInterrupt: 

Create a similarity file for each article by taking its (incomplete) raw similarity file and adding the missing similarities from the counterpart files:

In [20]:
import json
import numpy
import os

# ---

# Directory with the raw per-article similarity files ('<article_id>.json').
source_dir = './data/analysis/article_similarities/raw'
# Merged article contents; only its keys (the article ids) are used in this cell.
article_contents_filepath = './data/preprocess/merged/fully_merged_contents.json'
# Directory that receives the completed per-article similarity files.
output_dir = './data/analysis/article_similarities'

# ---

def get_similarity_from_file(filepath, article_id):
    """Return the value stored under ``article_id`` in a JSON file.

    Args:
        filepath: Path of a JSON file whose top level is an object.
        article_id: Key to look up in that object.

    Returns:
        The value found under ``article_id``.

    Raises:
        KeyError: If ``article_id`` is not a key of the loaded object.
    """
    with open(filepath) as handle:
        similarities_by_id = json.load(handle)
        return similarities_by_id[article_id]

# ---

# Delete previously created files. Only '*.json' entries directly inside
# output_dir are removed; os.listdir does not descend into subdirectories.
os.makedirs(output_dir, exist_ok=True)
for filename in os.listdir(output_dir):
    if filename.endswith(".json"):
        os.remove(os.path.join(output_dir, filename))

# ---

# Load article contents to get ids
with open(article_contents_filepath) as article_contents_file:
    article_contents = json.load(article_contents_file)
complete_article_ids = list(article_contents.keys())
complete_article_id_set = set(complete_article_ids)

# Parsed raw similarity files keyed by article id. Every raw file is read and
# parsed at most once instead of once per article pair.
raw_similarities_cache = {}

# Iterate through files and add missing data
for article_id in complete_article_ids:
    similarity_file_path = '%s/%s.json' % (source_dir, article_id)
    if not os.path.exists(similarity_file_path):
        continue

    if article_id not in raw_similarities_cache:
        with open(similarity_file_path) as similarity_file:
            raw_similarities_cache[article_id] = json.load(similarity_file)

    # Work on a copy so the cached raw data is never modified.
    raw_similarities = dict(raw_similarities_cache[article_id])

    # Ids without an entry in this article's raw file, excluding the article
    # itself. Sorted so that entries with equal similarity keep a
    # deterministic relative order in the output.
    missing_article_ids = sorted(
        complete_article_id_set - set(raw_similarities) - {article_id}
    )

    for missing_article_id in missing_article_ids:
        if missing_article_id not in raw_similarities_cache:
            # A missing counterpart file raises FileNotFoundError here.
            missing_article_filepath = '%s/%s.json' % (source_dir, missing_article_id)
            with open(missing_article_filepath) as missing_article_file:
                raw_similarities_cache[missing_article_id] = json.load(missing_article_file)

        # Take the value the counterpart file stores for this article.
        raw_similarities[missing_article_id] = raw_similarities_cache[missing_article_id][article_id]

    # Sort data and write to json file
    sorted_similarities = dict(sorted(raw_similarities.items(), key=lambda item: item[1], reverse=True))
    outfile_path = '%s/%s.json' % (output_dir, article_id)
    with open(outfile_path, 'w') as outfile:
        json.dump(sorted_similarities, outfile, sort_keys=False, indent=4)