# Tools for exploring the structure of longform articles

Import the necessary packages.

In [1]:
import xml.etree.ElementTree as et
import numpy as np
import glob
from itertools import cycle
from collections import Counter as c

Load the annotated data from the XML file.

In [2]:
xmlfiles = glob.glob('../01_corpus/*.xml')

screens = [] # Empty list for screens

for xmlfile in xmlfiles: # Loop through each XML file
    annotation = et.parse(xmlfile) # Parse the XML annotation in specific file
    xmlroot = annotation.getroot() # Get the root element
    print xmlroot.attrib['title'], '/', xmlroot.attrib['publisher'], '/', xmlroot.attrib['year'], '\n' # Print longform data
    for entry in xmlroot: # Iterate through the XML file
        screen = xmlroot.attrib['article_identifier'] + '-' + entry.attrib['identifier'], entry.attrib['mode'], entry.attrib['type'], entry.attrib['transition_in'], entry.attrib['transition_out']
        print entry.attrib['identifier'], entry.attrib['mode'], entry.attrib['type'], entry.attrib['transition_in'], entry.attrib['transition_out']
        screens.append(screen)
    print '\n' # Extra line break

A Game of Shark and Minnow - Who Will Win Control of the South China Sea? / The New York Times / 2013 

001 image-flow dynamic none dissolve
002 map dynamic dissolve dissolve
003 photograph static dissolve scroll
004 text-flow static scroll scroll
005 image-flow dynamic scroll dissolve
006 photograph static dissolve dissolve
007 image-flow dynamic dissolve dissolve
008 photograph static dissolve scroll
009 text-flow static scroll scroll
010 map dynamic scroll zoom
011 map dynamic zoom zoom
012 map dynamic zoom zoom
013 map dynamic zoom zoom
014 map dynamic zoom scroll
015 text-flow static scroll scroll
016 image-flow dynamic scroll dissolve
017 photograph static dissolve dissolve
018 photograph static dissolve dissolve
019 image-flow dynamic dissolve scroll
020 text-flow static scroll scroll
021 image-flow dynamic scroll dissolve
022 image-flow dynamic dissolve dissolve
023 image-flow dynamic dissolve scroll
024 text-flow static scroll scroll
025 photograph static scroll dissolve
026 i

### Examining the modes preceding a specific semiotic mode

In [None]:
preceding_mode = []

for current in screens[1:]:
        previous = screens[screens.index(current) -1]
        if current[1] == 'photograph':
            preceding_mode.append(previous[2] + '-' + previous[1])
            
pm = c(preceding_mode)

print c.most_common(pm)

### Examining the modes following a specific semiotic mode

In [None]:
following_modes = []

for current in screens:
    if screens.index(current) != len(screens) -1:
        following = screens[screens.index(current) +1]
        if current[1] == 'text-flow':
            following_modes.append(following[2] + '-' + following[1])
            
fm = c(following_modes)

print c.most_common(fm)

### Examine sequences of semiotic modes

In [14]:
sequence = []

for current in screens:
    if screens.index(current) != len(screens) -1:
        following = screens[screens.index(current) +1]
        if current[1] != 'text-flow':
            if following[1] != 'text-flow':
                sequence.append(following[0] + ' ' + following[1] + ' followed by ' + current[0] + ' ' + current[1])



In [15]:
sequence

['gsm-002 map followed by gsm-001 image-flow',
 'gsm-003 photograph followed by gsm-002 map',
 'gsm-006 photograph followed by gsm-005 image-flow',
 'gsm-007 image-flow followed by gsm-006 photograph',
 'gsm-008 photograph followed by gsm-007 image-flow',
 'gsm-011 map followed by gsm-010 map',
 'gsm-012 map followed by gsm-011 map',
 'gsm-013 map followed by gsm-012 map',
 'gsm-014 map followed by gsm-013 map',
 'gsm-017 photograph followed by gsm-016 image-flow',
 'gsm-018 photograph followed by gsm-017 photograph',
 'gsm-019 image-flow followed by gsm-018 photograph',
 'gsm-022 image-flow followed by gsm-021 image-flow',
 'gsm-023 image-flow followed by gsm-022 image-flow',
 'gsm-026 image-flow followed by gsm-025 photograph',
 'gsm-027 image-flow followed by gsm-026 image-flow',
 'gsm-030 image-flow followed by gsm-029 image-flow',
 'gsm-031 photograph followed by gsm-030 image-flow',
 'gsm-032 photograph followed by gsm-031 photograph',
 'kwc-002 page-flow followed by kwc-001 imag

### Counting modes, transitions and mode-transition patterns

In [None]:
transition_patterns = []

for xmlfile in xmlfiles: # Loop through each XML file
    annotation = et.parse(xmlfile) # Parse the XML annotation in specific file
    xmlroot = annotation.getroot() # Get the root element
    print xmlroot.attrib['title'], '/', xmlroot.attrib['publisher'], '/', xmlroot.attrib['year'], '\n' # Print longform data
    
    content = []
    unique_modes = []
    unique_transitions = []

    for entry in xmlroot: # Iterate through the XML file
        portion = xmlroot.attrib['article_identifier'] + '-' + entry.attrib['identifier'], entry.attrib['mode'], entry.attrib['type'], entry.attrib['transition_in'], entry.attrib['transition_out']
        content.append(portion)
        
        mode = entry.attrib['type'] + ' ' + entry.attrib['mode']
        unique_modes.append(mode)

        transition = entry.attrib['transition_in'] + '-' + entry.attrib['transition_out']
        if 'none' in transition:
            continue
        else:
            unique_transitions.append(transition)        
    
    article_transitions = []
    
    for current in content: # Print the transitions
        if content.index(current) != len(content) -1:
            following = content[content.index(current) +1]
            print 'Transition from ' + current[2] + ' ' + current[1] + ' to ' + following[2] + ' ' + following[1] + ' via ' + current[4] + '/' + following[3] 
            unique_transition = current[2] + ' ' + current[1] + ' to ' + following[2] + ' ' + following[1] + ' via ' + current[4] + '/' + following[3]
            mode = current[2] + ' ' + current[1]
            
            transition_patterns.append(unique_transition)
            article_transitions.append(unique_transition)
        else:
            continue
            
    print '\n' # Extra line break
        
    print 'A total of %i unique modes, %i unique transitions and %i unique mode-transition combinations.' % (len(c(unique_modes)), len(c(unique_transitions)), len(c.most_common(c(article_transitions))))
    
    print '\n' # Extra line break

### Counting the semiotic modes

In [None]:
modes = [] # Empty list for modes

for screen in screens:
    mode = screen[1] + '-' + screen[2]
    modes.append(mode)
    
ms = c(modes)

c.most_common(ms)

### Counting the transitions

In [None]:
transitions = []

for screen in screens:
    transition = screen[3] + '-' + screen[4]
    if 'none' in transition:
        continue
    else:
        transitions.append(transition)

tr = c(transitions)

c.most_common(tr)

### Counting symmetric transitions

In [None]:
sym_transitions = []

for screen in screens:
    transition = screen[3] + '-' + screen[4]
    if screen[3] == screen[4]:
        sym_transitions.append(transition)

symtr = c(sym_transitions)

len(sym_transitions), len(symtr), c.most_common(symtr)

### Counting asymmetric transitions

In [None]:
asym_transitions = []

for screen in screens:
    transition = screen[3] + '-' + screen[4]
    if screen[3] != screen[4]:
        asym_transitions.append(transition)

asymtr = c(asym_transitions)

len(asym_transitions), len(asymtr), c.most_common(asymtr)

### Finding the most common transitions for a specific semiotic mode

In [None]:
single_mode_transitions = []

for m, t in zip(modes, transitions):
    mode_transition = m, t
    if 'text-flow' in m:
        single_mode_transitions.append(mode_transition)

mt = c(single_mode_transitions) # Set up a counter for c.most_common

c.most_common(mt)

Find the most common transition patterns.

In [None]:
tp = c(transition_patterns)

c.most_common(tp)