In [None]:
from ipywidgets import interact_manual
from IPython.core.display import HTML

import xml.etree.ElementTree as ET
import ipywidgets as widgets
import pandas as pd

import json
import re

### (Outdated) Using XML

You can ignore this section for now. Instead, focus your attention on the `Using JSON` section below, which is the one currently in development.

In [None]:
# Parse the XML export; 'bare.xml' is a relative path, so it must be in the
# current working directory.
tree = ET.parse('bare.xml')
root = tree.getroot()
# First child of the root element; extract_article_info indexes into it to
# pick a single article.
# NOTE(review): presumably the RSS <channel> element (the JSON section reads
# data['rss']['channel']) -- confirm against the file.
all_articles_xml = root[0]

In [None]:
# NOTE(review): the slider starts at 57; the reason is not visible in this
# notebook (presumably earlier children are not articles) -- confirm.
@interact_manual(idx=widgets.IntSlider(min=57, max=len(all_articles_xml)-1, step=1))
def extract_article_info(idx):
    """Render the XML article at position ``idx`` as HTML in the cell output.

    Parameters
    ----------
    idx : int
        Position of the article element within ``all_articles_xml``.

    Notes
    -----
    Fields are read by position, not by tag name: the first four children
    are taken as title, link, pubDate and creator, and the seventh child as
    the content. Elements with a different child layout will be rendered
    incorrectly or raise.
    """
    article = all_articles_xml[idx]

    # Format for earlier articles, not sure if it works for all
    title, _link, pubDate, creator = article[:4]
    content = article[6]

    return_str = f"{title.text}<br><br>{pubDate.text}<br><br>{creator.text}<br><br>{content.text}"
    display(HTML(return_str))

### Using JSON

Refer to this section for loading and parsing old articles from the `bare.json` file. The file must be in the same working directory as this notebook.

In [None]:
# Load file into JSON object.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("bare.json", encoding="utf-8") as f:
    data = json.load(f)

# Get all articles and authors
all_articles_json = data['rss']['channel']['item']
all_authors_json = data['rss']['channel']['author']

# Map usernames to author display names ("First Last", title-cased)
author_map = {}
for author_d in all_authors_json:
    creator = author_d['author_login']['__cdata']
    first = author_d['author_first_name']['__cdata']
    last = author_d['author_last_name']['__cdata']
    author_map[creator] = f"{first} {last}".title()

# Map each normalized (lower-cased, stripped) title to its article index.
# Articles sharing a normalized title (including every article without a
# title, which falls back to "No Title") collapse to the last index seen.
title_to_idx = {}
for idx, article in enumerate(all_articles_json):
    title = article.get("title", {}).get("__cdata", "No Title")
    cleaned_title = title.lower().strip()
    title_to_idx[cleaned_title] = idx

In [None]:
@interact_manual(title="Enter Your Title Here")
def extract_article_info_json(title):
    """Look up an article by title and render it as HTML in the cell output.

    The lookup lower-cases and strips the input, which is the same
    normalization used for the keys of ``title_to_idx``.

    Parameters
    ----------
    title : str
        Title of the article to display. If no article has this title, a
        short message is printed and nothing is rendered.
    """
    idx = title_to_idx.get(title.lower().strip())
    if idx is None:
        # Unknown titles (including the untouched placeholder text) should
        # produce a readable message, not a KeyError traceback.
        print(f"No article found with title {title!r}")
        return
    article = all_articles_json[idx]

    # Get relevant attributes of article, filling in where not available
    title = article.get('title', {}).get('__cdata', 'No Title')
    date = article.get('pubDate', 'No Publish Date')
    creator = article.get('creator', {}).get('__cdata', 'No Creator')

    # Match creator to display name; fall back to the raw creator value when
    # it has no entry in author_map (e.g. the 'No Creator' placeholder).
    author = author_map.get(creator, creator)

    # Grab all HTML content. 'encoded' may be absent, so default to no parts.
    all_content = article.get('encoded') or []
    if isinstance(all_content, dict):
        # NOTE(review): a lone 'encoded' entry may arrive as a dict rather
        # than a list, depending on the converter -- handled defensively.
        all_content = [all_content]
    content = "".join(c.get('__cdata') or "" for c in all_content)

    # Render as HTML string
    return_str = f"""
                <h1>{title}</h1><br><br><b>Date Published: 
                </b>{date}<br><br><b>Author: </b>{author}<br><br>{content}
                """

    # Show HTML string in notebook output
    display(HTML(return_str))

In [None]:
# Create DataFrame with titles and dates
titles, days, months, years = [], [], [], []
# Matches the start of a date such as "Mon, 02 Jan 2006" and captures
# the day, the three-letter month and the year.
pattern = r"[\w]{3}, ([\d]{2}) ([\w]{3}) ([\d]{4})"

for a in all_articles_json:
    title = a.get('title', {}).get('__cdata', 'No Title')
    date = a.get('pubDate', 'NO DATE')

    # Test for a failed parse explicitly instead of relying on the TypeError
    # raised when subscripting a None match; non-string dates are treated as
    # unparseable too.
    match = re.match(pattern, date) if isinstance(date, str) else None
    if match is not None:
        day, month, year = int(match[1]), match[2], int(match[3])
    else:
        # Keep the title so the row with the bad/missing date stays
        # identifiable; only the date parts are left empty.
        day, month, year = None, None, None

    titles.append(title)
    days.append(day)
    months.append(month)
    years.append(year)

# Show first few rows of the table
dates = pd.DataFrame(data={"Title": titles, "Day": days, "Month": months, "Year": years})
dates.head()