# Status report

By Ben Welsh

Generates basic statistics from [News Homepages database extracts](https://palewi.re/docs/news-homepages/extracts.html).

In [1]:
import os
import json
import pandas as pd
import altair as alt
from pathlib import Path
from datetime import datetime, timedelta

In [2]:
# Grayscale color scheme used by the chart theme.
palette = {
    "black": "#000000",
    "white": "#ffffff",
    "default": "#cecece",
    "accent": "#727272",
    # Sequential ramp running from the darkest gray to white.
    "ramp": [
        "#5e5e5e",
        "#727272",
        "#858585",
        "#989898",
        "#aaaaaa",
        "#bcbcbc",
        "#c4c4c4",
        "#cecece",
        "#e0e0e0",
        "#f0f0f0",
        "#ffffff",
    ],
}

In [3]:
def theme():
    """
    Build the @palewire theme for Altair.

    Returns:
        dict: A mapping with a single ``config`` key holding the chart
        configuration: fonts, default mark colors taken from the
        module-level ``palette``, and axis, legend and range settings.
    """
    font_stack = '"Libre Franklin","Helvetica","Liberation Sans",Arial,sans-serif'
    base_color = palette['default']
    accent_color = palette['accent']
    grays = palette['ramp']

    # Headline (chart title) styling
    headline = {
        'anchor': 'start',
        'font': font_stack,
        'fontColor': palette['black'],
        'fontSize': 20,
        'fontWeight': 'bold',
        'lineHeight': 26,
    }

    # Title styling shared by axes and legends
    title_style = {
        'titleFont': font_stack,
        'titleFontSize': 14,
        'titleFontWeight': 'bold',
    }

    # Base size for tick labels; legend entries are one point larger
    label_font_size = 12

    config = {
        'view': {'width': 500, 'height': 300, 'stroke': 0},
        'padding': {'top': 15, 'bottom': 15, 'left': 10, 'right': 10},
        'background': palette['white'],
        'title': headline,
        # Every mark type defaults to the same gray
        'arc': {'fill': base_color},
        'area': {'fill': base_color},
        'line': {'stroke': base_color, 'strokeWidth': 3},
        'path': {'stroke': base_color},
        'rect': {'fill': base_color},
        'shape': {'stroke': base_color},
        'bar': {'fill': base_color},
        'point': {'stroke': base_color},
        'symbol': {'fill': base_color, 'size': 30},
        'axis': {
            **title_style,
            'labelFont': font_stack,
            'labelFontSize': label_font_size,
            'labelFontWeight': 'normal',
            'tickColor': accent_color,
            'labelColor': accent_color,
            'stroke': base_color,
        },
        'axisX': {
            'labelAngle': 0,
            'labelPadding': 10,
            'labelFlush': True,
            'tickSize': 0,
            'domain': False,
        },
        'axisY': {
            'labelBaseline': 'middle',
            'labelPadding': 5,
            'labelFlush': True,
            'tickSize': 0,
            'titleAlign': 'left',
            'titleAngle': 0,
            'titleX': -45,
            'titleY': -11,
            'domain': False,
        },
        'legend': {
            **title_style,
            'symbolType': 'square',
            'labelFont': font_stack,
            'labelFontSize': label_font_size + 1,
        },
        'range': {
            'heatmap': grays,
            'ordinal': grays,
            'ramp': grays,
        },
    }

    return {'config': config}

In [4]:
# Register the theme function under the name 'palewire' ...
alt.themes.register('palewire', theme)
# ... then enable it. This call is the cell's last expression, so its result is displayed.
alt.themes.enable('palewire')

ThemeRegistry.enable('palewire')

In [5]:
# Resolve paths relative to the current working directory. The previous
# Path("__file__") used a quoted string literal (not the __file__ variable),
# whose parent is "." and therefore also resolved to the working directory.
this_dir = Path.cwd()

In [6]:
# Sibling "newshomepages/sources" directory, one level above this_dir
sources_dir = this_dir.parent.joinpath("newshomepages", "sources")

In [7]:
# Sibling "extracts/csv" directory, one level above this_dir
extracts_dir = this_dir.parent.joinpath("extracts", "csv")

In [8]:
# Read only the four columns used below, parsing `mtime` as a datetime.
df = pd.read_csv(
    extracts_dir.joinpath("screenshot-files.csv"),
    usecols=["identifier", "handle", "file_name", "mtime"],
    parse_dates=["mtime"],
)

In [9]:
# Keep only the calendar date of each modification time
df["date"] = df["mtime"].dt.date

In [10]:
# Convert the date objects back to pandas datetimes so they can be
# compared against pandas timestamps later on
df["date"] = pd.to_datetime(df.date)

How many total sites?

In [11]:
# Number of distinct values in the `handle` column
total_sites = len(df["handle"].unique())

In [12]:
# Display the number of distinct site handles.
total_sites

815

How many total screenshots?

In [13]:
# One row per screenshot file, so the row count is the screenshot count
total_screenshots = len(df.index)

In [14]:
# Display the total number of rows in the screenshot extract.
total_screenshots

85926

When did we start?

In [15]:
# Earliest date in the data. Series.min() is vectorized and skips missing
# values, unlike the builtin min(), which iterates the column in Python.
start_date = df["date"].min()

In [16]:
# Display the earliest date found in the data.
start_date

Timestamp('2022-03-22 00:00:00')

How many screenshots in the last week?

In [17]:
# Current local calendar date
today = datetime.today().date()

In [18]:
# Display today's date.
today

datetime.date(2022, 8, 18)

In [19]:
# The date exactly seven days before today
one_week_ago = today - timedelta(weeks=1)

In [20]:
# Display the date seven days before today.
one_week_ago

datetime.date(2022, 8, 11)

In [21]:
# Rows dated strictly after the one-week-ago cutoff
is_recent = df["date"] > pd.to_datetime(one_week_ago)
df_this_week = df[is_recent]

In [22]:
# Row count of the past-week subset
screenshots_this_week = df_this_week.shape[0]

In [23]:
# Display the number of screenshots dated within the past week.
screenshots_this_week

10582

Write out data points

In [24]:
# Headline numbers to be written out as JSON
output = {
    "total_sites": total_sites,
    "total_screenshots": total_screenshots,
    "screenshots_this_week": screenshots_this_week,
}

In [25]:
# Write the data points next to the notebook. The context manager makes sure
# the file handle is flushed and closed once the JSON has been written.
with open(this_dir / 'status-report.json', 'w') as report_file:
    json.dump(output, report_file, indent=2)

Chart the number of sites by date

In [26]:
# Count the distinct handles seen on each date, ordered by date
sites_by_date = (
    df[["date", "handle"]]
    .drop_duplicates()
    .groupby("date")
    .size()
    .rename("sites")
    .reset_index()
    .sort_values("date")
)

In [27]:
# Mean over a sliding window of seven rows (one row per date)
sites_by_date["rolling_mean"] = sites_by_date["sites"].rolling(window=7).mean()

In [28]:
# Base chart over every row except the last (most recent) one
chart = alt.Chart(
    sites_by_date.iloc[:-1],
    title="Sites archived by day",
)

# Daily totals as light gray bars
bars = chart.mark_bar(fill="#cecece").encode(
    x=alt.X(
        "date:T",
        title=None,
        timeUnit="yearmonthdate",
        axis=alt.Axis(format="%B %-d", grid=False),
    ),
    y=alt.Y("sites:Q", title=None),
)

# Rolling mean as a darker line
line = chart.mark_line(color="#727272", strokeWidth=3).encode(
    x="date:T",
    y="rolling_mean:Q",
)

# Shared encoding that selects the rolling mean at the latest date
label = chart.encode(
    x=alt.X("max(date):T"),
    y=alt.Y("rolling_mean:Q", aggregate=alt.ArgmaxDef(argmax="date")),
    text="rolling_mean",
)

# Text label (built here but not included in the layered chart below)
text = label.mark_text(align="left", dx=4)

# Circle annotation at the latest rolling-mean value
circle = label.mark_circle(size=75, color="#727272")

bars + line + circle

In [29]:
# Only export the PNG when the CI environment variable is set and non-empty
if os.environ.get("CI"):
    layered = bars + line + circle
    layered.save(this_dir / 'sites-by-date.png')

Chart the number of screenshots by date

In [30]:
# Count the rows (screenshots) recorded on each date, ordered by date
screenshots_by_date = (
    df.groupby("date")
    .size()
    .rename("screenshots")
    .reset_index()
    .sort_values("date")
)

In [31]:
# Mean over a sliding window of seven rows (one row per date)
screenshots_by_date["rolling_mean"] = (
    screenshots_by_date["screenshots"].rolling(window=7).mean()
)

In [32]:
# Base chart over every row except the last (most recent) one
chart = alt.Chart(
    screenshots_by_date.iloc[:-1],
    title="Screenshots saved by day",
)

# Daily totals as light gray bars
bars = chart.mark_bar(fill="#cecece").encode(
    x=alt.X(
        "date:T",
        title=None,
        timeUnit="yearmonthdate",
        axis=alt.Axis(format="%B %-d", grid=False),
    ),
    y=alt.Y("screenshots:Q", title=None),
)

# Rolling mean as a darker line
line = chart.mark_line(color="#727272", strokeWidth=3).encode(
    x="date:T",
    y="rolling_mean:Q",
)

# Shared encoding that selects the rolling mean at the latest date
label = chart.encode(
    x=alt.X("max(date):T"),
    y=alt.Y("rolling_mean:Q", aggregate=alt.ArgmaxDef(argmax="date")),
    text="rolling_mean",
)

# Text label (built here but not included in the layered chart below)
text = label.mark_text(align="left", dx=4)

# Circle annotation at the latest rolling-mean value
circle = label.mark_circle(size=75, color="#727272")

bars + line + circle

In [33]:
# Only export the PNG when the CI environment variable is set and non-empty
if os.environ.get("CI"):
    layered = bars + line + circle
    layered.save(this_dir / 'screenshots-by-date.png')