In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' style for every figure produced below
sns.set_style( style='whitegrid' )

# Parameters

In [None]:
# Month and day on which each reporting year begins; it is combined with a
# year number below to build the edges of the yearly date bins.
year_start = 'September 1'

In [None]:
# Path of the exported CSV report that is loaded in the preprocessing section
data_fp = './data/exports/News_Report_2023-07-25.csv'

In [None]:
# Columns to break the yearly article counts down by (one figure per column).
# Each is split on '|' before counting, so a cell may hold several values.
groupings = [
    'Research Topics',
    'Press Types',
    'Categories',
]

# Preprocessing

In [None]:
# Read the exported report, parsing the 'Date' column as datetimes
df = pd.read_csv( data_fp, parse_dates=[ 'Date' ] )

In [None]:
# Drop drafts: rows dated in 1970 are treated as drafts
# (presumably an unset/epoch timestamp in the export --- confirm)
is_draft = df['Date'].dt.year == 1970
df.drop( index=df.index[is_draft], inplace=True )

In [None]:
# Drop weird articles---ancient ones w/o a title or press type.
# A row is removed if either of these columns is missing.
required_columns = [ 'Title', 'Press Types' ]
df.dropna( subset=required_columns, inplace=True )

In [None]:
# Turn HTML-escaped ampersands back into plain '&' in the text columns
html_escaped_columns = ( 'Title', 'Research Topics', 'Categories' )
for column_name in html_escaped_columns:
    unescaped = df[column_name].str.replace( '&amp;', '&' )
    df[column_name] = unescaped

In [None]:
# Get date bins: one bin per reporting year, each starting on `year_start`.
# The edges are derived from the data alone (not from the wall clock), so the
# bins are the same whenever the notebook is run.
first_date = df['Date'].min()
last_date = df['Date'].max()
# The year-start of the previous calendar year is always on or before the
# earliest article; the year-start of the next calendar year is always after
# the latest one.
start_year = first_date.year - 1
end_year = last_date.year + 1
date_bins = pd.date_range(
    '{} {}'.format( year_start, start_year ),
    '{} {}'.format( year_start, end_year ),
    freq = pd.offsets.DateOffset( years=1 ),
)
# Keep edges only up to the first one strictly after the latest article, so
# there is no empty trailing bin.
n_edges_needed = ( date_bins <= last_date ).sum() + 1
date_bins = date_bins[:n_edges_needed]
# Each bin is labelled by the calendar year in which it starts
date_bin_labels = date_bins.year[:-1]

In [None]:
# Add the year published (using the above start date).
# right=False makes each bin [start, next start): an article dated exactly on
# the year-start day belongs to the year that begins that day, not the one
# that just ended.
df['Year'] = pd.cut( df['Date'], date_bins, labels=date_bin_labels, right=False )

# Visualize

In [None]:
# For every grouping column, plot the number of articles per year as one line
# per group value, and save the figure as a PDF under ./figures/.
for group_by_i in groupings:
    df_i = df.copy()

    # A cell may list several '|'-separated values; split and explode so that
    # every (article, value) pair gets its own row.
    df_i[group_by_i] = df_i[group_by_i].str.split( '|' )
    df_i = df_i.explode( group_by_i )

    # Articles per year (rows) and per group value (columns).
    # fill_value=0: a year/value pair with no articles is a count of zero, not
    # a missing value, so the plotted lines are not interrupted.
    # observed=False: pinned explicitly so the result does not depend on the
    # pandas version's default handling of the categorical 'Year' column.
    counts = df_i.pivot_table(
        index = 'Year',
        columns = group_by_i,
        values = 'id',
        aggfunc = 'count',
        fill_value = 0,
        observed = False,
    )

    years = counts.index.astype( int )

    facet_grid = sns.relplot(
        data = counts,
        kind = 'line',
        dashes = False,
        linewidth = 3,
        aspect = 2,
    )
    ax = facet_grid.ax
    ax.set_xlim( years[0], years[-1] )
    # Anchor the y-axis at zero while keeping the automatically chosen upper limit
    ax.set_ylim( 0, ax.get_ylim()[1] )
    ax.set_xticks( years )
    ax.set_ylabel( 'Count' )

    save_fp = './figures/count_per_year.{}.pdf'.format( group_by_i.lower().replace( ' ', '_' ) )

    # savefig does not create missing directories, so make sure the target exists
    os.makedirs( os.path.dirname( save_fp ), exist_ok=True )
    facet_grid.figure.savefig( save_fp, bbox_inches='tight' )