In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style( 'whitegrid' )

# Parameters

In [3]:
year_start = 'September 1'

In [4]:
data_fp = './data/exports/News_Report_2023-07-25.csv'

In [5]:
groupings = [ 'Research Topics', 'Press Types', 'Categories' ]

# Preprocessing

In [None]:
# Load data
df = pd.read_csv( data_fp, parse_dates=[ 'Date', ] )

In [None]:
# Drop drafts
df.drop( df.index[df['Date'].dt.year == 1970], axis='rows', inplace=True )

In [None]:
# Drop weird articles---ancient ones w/o a title or press type
df.dropna( axis='rows', how='any', subset=[ 'Title', 'Press Types', ], inplace=True )

In [None]:
# Get rid of HTML ampersands
for str_column in [ 'Title', 'Research Topics', 'Categories' ]:
    df[str_column] = df[str_column].str.replace( '&amp;', '&' )

In [None]:
# Get date bins
start_year = df['Date'].min().year - 1
end_year = df['Date'].max().year + 1
date_bins = pd.date_range(
    '{} {}'.format( year_start, start_year ),
    pd.Timestamp.now() + pd.offsets.DateOffset( years=1 ),
    freq = pd.offsets.DateOffset( years=1 ),
)
date_bin_labels = date_bins.year[:-1]

In [None]:
# Add the year published (using the above start date )
df['Year'] = pd.cut( df['Date'], date_bins, labels=date_bin_labels ) 

# Visualize

In [None]:
for group_by_i in groupings:
    df_i = df.copy()

    # Explode and group
    df_i[group_by_i] = df_i[group_by_i].str.split( '|' )
    df_i = df_i.explode( group_by_i )

    counts = df_i.pivot_table( index='Year', columns=group_by_i, values='id', aggfunc='count' )

    years = counts.index.astype( int )

    facet_grid = sns.relplot(
        counts,
        kind = 'line',
        dashes = False,
        linewidth = 3,
        aspect = 2
    )
    facet_grid.ax.set_xlim( years[0], years[-1] )
    facet_grid.ax.set_ylim( 0, facet_grid.ax.get_ylim()[1] )
    ticks = facet_grid.ax.set_xticks( years )
    facet_grid.ax.set_ylabel( 'Count' )

    save_fp = './figures/count_per_year.{}.pdf'.format( group_by_i.lower().replace( ' ', '_' ) )

    facet_grid.fig.savefig( save_fp, bbox_inches='tight' )

# Scratch

In [17]:
df = pd.read_csv( '../test/test_data/press_office.csv' )

In [18]:
df.drop( 'Unnamed: 0', axis='columns', inplace=True )

In [21]:
df['Title (optional)'] = df['Title (optional)'].str.strip()

In [10]:
df.drop( 'Unnamed: 5', axis='columns', inplace=True )

df.dropna( axis='rows', inplace=True )

df = df.set_axis( df.columns.str.strip(), axis='columns' )

for column in df.columns:
    try:
        df[column] = df[column].str.strip()
    except AttributeError:
        continue

for column in df.columns:
    try:
        df[column] = df[column].values.astype( int )
    except ValueError:
        continue

df.to_csv(  '../test/test_data/press_office.csv' )

df.to_xml

# pd.DataFrame.to_excel(

SyntaxError: incomplete input (2071547193.py, line 23)

In [14]:
df.set_index( 'id', inplace=True )

In [13]:
pd.DataFrame.to_excel?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m.[0m[0mto_excel[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexcel_writer[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msheet_name[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'Sheet1'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mna_rep[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfloat_format[0m[0;34m:[0m [0;34m'str | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0;34m'Sequence[Hashable] | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mheader[0m[0;34m:[0m [0;34m'Sequence[Hashable] | bool_t'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0;34m'bool_t'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex_label[0m[0;34m:

In [16]:
df.to_excel( '../test/test_data/press_office.xlsx', )