# 03-mediacloud-publish-date
**Purpose**:
- check distribution of missing publication timestamps across outlets
    - (1) no datetime value in `publish_date`
    - (2) datetime is exactly at 05:00 AM UTC (midnight EST)
    
**Note**:
- Original intention was to add this analysis to `01-mediacloud-prep-urls.ipynb`. However, it is better to keep this separate (despite duplicating some code) since running `01-mediacloud-prep-urls.ipynb` again in Dec. 2022 shows different results for Media Cloud's `story_count` field vs. the original Nov. 2021 results (which the paper's main data processing and analyses are based on).

In [1]:
import os
import pandas as pd

dir_int = os.path.join("..", "..", "data", "02-intermediate", "03-mediacloud")

from usrightmedia.shared.media_references import get_mediacloud_outlet_ids
from usrightmedia.shared.datetime_utils import END_TIME, START_TIME, EST_to_UTC

In [2]:
from usrightmedia.shared.loggers import get_logger
LOGGER = get_logger(filename = '03-mediacloud-publish-date', logger_type='main')

In [3]:
df_outlets = get_mediacloud_outlet_ids()

In [4]:
df_outlets

Unnamed: 0,outlet,media_id
0,American Renaissance,26186
1,Breitbart,19334
2,Daily Caller,18775
3,Daily Stormer,113988
4,Fox News,1092
5,Gateway Pundit,25444
6,InfoWars,18515
7,Newsmax,25349
8,One America News,127733
9,Rush Limbaugh,24669


## 1. Load Media Cloud stories and prepare columns

- combine dataframes of outlets' stories
- convert EST from Media Cloud to UTC
- add timestamp-related booleans (no timestamp info and odd timestamps exactly at 05:00 UTC/midnight EST)

In [5]:
df_stories = []
for media_id in df_outlets['media_id']:
    dfs = pd.read_pickle(os.path.join(dir_int, f'stories_{media_id}.pkl'))
    df_stories.append(dfs)

df_stories = pd.concat(df_stories).reset_index(drop=True)

# fix data types
df_stories['stories_id'] = df_stories['stories_id'].astype('str')
df_stories['publish_date'] = pd.to_datetime(df_stories['publish_date'])
df_stories['publish_date'] = df_stories['publish_date'].map(lambda t: EST_to_UTC(t)) # timezone-aware
df_stories['media_id'] = df_stories['media_id'].astype('str')

# add date buckets
df_stories['publish_week'] = df_stories['publish_date'].dt.to_period('W').dt.to_timestamp()
df_stories['publish_month'] = df_stories['publish_date'].dt.to_period('M').dt.to_timestamp()
df_stories['publish_year'] = df_stories['publish_date'].dt.to_period('Y').dt.to_timestamp()

# add timestamp-related booleans
df_stories['publish_no_info'] = df_stories['publish_date'].isna()
df_stories["publish_0500_utc"] = df_stories["publish_date"].map(lambda t: all([t.hour==5, t.minute==0, t.second==0]))

# add outlet names
df_stories = df_stories.merge(df_outlets[['media_id', 'outlet']],on='media_id',how='left')

  df_stories['publish_week'] = df_stories['publish_date'].dt.to_period('W').dt.to_timestamp()
  df_stories['publish_month'] = df_stories['publish_date'].dt.to_period('M').dt.to_timestamp()
  df_stories['publish_year'] = df_stories['publish_date'].dt.to_period('Y').dt.to_timestamp()


In [6]:
df_stories["publish_date"].dtype

datetime64[ns, UTC]

In [7]:
df_stories[['publish_date', 'publish_no_info', 'publish_0500_utc']]

Unnamed: 0,publish_date,publish_no_info,publish_0500_utc
0,2016-02-18 12:00:00+00:00,False,False
1,2016-05-06 22:46:37+00:00,False,False
2,2016-03-09 23:47:01+00:00,False,False
3,2016-02-24 23:20:05+00:00,False,False
4,2016-01-22 22:43:55+00:00,False,False
...,...,...,...
968581,2020-02-24 05:00:00+00:00,False,True
968582,2020-08-31 04:00:00+00:00,False,False
968583,2020-11-15 05:00:00+00:00,False,True
968584,2020-09-27 04:00:00+00:00,False,False


### 2. `publish_date` field with no information

In [8]:
df_stories['publish_no_info'].value_counts()

False    962856
True       5730
Name: publish_no_info, dtype: int64

In [9]:
df_no_info = df_stories.pivot_table(index='outlet',columns='publish_no_info', aggfunc='size').reset_index().rename_axis(None, axis=1)
df_no_info = df_no_info.rename(columns={False: 'false', True: 'true'})
df_no_info['total_count'] = df_no_info['false'] + df_no_info['true']
df_no_info['false_pct'] = round(df_no_info['false'] / df_no_info['total_count']*100,2)
df_no_info['true_pct'] = round(df_no_info['true'] / df_no_info['total_count']*100,2)
df_no_info

Unnamed: 0,outlet,false,true,total_count,false_pct,true_pct
0,American Renaissance,9838,4,9842,99.96,0.04
1,Breitbart,149962,29,149991,99.98,0.02
2,Daily Caller,121915,8,121923,99.99,0.01
3,Daily Stormer,15826,1,15827,99.99,0.01
4,Fox News,278439,1440,279879,99.49,0.51
5,Gateway Pundit,39411,2,39413,99.99,0.01
6,InfoWars,28459,146,28605,99.49,0.51
7,Newsmax,71163,62,71225,99.91,0.09
8,One America News,117291,409,117700,99.65,0.35
9,Rush Limbaugh,9396,1,9397,99.99,0.01


### 3. `publish_date` field with timestamp exactly at 05:00 UTC (midnight EST)

In [10]:
df_stories['publish_0500_utc'].value_counts()

False    942382
True      26204
Name: publish_0500_utc, dtype: int64

In [11]:
df_0500_utc = df_stories.pivot_table(index='outlet',columns='publish_0500_utc', aggfunc='size').reset_index().rename_axis(None, axis=1)
df_0500_utc = df_0500_utc.rename(columns={False: 'false', True: 'true'})
df_0500_utc['total_count'] = df_0500_utc['false'] + df_0500_utc['true']
df_0500_utc['false_pct'] = round(df_0500_utc['false'] / df_0500_utc['total_count']*100,2)
df_0500_utc['true_pct'] = round(df_0500_utc['true'] / df_0500_utc['total_count']*100,2)
df_0500_utc

Unnamed: 0,outlet,false,true,total_count,false_pct,true_pct
0,American Renaissance,9824.0,18.0,9842.0,99.82,0.18
1,Breitbart,149534.0,457.0,149991.0,99.7,0.3
2,Daily Caller,117877.0,4046.0,121923.0,96.68,3.32
3,Daily Stormer,15801.0,26.0,15827.0,99.84,0.16
4,Fox News,269416.0,10463.0,279879.0,96.26,3.74
5,Gateway Pundit,39367.0,46.0,39413.0,99.88,0.12
6,InfoWars,28549.0,56.0,28605.0,99.8,0.2
7,Newsmax,69570.0,1655.0,71225.0,97.68,2.32
8,One America News,117700.0,,,,
9,Rush Limbaugh,9338.0,59.0,9397.0,99.37,0.63


#### 3.1 Washington Examiner

In [12]:
df_we = df_stories.loc[df_stories['outlet']=='Washington Examiner'].reset_index(drop=True)
df_we_no_info = df_we.loc[df_we['publish_no_info']==True].reset_index(drop=True)
df_we_0500_utc = df_we.loc[df_we['publish_0500_utc']==True].reset_index(drop=True)

In [13]:
df_we_count = pd.DataFrame(df_we.groupby('publish_year')['outlet'].count()).reset_index().rename(columns={'outlet': 'total_published'})
df_we_0500_utc_count = pd.DataFrame(df_we_0500_utc.groupby('publish_year')['outlet'].count()).reset_index().rename(columns={'outlet': 'published_0500_utc'})
df_we_0500_utc_smry = df_we_count.merge(df_we_0500_utc_count, how='inner', on='publish_year',validate='one_to_one')
df_we_0500_utc_smry

Unnamed: 0,publish_year,total_published,published_0500_utc
0,2016-01-01,20041,960
1,2017-01-01,26144,2926
2,2018-01-01,10950,3084
3,2019-01-01,7781,1740
4,2020-01-01,6818,190


In [14]:
df_we_0500_utc_smry['published_0500_utc_pct'] = round(df_we_0500_utc_smry['published_0500_utc'] / df_we_0500_utc_smry['total_published']*100,2)
df_we_0500_utc_smry

Unnamed: 0,publish_year,total_published,published_0500_utc,published_0500_utc_pct
0,2016-01-01,20041,960,4.79
1,2017-01-01,26144,2926,11.19
2,2018-01-01,10950,3084,28.16
3,2019-01-01,7781,1740,22.36
4,2020-01-01,6818,190,2.79


### 4. `publish_date` field with no information or timestamp exactly at 05:00 UTC (midnight EST)

- In Python, True equals 1 and False equals 0.
- If either 'bad timestamp' condition is True, the output will be equal to 1 (since the two conditions are exclusive, the value can't be 2).
- Flip 1 and 0 using bitwise NOT
    - https://stackoverflow.com/a/8305225/7016397
    - https://en.wikipedia.org/wiki/Bitwise_operation#NOT

In [15]:
df_stories['usable_timestamp'] = df_stories['publish_no_info'] + df_stories['publish_0500_utc']
df_stories['usable_timestamp'] = df_stories['usable_timestamp'].astype(int)
df_stories['usable_timestamp'] = df_stories['usable_timestamp'].map(lambda x: ~x+2)

In [16]:
df_stories.dtypes

stories_id                       object
publish_date        datetime64[ns, UTC]
title                            object
url                              object
language                         object
ap_syndicated                      bool
themes                           object
media_id                         object
media_name                       object
media_url                        object
publish_week             datetime64[ns]
publish_month            datetime64[ns]
publish_year             datetime64[ns]
publish_no_info                    bool
publish_0500_utc                   bool
outlet                           object
usable_timestamp                  int64
dtype: object

In [17]:
df_stories[['publish_no_info', 'publish_0500_utc', 'usable_timestamp']]

Unnamed: 0,publish_no_info,publish_0500_utc,usable_timestamp
0,False,False,1
1,False,False,1
2,False,False,1
3,False,False,1
4,False,False,1
...,...,...,...
968581,False,True,0
968582,False,False,1
968583,False,True,0
968584,False,False,1


In [18]:
df_stories.groupby(['publish_no_info', 'publish_0500_utc', 'usable_timestamp']).size().reset_index().rename(columns={0: 'count'})

Unnamed: 0,publish_no_info,publish_0500_utc,usable_timestamp,count
0,False,False,1,936652
1,False,True,0,26204
2,True,False,0,5730


In [19]:
df_usable_ts = df_stories.pivot_table(index='outlet',columns='usable_timestamp', aggfunc='size').reset_index().rename_axis(None, axis=1)
df_usable_ts = df_usable_ts.rename(columns={False: 'false', True: 'true'})
df_usable_ts['total_count'] = df_usable_ts['false'] + df_usable_ts['true']
df_usable_ts['false_pct'] = round(df_usable_ts['false'] / df_usable_ts['total_count']*100,2)
df_usable_ts['true_pct'] = round(df_usable_ts['true'] / df_usable_ts['total_count']*100,2)
df_usable_ts

Unnamed: 0,outlet,false,true,total_count,false_pct,true_pct
0,American Renaissance,22,9820,9842,0.22,99.78
1,Breitbart,486,149505,149991,0.32,99.68
2,Daily Caller,4054,117869,121923,3.33,96.67
3,Daily Stormer,27,15800,15827,0.17,99.83
4,Fox News,11903,267976,279879,4.25,95.75
5,Gateway Pundit,48,39365,39413,0.12,99.88
6,InfoWars,202,28403,28605,0.71,99.29
7,Newsmax,1717,69508,71225,2.41,97.59
8,One America News,409,117291,117700,0.35,99.65
9,Rush Limbaugh,60,9337,9397,0.64,99.36
