# Merging SOTU and Newspaper Data

In [21]:
import os

import numpy as np
import pandas as pd

In [22]:
newspapers_file = 'data/raw/newspapers.json'
sotu_file = 'data/raw/sotu.json'
output_file = 'data/raw/merged.json'

## Preperation of News Media Files

In [23]:
dfn = pd.read_json(newspapers_file)
dfn.head()

Unnamed: 0,title,link,YEAR,PRESIDENT,newspaper,text
0,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...
1,"Pumped Up for Event, Diver Finds Feat a B-r-r-...",https://www.proquest.com/latimes/docview/29224...,1986,Reagan,LA Times,-Brad Graske spent the day under the ice on Mc...
2,D-U-M-B!: [Home Edition],https://www.proquest.com/latimes/docview/29226...,1986,Reagan,LA Times,What is it about Reagan Administration secreta...
3,Ex-Black Panther Running for Senate Cleaver Ch...,https://www.proquest.com/latimes/docview/29224...,1986,Reagan,LA Times,The Los Angeles political consultant reached a...
4,First Daughter's Novel Reminiscent of Real Lif...,https://www.proquest.com/latimes/docview/29222...,1986,Reagan,LA Times,"This book is not, repeat not, the story of Ron..."


## Preperation of SOTU Files

In [24]:
dfs = pd.read_json(sotu_file)
dfs = dfs.rename(columns={'text': 'SOTU'})
dfs.head()

Unnamed: 0,link,date,president,SOTU
0,https://www.presidency.ucsb.edu/documents/addr...,2024-03-07,"Joseph R. Biden, Jr.","[Before speaking, the President presented his ..."
1,https://www.presidency.ucsb.edu/documents/addr...,2023-02-07,"Joseph R. Biden, Jr.","The President. Mr. Speaker——\n[At this point, ..."
2,https://www.presidency.ucsb.edu/documents/addr...,2022-03-01,"Joseph R. Biden, Jr.","The President. Thank you all very, very much. ..."
3,https://www.presidency.ucsb.edu/documents/addr...,2020-02-04,"Joseph R. Biden, Jr.",The President. Thank you very much. Thank you....
4,https://www.presidency.ucsb.edu/documents/addr...,2019-02-05,"Joseph R. Biden, Jr.","The President. Madam Speaker, Mr. Vice Preside..."


In [25]:
dfs['YEAR'] = dfs['date'].dt.year

## Merging files

In [26]:
df = pd.merge(
    left=dfn, left_on='YEAR',
    right=dfs[['SOTU', 'president', 'YEAR']], right_on='YEAR',
    how='left'
)
df.shape

(4993, 8)

Even for our super GPU, this was too big to crunch. So we're going to split every story up into paragraphs to make it more palatable.

In [27]:
from tqdm import tqdm
text_col = 'SOTU'
meta_data = [col for col in list(df) if col != text_col]
df_ = []
for (i, rid) in tqdm(enumerate(df.loc[~df['SOTU'].isna()].index)):
    main_data = df[meta_data].loc[rid].to_dict()
    text = df[text_col].loc[rid].split('\n')
    main_data[text_col + '_id'] = i
    for t in text:
        d = main_data.copy()
        d[text_col] = t
        df_ += [d]
df = pd.DataFrame(df_)
df.head()

3955it [00:02, 1960.03it/s]


Unnamed: 0,title,link,YEAR,PRESIDENT,newspaper,text,president,SOTU_id,SOTU
0,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,"Mr. Speaker, Mr. President, distinguished Memb..."
1,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,Thank you for allowing me to delay my address ...
2,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,"Mr. Speaker, before I begin my prepared remark..."
3,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,I have come to review with you the progress of...
4,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,Tonight the American people deserve our thanks...


In [28]:
df.isna().sum()

title        0
link         0
YEAR         0
PRESIDENT    0
newspaper    0
text         0
president    0
SOTU_id      0
SOTU         0
dtype: int64

In [29]:
df['YEAR'].loc[df['SOTU'].isna()].unique()

array([], dtype=int64)

In [30]:
df = df.loc[~df['SOTU'].isna()]
df.shape

(355422, 9)

In [31]:
df.head()

Unnamed: 0,title,link,YEAR,PRESIDENT,newspaper,text,president,SOTU_id,SOTU
0,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,"Mr. Speaker, Mr. President, distinguished Memb..."
1,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,Thank you for allowing me to delay my address ...
2,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,"Mr. Speaker, before I begin my prepared remark..."
3,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,I have come to review with you the progress of...
4,Magnet for Democrats Speculation Focuses on Go...,https://www.proquest.com/latimes/docview/29225...,1986,Reagan,LA Times,He is perhaps the best orator in the Democrati...,"Joseph R. Biden, Jr.",0,Tonight the American people deserve our thanks...


In [33]:
df.isna().sum()

title        0
link         0
YEAR         0
PRESIDENT    0
newspaper    0
text         0
president    0
SOTU_id      0
SOTU         0
dtype: int64

In [32]:
df.to_json(output_file, orient='records')