In [1]:
import pandas as pd
import spacy
import os

In [2]:
# Directory that receives the preprocessed JSON files written later in this notebook.
preprocessed_dir = '../data/preprocessed'
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(preprocessed_dir, exist_ok=True)

In [3]:
# Read the three raw press-release dumps, one DataFrame per administration.
df_obama, df_trump, df_biden = (
    pd.read_json(raw_path)
    for raw_path in (
        "../data/raw/obama_press_releases.json",
        "../data/raw/trump_press_releases.json",
        "../data/raw/biden_press_releases.json",
    )
)

In [4]:
# Print the first rows of each dataset to eyeball the schema.
for frame in (df_obama, df_trump, df_biden):
    print(frame.head())

                                                link  \
0  https://2009-2017.state.gov/r/pa/prs/ps/2009/d...   
1  https://2009-2017.state.gov/r/pa/prs/ps/2009/d...   
2  https://2009-2017.state.gov/r/pa/prs/ps/2009/d...   
3  https://2009-2017.state.gov/secretary/20092013...   
4  https://2009-2017.state.gov/r/pa/prs/ps/2009/d...   

                                    title_of_release  \
0                 Michelle Kwan To Visit South Korea   
1  U.S. Department of State Honors Nestor Tedesco...   
2                           Burma's Independence Day   
3                 Republic of Haiti Independence Day   
4                U.S. Welcomes Chad-Sudan Engagement   

                                               title    document_type  \
0                 Michelle Kwan To Visit South Korea       Media Note   
1  U.S. Department of State Honors Nestor Tedesco...       Media Note   
2                           Burma's Independence Day  Press Statement   
3                 Republic of Hait

Fields in our datasets:
- link: URL of the original press release.
- title_of_release: The title of the press release.
- title: The shortened or reformatted title.
- document_type: Type of the document (e.g., Media Note, Press Statement, Interview).
- document_author: The entity or person responsible for the release.
- publish_date: The date the document was published.
- text: The body text of the release.
- tags: Tags associated with the press release.

Preprocessing choices:

We tokenize the text and lowercase every token except named entities, which keep their original capitalization.

We remove punctuation.

We prepare the text for the Hugging Face NER model.

We'll use spaCy for basic text preprocessing (tokenization and punctuation removal) because it integrates well with Hugging Face models.

In [5]:
# Load the spaCy "en_core_web_sm" pipeline once; preprocess_text() reuses this
# module-level `nlp` object for every document it processes.
nlp = spacy.load("en_core_web_sm")

In [6]:
# text preprocessing: tokenization + remove punctuation + retain capitalization for NEs
def preprocess_text(text):
    """Tokenize a press-release body, drop punctuation, and lowercase non-entity tokens.

    Parameters
    ----------
    text : str | list[str] | None
        The release body. A list is joined with single spaces first, because
        the `text` column may contain lists of strings instead of one string.
        Missing values (None / NaN) are treated as an empty document.

    Returns
    -------
    str
        The kept tokens joined by single spaces. Tokens that spaCy tagged as
        part of a named entity keep their original casing; every other token
        is lowercased. Punctuation tokens are removed.
    """
    # The text column may hold lists of strings, so join them into one string.
    if isinstance(text, list):
        text = " ".join(text)

    # A missing body would make nlp() raise and abort the whole .apply();
    # return an empty result for it instead.
    if not isinstance(text, str) and pd.isna(text):
        return ""
    # An empty string produces an empty result anyway; skip the model call.
    if text == "":
        return ""

    doc = nlp(text)
    tokens = [
        token.text if token.ent_type_ else token.text.lower()
        for token in doc
        if not token.is_punct
    ]
    return " ".join(tokens)

In [7]:
# Flatten list-valued bodies into a single string, then derive the cleaned column.
obama_text = df_obama['text'].apply(
    lambda body: body if not isinstance(body, list) else " ".join(body)
)
df_obama['text'] = obama_text
df_obama['processed_text'] = obama_text.apply(preprocess_text)

In [8]:
# Sanity check: compare the raw and cleaned text on the first few rows.
preview_cols = ['text', 'processed_text']
print(df_obama[preview_cols].head())

                                                text  \
0  Michelle Kwan will visit Korea for the U.S. De...   
1  The U.S. Department of State has named Nestor ...   
2  Monday, January 4, 2010, marks the 62nd annive...   
3  On behalf of President Obama and the people of...   
4  The United States welcomes the continued engag...   

                                      processed_text  
0  Michelle Kwan will visit Korea for the U.S. De...  
1  The U.S. Department of State has named Nestor ...  
2  Monday January 4 2010 marks the 62nd anniversa...  
3  on behalf of president Obama and the people of...  
4  The United States welcomes the continued engag...  


In [9]:
# Write the Obama frame as JSON Lines (one record per line).
obama_out_path = os.path.join(preprocessed_dir, 'obama_preprocessed.json')
df_obama.to_json(obama_out_path, orient='records', lines=True)

In [10]:
# Trump releases: flatten list-valued bodies, build the cleaned column, export as JSON Lines.
trump_text = df_trump['text'].apply(
    lambda body: body if not isinstance(body, list) else " ".join(body)
)
df_trump['text'] = trump_text
df_trump['processed_text'] = trump_text.apply(preprocess_text)
trump_out_path = os.path.join(preprocessed_dir, 'trump_preprocessed.json')
df_trump.to_json(trump_out_path, orient='records', lines=True)

In [11]:
# Biden releases: flatten list-valued bodies, build the cleaned column, export as JSON Lines.
biden_text = df_biden['text'].apply(
    lambda body: body if not isinstance(body, list) else " ".join(body)
)
df_biden['text'] = biden_text
df_biden['processed_text'] = biden_text.apply(preprocess_text)
biden_out_path = os.path.join(preprocessed_dir, 'biden_preprocessed.json')
df_biden.to_json(biden_out_path, orient='records', lines=True)