# Data Cleaning

In [9]:
import json
import os
import string

In [10]:
# Load the raw press-statement exports, one JSON file per administration.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default.
with open("../data/raw/all_obama_press_statements_sec_of_state_v2.json", "r", encoding="utf-8") as obama_file:
    obama_data = json.load(obama_file)

with open("../data/raw/all_trump_press_statements_sec_of_state.json", "r", encoding="utf-8") as trump_file:
    trump_data = json.load(trump_file)

with open("../data/raw/all_biden_press_statements_sec_of_state_v2.json", "r", encoding="utf-8") as biden_file:
    biden_data = json.load(biden_file)

In [11]:
def find_fields(data):
    """Return the set of every key that appears in at least one entry of ``data``.

    ``data`` is iterated once; each element must provide ``.keys()``.
    An empty ``data`` yields an empty set.
    """
    return set().union(*(record.keys() for record in data))

# Collect the key set of each raw dataset.
biden_fields, obama_fields, trump_fields = map(
    find_fields, (biden_data, obama_data, trump_data)
)

# Print the key sets so they can be compared across datasets.
for label, fields in (
    ("Biden fields:", biden_fields),
    ("Obama fields:", obama_fields),
    ("Trump fields:", trump_fields),
):
    print(label, fields)

Biden fields: {'date', 'text', 'tags', 'document_type', 'title', 'type_of_release', 'link', 'page_url', 'publish_date', 'document_author', 'title_of_release'}
Obama fields: {'text', 'tags', 'document_type', 'title', 'document_author_title', 'link', 'document_author_name', 'publish_date', 'document_author', 'title_of_release'}
Trump fields: {'date', 'text', 'tags', 'document_type', 'title', 'type_of_release', 'link', 'page_url', 'publish_date', 'document_author', 'title_of_release'}


We should check for missing values in common fields across all datasets.

Common fields:
- title_of_release
- document_author
- document_type
- publish_date
- text
- tags
- link

In [12]:
def keep_common_fields(data, common_fields):
    """Project every entry of ``data`` onto ``common_fields``.

    Returns a new list containing one new dict per entry, each holding
    exactly the keys listed in ``common_fields``. A key that is absent from
    an entry is given the placeholder "Unknown"; values that are present
    (whatever they are) are copied unchanged. The input is not modified.
    """
    return [
        {name: record.get(name, "Unknown") for name in common_fields}
        for record in data
    ]

# Fields that appear in the printed key sets of all three datasets.
common_fields = [
    'title_of_release',
    'document_author',
    'document_type',
    'publish_date',
    'text',
    'tags',
    'link',
]

# Reduce every dataset to those fields.
biden_cleaned, obama_cleaned, trump_cleaned = (
    keep_common_fields(raw_entries, common_fields)
    for raw_entries in (biden_data, obama_data, trump_data)
)

# Sanity check: the projection must not change the number of entries.
print(f"Biden cleaned entries: {len(biden_cleaned)}")
print(f"Obama cleaned entries: {len(obama_cleaned)}")
print(f"Trump cleaned entries: {len(trump_cleaned)}")

Biden cleaned entries: 7737
Obama cleaned entries: 15712
Trump cleaned entries: 7464


I decided to drop all non-common fields (fields that are not shared by all three datasets are not important for our analysis) and to fill any field that is missing from an entry with the value "Unknown".

In [13]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    NOTE(review): this helper was called but never defined in the notebook,
    so the cell failed on a fresh kernel. Stripping ASCII punctuation is the
    assumed behaviour -- confirm it matches the original helper.
    """
    return text.translate(str.maketrans('', '', string.punctuation))


def remove_punctuation_from_list(text_list):
    """Strip punctuation from each string in ``text_list``.

    Args:
        text_list: an iterable of strings, or a single string. A single
            string (for example the "Unknown" placeholder that
            ``keep_common_fields`` inserts for a missing field) is treated
            as a one-item list instead of being split into characters.

    Returns:
        A new list with one cleaned string per input item.
    """
    if isinstance(text_list, str):
        # Iterating a bare string would yield its individual characters.
        text_list = [text_list]
    return [remove_punctuation(text) for text in text_list]

# Strip punctuation from the 'text' field of every entry in each dataset.
# NOTE(review): each entry dict is replaced by its processed text only, so the
# other common fields are not carried into the *_cleaned lists that get saved.
# Confirm that this is intended.
obama_cleaned, trump_cleaned, biden_cleaned = (
    [remove_punctuation_from_list(entry['text']) for entry in entries]
    for entries in (obama_cleaned, trump_cleaned, biden_cleaned)
)

In [14]:
# Create the output directory (and any missing parents) for the cleaned files;
# exist_ok=True makes this a no-op when the directory already exists.
os.makedirs('../data/cleaned', exist_ok=True)

In [15]:
# Write each cleaned dataset to its own JSON file in the cleaned-data folder.
for output_path, cleaned_dataset in (
    ('../data/cleaned/cleaned_obama_data.json', obama_cleaned),
    ('../data/cleaned/cleaned_trump_data.json', trump_cleaned),
    ('../data/cleaned/cleaned_biden_data.json', biden_cleaned),
):
    with open(output_path, 'w') as output_file:
        json.dump(cleaned_dataset, output_file)
