In [1]:
import json
import spacy
import os
import time

In [2]:
# Noun-chunk extraction relies on both the dependency parse and POS tags, so
# the tagger and parser must stay enabled; only the unused NER component is
# disabled to save time.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

In [3]:
# Make sure the output directory for the tokenized phrase files exists
# (exist_ok=True makes this a no-op when it is already there).
os.makedirs('../data/tokenized', exist_ok=True)

For our analysis, tokenizing into phrases rather than individual words may be a better choice: important multi-word phrases such as "United States of America" would lose their meaning if broken into individual words.

We process the texts in batches (via `nlp.pipe`) to reduce processing time.

In [6]:
def preprocess_data_with_pipe(dataset, batch_size=1000):
    """Extract the noun phrases of every entry in ``dataset`` via ``nlp.pipe``.

    Args:
        dataset: Iterable of mappings with a ``'text'`` key. The value is
            either a string or a list of strings; a list is joined with
            single spaces before being processed.
        batch_size: Batch size forwarded to ``nlp.pipe``.

    Returns:
        A list with one element per document produced by ``nlp.pipe``; each
        element is the list of that document's noun-chunk texts.
    """
    # Normalise every entry to a single string first.
    texts = []
    for entry in dataset:
        raw_text = entry['text']
        texts.append(" ".join(raw_text) if isinstance(raw_text, list) else raw_text)

    return [
        [chunk.text for chunk in doc.noun_chunks]
        for doc in nlp.pipe(texts, batch_size=batch_size)
    ]

In [7]:
def lowercase_text(phrase_list):
    """Return a new list with the lowercase form of each phrase in ``phrase_list``."""
    lowered = []
    for item in phrase_list:
        lowered.append(item.lower())
    return lowered

In [9]:
def preprocess_data(dataset, batch_size=1000):
    """Run the full preprocessing: phrase extraction followed by lowercasing.

    Args:
        dataset: Passed unchanged to ``preprocess_data_with_pipe``.
        batch_size: Batch size passed to ``preprocess_data_with_pipe``.

    Returns:
        A list of phrase lists, each one lowercased with ``lowercase_text``.
    """
    return [
        lowercase_text(chunk_list)
        for chunk_list in preprocess_data_with_pipe(dataset, batch_size)
    ]

In [10]:
# Load the cleaned Biden, Obama and Trump datasets. The encoding is pinned to
# UTF-8 so reading does not depend on the platform's default text encoding.
with open('../data/cleaned/cleaned_biden_data.json', 'r', encoding='utf-8') as f:
    biden_data = json.load(f)

with open('../data/cleaned/cleaned_obama_data.json', 'r', encoding='utf-8') as f:
    obama_data = json.load(f)

with open('../data/cleaned/cleaned_trump_data.json', 'r', encoding='utf-8') as f:
    trump_data = json.load(f)

In [11]:
# Record the wall-clock start so the total preprocessing time can be reported later.
start_time = time.time()

In [None]:
# Extract the lowercased noun phrases of the Biden dataset and save them as JSON.
print("Processing Biden dataset...")
biden_preprocessed_phrases = preprocess_data(dataset=biden_data, batch_size=1000)
with open('../data/tokenized/preprocessed_biden_phrases.json', mode='w') as out_file:
    json.dump(biden_preprocessed_phrases, fp=out_file)

In [None]:
# Extract the lowercased noun phrases of the Obama dataset and save them as JSON.
print("Processing Obama dataset...")
obama_preprocessed_phrases = preprocess_data(dataset=obama_data, batch_size=1000)
with open('../data/tokenized/preprocessed_obama_phrases.json', mode='w') as out_file:
    json.dump(obama_preprocessed_phrases, fp=out_file)

In [None]:
# Extract the lowercased noun phrases of the Trump dataset and save them as JSON.
print("Processing Trump dataset...")
trump_preprocessed_phrases = preprocess_data(dataset=trump_data, batch_size=1000)
with open('../data/tokenized/preprocessed_trump_phrases.json', mode='w') as out_file:
    json.dump(trump_preprocessed_phrases, fp=out_file)

In [None]:
# Report the wall-clock time elapsed since `start_time` was recorded.
end_time = time.time()
print(f"Preprocessing completed in {end_time - start_time} seconds")