## en_core_web_sm

In [1]:
import json
import spacy
import os
import time
import string

In [2]:
# Load the en_core_web_sm spaCy pipeline once; preprocess_data_phrases uses
# this module-level `nlp` object.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

In [3]:
# Create the output directory for this model's results, including any missing
# parent directories; an already-existing directory is not an error.
tokenized_output_dir = '../data/tokenized/SpaCy/en_core_web_sm'
os.makedirs(tokenized_output_dir, exist_ok=True)

I remove punctuation before applying the NER model to avoid unnecessary tokens

In [4]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Nothing is inserted in place of a removed character, so the characters on
    either side of it become adjacent (``"stop-now"`` becomes ``"stopnow"``).
    Characters outside ``string.punctuation`` are kept unchanged.
    """
    punctuation_marks = set(string.punctuation)
    return "".join(char for char in text if char not in punctuation_marks)

I lowercase most of the text but keep named entities in their original form.

In [5]:
def lowercase_non_entities(doc):
    """Join the token texts of ``doc`` with single spaces, lowercasing non-entities.

    A token whose ``ent_type_`` is non-empty keeps its original casing; every
    other token's text is lowercased. ``doc`` may be any iterable of objects
    exposing ``text`` and ``ent_type_`` attributes.
    """
    pieces = []
    for token in doc:
        if token.ent_type_:
            # Part of a named entity: keep the surface form as written.
            pieces.append(token.text)
        else:
            pieces.append(token.text.lower())
    return " ".join(pieces)


In [6]:
def preprocess_data_phrases(dataset, batch_size=1000):
    """Extract noun phrases for each dataset entry, lowercased except for entities.

    Parameters
    ----------
    dataset : iterable of mapping
        Each entry is indexed with ``'text'``, which must hold either a string
        or a list of strings; a list is joined with single spaces.
    batch_size : int, optional
        Batch size forwarded to ``nlp.pipe``.

    Returns
    -------
    list of list of str
        One inner list per entry, in input order. Each inner list holds the
        noun chunks of the punctuation-stripped text, each passed through
        ``lowercase_non_entities``.
    """
    # Punctuation stripping is a plain string operation, so it is done before
    # the pipeline runs. The previous version pushed every raw text through
    # the full pipeline only to read ``doc.text`` back (which is the input
    # string), then parsed the cleaned text a second time, one document at a
    # time -- doubling the work and bypassing the batching.
    cleaned_texts = [
        remove_punctuation(
            " ".join(entry['text']) if isinstance(entry['text'], list) else entry['text']
        )
        for entry in dataset
    ]

    tokenized_phrases = []
    for processed_doc in nlp.pipe(cleaned_texts, batch_size=batch_size):
        phrases = [chunk.text for chunk in processed_doc.noun_chunks]

        # Each phrase is parsed on its own (as before), so the entity labels
        # used for casing come from the isolated phrase, not from the
        # surrounding document. Batched through nlp.pipe instead of one
        # nlp() call per phrase.
        phrase_docs = nlp.pipe(phrases, batch_size=batch_size)
        tokenized_phrases.append(
            [lowercase_non_entities(phrase_doc) for phrase_doc in phrase_docs]
        )

    return tokenized_phrases

I used nlp.pipe to process the data in batches for better performance on our large datasets.

In [7]:
def _load_json(path):
    """Read and return the JSON document stored at ``path``."""
    # Decode explicitly as UTF-8: without an encoding argument, open() uses a
    # platform-dependent default, which can mis-decode or reject non-ASCII
    # text on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# One cleaned dataset per speaker, loaded from the cleaned-data directory.
biden_data = _load_json('../data/cleaned/cleaned_biden_data.json')
obama_data = _load_json('../data/cleaned/cleaned_obama_data.json')
trump_data = _load_json('../data/cleaned/cleaned_trump_data.json')

In [8]:
# Wall-clock timestamp taken before the dataset cells run; the closing timing
# cell subtracts it from its own time.time() reading to report elapsed seconds.
start_time = time.time()

In [9]:
# Extract the Biden noun phrases and write them out as JSON.
print("Processing Biden dataset...")
biden_preprocessed_phrases = preprocess_data_phrases(biden_data)
biden_output_path = '../data/tokenized/SpaCy/en_core_web_sm/preprocessed_biden_phrases_sm.json'
with open(biden_output_path, 'w') as out_file:
    json.dump(biden_preprocessed_phrases, out_file)

In [None]:
# Extract the Obama noun phrases and write them out as JSON.
print("Processing Obama dataset...")
obama_preprocessed_phrases = preprocess_data_phrases(obama_data)
obama_output_path = '../data/tokenized/SpaCy/en_core_web_sm/preprocessed_obama_phrases_sm.json'
with open(obama_output_path, 'w') as out_file:
    json.dump(obama_preprocessed_phrases, out_file)

In [None]:
# Extract the Trump noun phrases and write them out as JSON.
print("Processing Trump dataset...")
trump_preprocessed_phrases = preprocess_data_phrases(trump_data)
trump_output_path = '../data/tokenized/SpaCy/en_core_web_sm/preprocessed_trump_phrases_sm.json'
with open(trump_output_path, 'w') as out_file:
    json.dump(trump_preprocessed_phrases, out_file)

In [None]:
# Report the wall-clock seconds elapsed since `start_time` was recorded.
# Both readings come from time.time(), so the two cells must keep using the
# same clock source.
end_time = time.time()
print(f"Preprocessing completed in {end_time - start_time} seconds")