**Instructions**

To run this notebook, run each cell in order from top to bottom. The cells at the bottom of the notebook are no longer used, so do not run them; they are kept only for the record.

**Runtime**

Cleaning one year of data takes roughly 9 hours, though it may finish faster.

**Tips**

Google Colab is prone to disconnecting if your computer goes to sleep or your Wi-Fi drops, so keep your computer on and awake and make sure your internet connection is stable.

In [None]:
# Mount Google Drive at /content/drive; the dataset folder configured below
# lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import spacy
import string
import pandas as pd
import nltk
from tqdm import tqdm


In [None]:
# Runtime configuration for the preprocessing run.

# Folder holding the per-year input CSVs; outputs are written here as well.
dataset_folder = "/content/drive/MyDrive/2024SUDSProject/datasets/"

# One input file is processed per entry, in the order listed.
years = ["2022"]

# Number of CSV rows read per pandas chunk (tune based on memory).
chunk_size = 2000

# Number of texts handed to spaCy per batch in nlp.pipe.
spacy_batch_size = 50

# When True, tokens are additionally filtered against the NLTK word list.
use_english_words = True


In [None]:
# Build the translation table used to strip unwanted characters from text.
# Every printable ASCII character that is neither a letter nor whitespace
# (i.e. digits and punctuation) is deleted outright.
char_removal_dict = {
    char: ''
    for char in string.printable
    if char not in string.ascii_letters and char not in string.whitespace
}
# Newlines are turned into a space rather than deleted: deleting them would
# fuse the last word of one line with the first word of the next
# ("end\nstart" -> "endstart"), while all other whitespace is kept as-is.
char_removal_dict['\n'] = ' '
removal_table = str.maketrans(char_removal_dict)


In [None]:
# Load the "en_core_web_sm" spaCy pipeline once, up front. The "parser" and
# "ner" components are disabled because this notebook only reads each token's
# lemma, text and stop-word flag from the processed documents.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


In [None]:
# Optional vocabulary filter: when enabled, download the NLTK "words" corpus
# and hold it as a set for fast membership tests; otherwise leave it as None
# so that no vocabulary filtering is applied.
if not use_english_words:
    english_words_set = None
else:
    nltk.download('words', quiet=True)
    from nltk.corpus import words
    english_words_set = set(words.words())


In [None]:
def normalize_text(text):
    """Return ``text`` lowercased with unwanted characters stripped.

    The value is coerced with ``str`` first, so non-string cells are accepted,
    then lowercased and passed through the module-level ``removal_table``.
    """
    lowered = str(text).lower()
    return lowered.translate(removal_table)

def lemmatize_texts(texts):
    """Lemmatize a batch of texts with spaCy, dropping stop words.

    Args:
        texts: Iterable of strings, streamed through ``nlp.pipe`` in batches
            of ``spacy_batch_size``.

    Returns:
        A list with one entry per input text, each a list of lemma strings.
        Stop words and whitespace-only tokens are omitted. When
        ``english_words_set`` is not None, a token is additionally kept only
        if its text is a member of that set.
    """
    vocabulary = english_words_set

    def keep(token):
        # Runs of extra whitespace (e.g. left behind where punctuation was
        # stripped) come back from spaCy as tokens of their own. They are not
        # stop words, so without this check they would be emitted as lemmas
        # whenever the vocabulary filter is disabled.
        if token.is_stop or token.is_space:
            return False
        # NOTE(review): membership is tested on the token text, not on the
        # lemma, so inflected forms absent from the word list are dropped
        # even if their lemma is present -- confirm this is intended.
        return vocabulary is None or token.text in vocabulary

    return [
        [token.lemma_ for token in doc if keep(token)]
        for doc in nlp.pipe(texts, batch_size=spacy_batch_size)
    ]


In [None]:
# Process each year's CSV chunk by chunk, writing every processed chunk
# straight to the output file to keep memory usage bounded.
for year in years:
    input_path = dataset_folder + f'combined_data_{year}.csv'
    output_path = dataset_folder + f'combined_data_preprocessed_{year}_lemma.csv'

    chunk_reader = pd.read_csv(input_path, chunksize=chunk_size)
    progress = tqdm(chunk_reader, miniters=1, desc='Loading data')
    for chunk_number, chunk in enumerate(progress):
        # Missing content is treated as an empty string before normalization.
        cleaned = chunk['content'].fillna('').map(normalize_text)
        chunk['content'] = lemmatize_texts(cleaned.tolist())

        # The first chunk creates (or overwrites) the output file and writes
        # the header row; every later chunk is appended without a header.
        is_first = chunk_number == 0
        chunk.to_csv(
            output_path,
            index=False,
            mode='w' if is_first else 'a',
            header=is_first,
        )
