**Instructions**

The best way to clean the entire dataset is to use the Digital Research Alliance supercluster, which cuts down on runtime. The shell files are also included in the Google Drive.

To run this notebook, run each cell sequentially from top to bottom. The cells at the bottom of the notebook are no longer used, so don't run them; they are kept only for the record.

**Runtime**

Cleaning one year of data takes approximately 9 hours, though it may be faster.

**Tips**

Google Colab is prone to disconnecting if your computer goes to sleep or your Wi-Fi drops, so make sure your computer stays awake and your internet connection is stable.

In [None]:
# Mount Google Drive at /content/drive so the project folders stored there can be read and written
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from tqdm import tqdm
import string
import pandas as pd
import nltk

In [None]:
# Root folder of the project on the mounted Google Drive
project_folder = "/content/drive/MyDrive/2024SUDSProject/"

# The datasets live in a sub-folder of the project root
dataset_folder = project_folder + "datasets/"

In [None]:
# Creates the translation table used to strip numbers and punctuation from text

# Every printable ASCII character that is neither a letter nor whitespace
# (i.e. digits and punctuation) is deleted.
char_removal_dict = {
    char: ''
    for char in string.printable
    if char not in string.ascii_letters and char not in string.whitespace
}

# Newlines become a space instead of being deleted: deleting them would glue
# the last word of one line to the first word of the next ("hello\nworld" ->
# "helloworld"). The extra spaces are collapsed later by remove_extra_spaces.
char_removal_dict['\n'] = ' '

removal_table = str.maketrans(char_removal_dict)

In [None]:
# Porter stemmer instance; its stem() method is applied to each kept word during cleaning
ps = nltk.stem.PorterStemmer()

In [None]:
# Download the NLTK resources used for word cleaning and build the lookup sets

# 'words' and 'stopwords' feed the two sets below. The Punkt tokenizer data is
# required by NLTK's word_tokenize, which the cleaning step calls. Newer NLTK
# releases look for 'punkt_tab' rather than 'punkt', so both are requested;
# a resource unknown to the installed version only prints a message.
for resource in ('words', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

from nltk.corpus import words, stopwords

# Sets are used so that per-word membership checks are fast
english_words_set = set(words.words())
stop_words = set(stopwords.words('english'))



In [None]:
def remove_extra_spaces(text):
    """Collapse every run of whitespace in `text` into a single space.

    Leading and trailing whitespace is dropped as well, because splitting on
    whitespace discards empty pieces at both ends.
    """
    pieces = text.split()
    return ' '.join(pieces)

# Function to remove stopwords and stem the remaining words of a text batch
def remove_stopwords_and_stem_batch(texts):
    """Tokenize, filter and stem every text in a batch.

    For each text, a token is kept only if it is not in `stop_words` and is
    present in `english_words_set`; kept tokens are stemmed with the Porter
    stemmer `ps` and joined back together with single spaces.

    Args:
        texts: iterable of strings (e.g. a dataframe column).

    Returns:
        A list with one cleaned string per input text, in the same order.

    Note:
        NLTK's word_tokenize needs the Punkt tokenizer data to be downloaded
        beforehand, otherwise it raises LookupError.
    """
    filtered_docs = []

    for text in texts:
        # Reach word_tokenize through the already-imported nltk module; the
        # bare name `word_tokenize` was never imported and raised NameError.
        tokens = nltk.word_tokenize(text)

        kept_stems = [
            ps.stem(word)
            for word in tokens
            if word not in stop_words and word in english_words_set
        ]
        filtered_docs.append(' '.join(kept_stems))

    return filtered_docs

# Batch-cleans a dataframe. Beware: the dataframe passed in is modified in place
def clean_dataframe_batch_v2(dataframe):
    """Replace the 'content' column of `dataframe` with its cleaned version.

    The column is run through remove_stopwords_and_stem_batch and the
    resulting list is written back to the same column. Nothing is returned.
    """
    cleaned_content = remove_stopwords_and_stem_batch(dataframe['content'])
    dataframe['content'] = cleaned_content

In [None]:
# Define the chunk size (number of rows to read at a time)
chunk_size = 512

# Debugging limit: maximum number of chunks cleaned per year.
# Set to None to clean every chunk of each yearly file.
max_chunks_per_year = 4

# years = ['2017', '2018', '2019', '2020', '2021', '2022']

years = ['2022']

for year in years:
    # Start with an empty list for every year so that the output file of one
    # year never contains chunks that were processed for a previous year
    processed_chunks = []

    # The reader is used as a context manager so the CSV file is closed even
    # when the loop stops early because of the debugging limit
    with pd.read_csv(dataset_folder+f'combined_data_{year}.csv', chunksize=chunk_size) as chunk_reader:
        # Iterate over chunks of the CSV file; counter restarts at 0 each year
        for counter, chunk in enumerate(tqdm(chunk_reader, miniters=1, desc='Loading data')):

            if max_chunks_per_year is not None and counter >= max_chunks_per_year:
                break

            # Convert all letters to lowercase and remove unnecessary characters
            chunk['content'] = chunk['content'].astype(str).str.lower()
            chunk['content'] = chunk['content'].str.translate(removal_table)
            chunk['content'] = chunk['content'].apply(remove_extra_spaces)

            # Cleans the dataframe (mutates the chunk in place)
            clean_dataframe_batch_v2(chunk)

            # Append the processed chunk to the list
            processed_chunks.append(chunk)

    # pd.concat raises on an empty list, so skip years that produced no chunks
    if not processed_chunks:
        print(f"No rows were processed for {year}; skipping.")
        continue

    # Concatenate processed chunks into a single DataFrame
    processed_data = pd.concat(processed_chunks)

    # Print the first few rows of the processed data
    print(processed_data.head())

    # Convert dataframe back to csv file
    processed_data.to_csv(dataset_folder+f"combined_data_preprocessed_{year}_stem.csv", index=False)
