# Loading the data set and cleaning

### 1. Importing required libraries

In [None]:
# Uncomment and run this only once to download the model
#!python -m spacy download en_core_web_sm

In [9]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy

In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

# Download the NLTK resources this notebook relies on: 'punkt' for sent_tokenize and
# 'stopwords' for stopwords.words('english'). Re-running is safe; packages that are
# already present are reported as up-to-date.
# NOTE(review): recent NLTK releases may load sentence models from 'punkt_tab' instead
# of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/home/bhatta73/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/home/bhatta73/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 2. Loading cleaned csv file

As filtered_comments.csv is a large file with around 1.8 million comments, we will load the data in chunks to limit memory use. The code chunk below may take 10-15 minutes to run. It also ensures that the dataframe we output contains only unique comments, with no duplicates.

We will also remove rows whose "comment" value is 10 characters or shorter. This ensures that only substantive comments are present in the dataframe, reducing noise. We also remove comments containing the terms 'attached' or 'attachment', as such comments point to a file or attachment and are not actual comments reflecting sentiment toward the EPA.

In [6]:
def load_filtered_unique_comments(input_csv, chunksize=10000, min_length=10):
    """Stream a comments CSV in chunks and return its unique, usable comments.

    A comment is kept when it is non-null, longer than ``min_length`` characters,
    and does not contain the whole word 'attached' or 'attachment' (matched
    case-insensitively). Duplicates are removed across the entire file, keeping
    the first occurrence, so the output preserves first-seen order.

    Args:
        input_csv: Path or file-like object accepted by ``pd.read_csv``; it must
            contain a ``comment`` column (other columns are not read).
        chunksize: Number of CSV rows parsed per chunk.
        min_length: Comments with ``len(comment) <= min_length`` are dropped.

    Returns:
        A DataFrame with a single ``comment`` column holding the kept comments.
    """
    # Loop-invariant pattern: whole-word 'attached' / 'attachment'.
    attachment_pattern = r'\b(?:attached|attachment)\b'

    # Insertion-ordered dict used as an ordered set. Deduplicating through it costs
    # O(rows in chunk) per chunk, whereas Series.isin(<growing set>) has to rebuild
    # a lookup table over every previously seen comment on each chunk.
    unique_comments = {}

    reader = pd.read_csv(input_csv, chunksize=chunksize, usecols=['comment'], low_memory=False)
    for chunk in reader:
        comments = chunk['comment'].dropna().astype(str)

        # Filter out short comments
        comments = comments[comments.str.len() > min_length]

        # Remove comments that only point at an attached file
        is_attachment_ref = comments.str.contains(attachment_pattern, case=False, na=False)

        # Keys that already exist keep their original position, so first-seen order holds
        unique_comments.update(dict.fromkeys(comments[~is_attachment_ref]))

    return pd.DataFrame(list(unique_comments), columns=['comment'])

# Driver for the loader above. `df` is assigned at notebook level and is the
# dataframe the following cells operate on.
if __name__ == "__main__":
    # Relative path: resolved against the notebook's working directory.
    input_path = "../data/filtered_comments.csv"
    df = load_filtered_unique_comments(input_path)
    print("✅ Loaded with shape:", df.shape)


✅ Loaded with shape: (1365767, 1)


We have loaded in 1,365,767 comments!

In [11]:
# Preview the first rows of the loaded comments.
df.head()

Unnamed: 0,comment
0,RE: Docket EPA-R10-OAR-2019-0710 Federal Regi...
1,"People ask why the EPA is gutting regulations,..."
2,Cache Valley needs access to Tier 3 gasoline A...
3,While attainment may appear to have been achie...
4,Yes. I agree that we should commend achieveme...


### 3. Cleaning data

The code below cleans the extracted text, as it may contain HTML entities, HTML tags, URLs, escaped unicode sequences, and other special characters.

In [8]:
# Patterns are compiled once at definition time; clean_text is applied to every comment.
_HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
# Requires a letter right after '<' or '</' so that plain comparisons such as
# "x < 5 and y > 3" are not mistaken for a tag; [^<>]* also spans newlines.
_HTML_TAG_RE = re.compile(r"</?[a-zA-Z][^<>]*>")
# Decimal (&#160;), hexadecimal (&#x27;) and named (&amp;, &frac12;) entities.
_HTML_ENTITY_RE = re.compile(r"&(?:#\d+|#[xX][0-9A-Fa-f]+|[a-zA-Z][a-zA-Z0-9]*);")
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# Literal backslash-u escapes left in the text, e.g. the six characters \u2019.
_UNICODE_ESCAPE_RE = re.compile(r"\\u[0-9A-Fa-f]{4}")
_DISALLOWED_CHAR_RE = re.compile(r"[^\w\s.,!?']")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Clean a raw comment string for downstream sentence splitting.

    Steps, in order: HTML comments, tags and entities are replaced by a space;
    URLs and literal ``\\uXXXX`` escape sequences are removed; every character
    other than word characters, whitespace and ``. , ! ? '`` is removed; runs of
    whitespace are collapsed to one space and the result is stripped.

    Markup is replaced with a space rather than an empty string so that words
    separated only by a tag (e.g. ``Hello<br>World``) are not fused together.

    Args:
        text: The comment text (must be a ``str``).

    Returns:
        The cleaned text; an empty string if nothing remains.
    """
    text = _HTML_COMMENT_RE.sub(" ", text)
    text = _HTML_TAG_RE.sub(" ", text)
    text = _HTML_ENTITY_RE.sub(" ", text)
    text = _URL_RE.sub("", text)
    text = _UNICODE_ESCAPE_RE.sub("", text)
    # Stray '<', '>', '&', ';' etc. that were not part of markup are dropped here.
    text = _DISALLOWED_CHAR_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

# Clean every comment; the result goes to a new column so the raw text stays available.
df["cleaned_comment"] = df["comment"].apply(clean_text)

KeyboardInterrupt: 

To better extract sentiment from the data before passing it to BERT, let's split the comments into sentences and remove stopwords (filler words).

In [None]:
# English stopwords, stored as a set for fast membership checks in the token filter
stop_words = set(stopwords.words('english'))

# spaCy English model; required by sentencize_and_filter below, which uses it to
# tokenize each sentence
nlp = spacy.load("en_core_web_sm")

def sentencize_and_filter(comment):
    """Split a comment into sentences and strip stopwords from each one.

    Each sentence produced by NLTK's ``sent_tokenize`` is tokenized with spaCy,
    tokens whose lowercase form is in ``stop_words`` are dropped, and the
    remaining tokens are re-joined with single spaces (so punctuation becomes a
    separate, space-delimited token). Sentences left with three or fewer tokens
    are discarded.

    Args:
        comment: The (already cleaned) comment text.

    Returns:
        A list of filtered sentence strings, in their original order; empty if
        no sentence has more than three tokens left.
    """
    # NOTE(review): NLTK's English stopword list includes negations such as "not"
    # and "no"; removing them can change a sentence's sentiment — confirm intended.
    kept_sentences = []

    for sent in sent_tokenize(comment):
        # Only token texts are needed, so run just the tokenizer. nlp(sent) would
        # also run the tagger, parser and NER on every sentence and yield the
        # same tokens at much higher cost.
        doc = nlp.make_doc(sent)
        filtered = " ".join(
            token.text for token in doc if token.text.lower() not in stop_words
        )
        # Drop fragments with three or fewer remaining tokens (punctuation counts)
        if len(filtered.split()) > 3:
            kept_sentences.append(filtered)

    return kept_sentences

# Turn every cleaned comment into its list of filtered sentences
df["sentences"] = df["cleaned_comment"].apply(sentencize_and_filter)

# Concatenate the per-comment lists into one flat list, keeping row order
flattened_sentences = []
for comment_sentences in df["sentences"]:
    flattened_sentences.extend(comment_sentences)

# One row per sentence
df_sentences = pd.DataFrame({"sentence": flattened_sentences})
print("✅ Final sentence dataframe shape:", df_sentences.shape)

In [None]:
# Persist the sentence-level dataframe to disk as a pickle (path is relative to the
# notebook's working directory).
df_sentences.to_pickle("../data/df_sentences.pkl")