Prepocess text: lowercasing, removing punctuation and stopwords (excluding may, will, etc.)

In [2]:
%pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl.metadata (40 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl (287 kB)
Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
Installing collected packages: regex, joblib, nltk
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [nltk][32m2/3[0m [nltk]b]
[1A[2KSuccessfully installed joblib-1.5.1 nltk-3.9.1 regex-2024.11.6
Note: you may need to restart the kernel to use updated packages.


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Download resources once
nltk.download("punkt")
nltk.download("stopwords")

# Define the modals to preserve
MODAL_VERBS = {
    "can", "could", "may", "might", "must", "shall", "should", "will", "would"
}

# Build custom stopword list (remove modal verbs from standard stopwords)
stop_words = set(stopwords.words("english")) - MODAL_VERBS

def remove_special_chars(text):
    # Keep standard ASCII characters and typical punctuation
    return re.sub(r"[^\x00-\x7F]+", "", text)

def preprocess_text(raw_text):
    """
    Preprocess the input text by:
    - Removing non-ASCII characters
    - Lowercasing
    - Removing punctuation (but preserving sentence structure)
    - Removing stopwords (except modal verbs)
    Returns a list of cleaned sentences.
    """
    preprocessed_sentences = []

    # Clean unusual special characters
    raw_text = remove_special_chars(raw_text)

    # Sentence tokenize
    sentences = sent_tokenize(raw_text)

    for sentence in sentences:
        # Lowercase
        sentence = sentence.lower()

        # Remove basic punctuation (keep alphanumeric and whitespace)
        sentence = re.sub(r"[^a-z0-9\s]", "", sentence)

        # Tokenize and filter stopwords
        words = word_tokenize(sentence)
        filtered_words = [w for w in words if w not in stop_words]

        # Reconstruct cleaned sentence
        cleaned_sentence = " ".join(filtered_words)
        preprocessed_sentences.append(cleaned_sentence)

    return preprocessed_sentences

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/valeriiaklynna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valeriiaklynna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from pathlib import Path

# Path setup
repo_root = Path().resolve().parent
input_folder = repo_root / "2_output" / "extracted_text_indiv"
output_folder = repo_root / "2_output" / "extracted_text_indiv"
output_folder.mkdir(parents=True, exist_ok=True)

for filename in ["High_Emission.txt", "Low_Emission.txt"]:
    input_path = input_folder / filename
    output_path = output_folder / filename.replace(".txt", "_preprocessed.txt")

    with open(input_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    preprocessed_sentences = preprocess_text(raw_text)

    with open(output_path, "w", encoding="utf-8") as out_f:
        out_f.write("\n".join(preprocessed_sentences))

    print(f"✅ Preprocessed: {filename} → {output_path.name}")

✅ Preprocessed: High_Emission.txt → High_Emission_preprocessed.txt
✅ Preprocessed: Low_Emission.txt → Low_Emission_preprocessed.txt


Hedging anf forward-looking trends

In [9]:
# Hedging and forward-looking phrases
HEDGING_PHRASES = [
    "may", "might", "could", "suggest", "possibly", "potentially", "assume",
    "expected to", "anticipated", "planned", "typically", "generally", "likely",
    "intend", "estimate", "believe", "aim", "tend to"
]

# Forward-looking indicators (future promises/plans)
FORWARD_LOOKING_PHRASES = [
    "will", "plan to", "aim to", "target", "commit to", "intend to",
    "expect", "expected to", "will reduce", "future", "in the coming years", "forecast", "projection"
]

# Stopwords minus modal verbs
MODALS = {"can", "could", "may", "might", "must", "shall", "should", "will", "would"}
stop_words = set(stopwords.words("english")) - MODALS

# === Functions ===
def count_phrases(text, phrases):
    """
    Count occurrences of each phrase in the text (case-insensitive).
    """
    counts = Counter()
    for phrase in phrases:
        # Match whole words or exact phrases, ignoring case
        pattern = re.compile(rf"\b{re.escape(phrase)}\b", flags=re.IGNORECASE)
        matches = pattern.findall(text)
        if matches:
            counts[phrase] = len(matches)
    return counts

In [12]:
from collections import Counter

high_path = Path("../2_output/extracted_text_indiv/High_Emission_preprocessed.txt")
low_path = Path("../2_output/extracted_text_indiv/Low_Emission_preprocessed.txt")

with open(high_path, "r", encoding="utf-8") as f:
    high_text = f.read()

with open(low_path, "r", encoding="utf-8") as f:
    low_text = f.read()

# Count hedging and forward-looking
high_hedging = count_phrases(high_text, HEDGING_PHRASES)
low_hedging = count_phrases(low_text, HEDGING_PHRASES)

high_forward = count_phrases(high_text, FORWARD_LOOKING_PHRASES)
low_forward = count_phrases(low_text, FORWARD_LOOKING_PHRASES)

In [15]:
from IPython.display import display

# Combine results
hedging_df = pd.DataFrame([high_hedging, low_hedging], index=["High", "Low"]).fillna(0).astype(int)
forward_df = pd.DataFrame([high_forward, low_forward], index=["High", "Low"]).fillna(0).astype(int)

# Display in notebook
print(" Hedging Phrase Frequency:")
display(hedging_df)

print("\n Forward-Looking Phrase Frequency:")
display(forward_df)

🔍 Hedging Phrase Frequency:


Unnamed: 0,may,might,could,suggest,possibly,potentially,assume,anticipated,planned,typically,generally,likely,intend,estimate,believe,aim
High,1713,94,1000,3,17,240,19,281,403,65,208,207,26,128,70,806
Low,3284,152,1481,4,24,413,40,300,344,115,423,441,35,440,177,1189



🔮 Forward-Looking Phrase Frequency:


Unnamed: 0,will,target,expect,will reduce,future,forecast,projection
High,2471,2477,64,24,1033,35,12
Low,4031,3043,259,23,1264,88,15


In [17]:
from collections import Counter

# Reconstruct counters
high_counts_hedging = Counter(high_hedging)
low_counts_hedging = Counter(low_hedging)

# Show results
print("🔎 Top Hedging Phrases (High Emission):")
print(high_counts_hedging.most_common(10))

print("\n🔎 Top Hedging Phrases (Low Emission):")
print(low_counts_hedging.most_common(10))

🔎 Top Hedging Phrases (High Emission):
[('may', 1713), ('could', 1000), ('aim', 806), ('planned', 403), ('anticipated', 281), ('potentially', 240), ('generally', 208), ('likely', 207), ('estimate', 128), ('might', 94)]

🔎 Top Hedging Phrases (Low Emission):
[('may', 3284), ('could', 1481), ('aim', 1189), ('likely', 441), ('estimate', 440), ('generally', 423), ('potentially', 413), ('planned', 344), ('anticipated', 300), ('believe', 177)]


In [19]:
high_counts_forward = Counter(high_forward)
low_counts_forward = Counter(low_forward)

print("🔎 Top Forwand Looking (High Emission):")
print(high_counts_forward.most_common(10))

print("\n🔎 Top Forward Looking (Low Emission):")
print(low_counts_forward.most_common(10))

🔎 Top Forwand Looking (High Emission):
[('target', 2477), ('will', 2471), ('future', 1033), ('expect', 64), ('forecast', 35), ('will reduce', 24), ('projection', 12)]

🔎 Top Forward Looking (Low Emission):
[('will', 4031), ('target', 3043), ('future', 1264), ('expect', 259), ('forecast', 88), ('will reduce', 23), ('projection', 15)]
