## Stage 1: Preprocessing & Cleaning

### Setup & Data Loading

In [None]:
# Imports
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from unidecode import unidecode

import contractions

In [None]:
# Load the data: the CSV is pipe-delimited and its first column is used as the index
raw_data = pd.read_csv('../data/esg_documents_for_dax_companies.csv', delimiter = '|', index_col = 0)

In [None]:
# Check loaded data and reset index
# drop=True discards the index that was read from the CSV's first column and
# replaces it with a fresh 0..n-1 range; the last expression previews 15 rows.
raw_data = raw_data.reset_index(drop=True)
raw_data.head(15)

**Column descriptions**
- symbol: stock symbol of the company
- company: company name
- date: publication date of document
- title: document title
- content: document content
- datatype: document type
- internal: whether the document is a report published by the company itself (1) or a third-party document (0)
- domain (optional): Web domain where the document was published
- url (optional): URL where the document can be accessed
- esg_topics (optional): ESG topics extracted from the data using our internal NLP

In [None]:
# Check shape: (number of rows, number of columns)
raw_data.shape

In [None]:
# Check the dtype pandas assigned to each column
raw_data.dtypes

## Data Normalization & Cleaning

First, several steps are applied to normalize the text: lowercase conversion, removal of diacritics, URLs, email addresses and extra whitespace, expansion of contractions, removal of stopwords, and lemmatization (which reduces dimensionality by mapping inflected word forms to a common base form).

In [None]:
# Work on a deep copy so the cleaning steps below write to cleaned_data and leave raw_data untouched
cleaned_data = raw_data.copy(deep=True)

In [None]:

def remove_urls(text):
    """Strip URL-like substrings from ``text``.

    A match begins at ``http``, ``www`` or ``https`` and extends up to the
    next whitespace character.

    Returns:
        A ``(stripped_text, removed_count)`` tuple.
    """
    url_pattern = r'http\S+|www\S+|https\S+'
    # subn replaces and counts in a single pass over the text.
    stripped, removed = re.subn(url_pattern, '', text, flags=re.MULTILINE)
    return stripped, removed

def remove_emails(text):
    """Strip e-mail-like substrings from ``text``.

    A match is any run of non-whitespace characters containing ``@`` with at
    least one character on each side; one directly following whitespace
    character is removed together with it.

    Returns:
        A ``(stripped_text, removed_count)`` tuple.
    """
    mail_pattern = r'\S+@\S+\s?'
    stripped, removed = re.subn(mail_pattern, '', text, flags=re.MULTILINE)
    return stripped, removed

def remove_extra_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends.

    Returns:
        A ``(normalized_text, run_count)`` tuple, where ``run_count`` is the
        number of whitespace runs of length two or more found in the input.
    """
    long_runs = len(re.findall(r'\s{2,}', text))
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip(), long_runs

# Basic normalization: force every entry to str, lower-case it, then
# transliterate it with unidecode.
# NOTE(review): astype(str) presumably turns missing values into a literal
# string such as "nan" - confirm whether 'content' can be empty.
normalized = cleaned_data['content'].astype(str)
normalized = normalized.apply(str.lower)
normalized = normalized.apply(lambda doc: unidecode(doc, errors="preserve"))  # Remove diacritics
cleaned_data['cleaned_content'] = normalized

# Each helper returns (text, count) per document; zip(*...) separates the
# cleaned texts from the per-document counts.
url_results = cleaned_data['cleaned_content'].apply(remove_urls)
cleaned_data['cleaned_content'], url_count = zip(*url_results)

email_results = cleaned_data['cleaned_content'].apply(remove_emails)
cleaned_data['cleaned_content'], email_count = zip(*email_results)

whitespace_results = cleaned_data['cleaned_content'].apply(remove_extra_whitespace)
cleaned_data['cleaned_content'], extra_space_count = zip(*whitespace_results)

print("URLs removed:", sum(url_count))
print("Mail addresses removed:", sum(email_count))
print("Extra whitespaces removed:", sum(extra_space_count))

In [None]:
def expand_contractions(text):
    """Expand contractions in ``text`` using the ``contractions`` package.

    The text is split on whitespace and each piece is passed through
    ``contractions.fix``; the pieces are then re-joined with single spaces
    and the joined string is passed through ``contractions.fix`` once more.

    Returns:
        The expanded text as a single string.
    """
    word_by_word = ' '.join(contractions.fix(piece) for piece in text.split())
    return contractions.fix(word_by_word)


# Expand contractions in every document of the cleaned column
cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(expand_contractions)

In [None]:
from textblob import TextBlob
# Pass every document through TextBlob(...).correct() and store the result as a plain str.
# NOTE(review): this is presumably spelling correction; on long reports it may be
# slow and might rewrite company names or domain terms - confirm on a sample.
cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(lambda x: str(TextBlob(x).correct()))

In [None]:
# Show the original content of the row labelled 0, for comparison with the cleaned text
raw_data['content'][0]

In [None]:
# Show the cleaned content of the row labelled 0
cleaned_data['cleaned_content'][0]

In [None]:
# Expand contractions
# Basic idea from: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# The regular expressions are compiled once, up front, for efficiency.
#
# Whole-word patterns (specific_patterns, extended_patterns) are anchored with
# \b so that e.g. "let's" is not matched inside "outlet's".
specific_patterns = [
    (re.compile(r"\bwon['’]t\b"), "will not"),
    (re.compile(r"\bcan['’]t\b"), "can not"),
]

# Suffix patterns: they require a word character before the apostrophe and a
# word boundary after the suffix, so an opening quote such as in "'social'"
# is left alone.
# NOTE: "'s" is always expanded to " is", which also rewrites possessives
# ("company's" -> "company is"); telling the two apart needs context.
general_patterns = [
    (re.compile(r"n['’]t\b"), " not"),
    (re.compile(r"(?<=\w)['’]re\b"), " are"),
    (re.compile(r"(?<=\w)['’]s\b"), " is"),
    (re.compile(r"(?<=\w)['’]d\b"), " would"),
    (re.compile(r"(?<=\w)['’]ll\b"), " will"),
    (re.compile(r"(?<=\w)['’]t\b"), " not"),
    (re.compile(r"(?<=\w)['’]ve\b"), " have"),
    (re.compile(r"(?<=\w)['’]m\b"), " am"),
]

extended_patterns = [
    (re.compile(r"\bain['’]t\b"), "am not"), # or "is not", "are not", "has not", "have not", depending on context
    (re.compile(r"\bshan['’]t\b"), "shall not"),
    (re.compile(r"\by['’]all\b"), "you all"),
    (re.compile(r"\bo['’]clock\b"), "of the clock"),
    (re.compile(r"\bma['’]am\b"), "madam"),
    (re.compile(r"\blet['’]s\b"), "let us"),
    (re.compile(r"\bhow['’]d\b"), "how did"),
    (re.compile(r"\bhow['’]ll\b"), "how will"),
    (re.compile(r"\bwhat['’]re\b"), "what are"),
    (re.compile(r"\bwhat['’]ve\b"), "what have"),
    (re.compile(r"\bwhen['’]s\b"), "when is"),
    (re.compile(r"\bwhere['’]d\b"), "where did"),
    (re.compile(r"\bwhere['’]s\b"), "where is"),
    (re.compile(r"\bwhy['’]s\b"), "why is"),
    (re.compile(r"\bwhy['’]d\b"), "why did"),
    (re.compile(r"\bwho['’]s\b"), "who is"),
    (re.compile(r"\bwho['’]ll\b"), "who will"),
    (re.compile(r"\bwho['’]ve\b"), "who have"),
    (re.compile(r"\bthat['’]s\b"), "that is"),
    (re.compile(r"\bthat['’]ll\b"), "that will"),
    (re.compile(r"\bthere['’]s\b"), "there is"),
    (re.compile(r"\bthere['’]re\b"), "there are"),
    (re.compile(r"\bthere['’]d\b"), "there would"),
    (re.compile(r"\bthere['’]ll\b"), "there will"),
]

def decontracted(phrase):
    """Expand English contractions in ``phrase`` with the pattern tables above.

    The tables are applied from most to least specific: ``specific_patterns``,
    then ``extended_patterns``, then ``general_patterns``. The whole-word
    entries must run before the generic suffix rules; otherwise a rule such
    as "'s" -> " is" rewrites "let's" to "let is" before the "let's" entry
    ever gets a chance to match.

    The patterns are lower-case only, so the input is expected to be
    lower-cased already.

    Args:
        phrase: Text to expand.

    Returns:
        A ``(expanded_phrase, count)`` tuple, where ``count`` is the total
        number of replacements made.
    """
    count = 0

    for patterns in (specific_patterns, extended_patterns, general_patterns):
        for pattern, replacement in patterns:
            # subn substitutes and reports the number of replacements in one pass.
            phrase, matches = pattern.subn(replacement, phrase)
            count += matches

    return phrase, count

# Apply decontracted to every document. Each call returns (text, count), so
# zip(*...) separates the expanded texts from the per-document counts.
cleaned_data['cleaned_content'], abbreviation_counts = zip(*cleaned_data['cleaned_content'].apply(decontracted))
print("Expanded abbreviations:", sum(abbreviation_counts))

In [None]:
# Replace special characters (excluding basic punctuation) with a space
def remove_non_alphanumeric(text):
    """Blank out every character that is not on the allow-list.

    Kept characters are ASCII letters, digits, whitespace, the punctuation
    marks ``. , ! ?`` and single and double quotes. Every other character is
    replaced by one space.

    Returns:
        A ``(cleaned_text, replaced_count)`` tuple.
    """
    disallowed = r'[^a-zA-Z0-9\s.,!?\'"]'
    cleaned, replaced = re.subn(disallowed, ' ', text)
    return cleaned, replaced

# Apply to every document; zip(*...) separates the cleaned texts from the per-document counts
cleaned_data['cleaned_content'], special_char_count = zip(*cleaned_data['cleaned_content'].apply(remove_non_alphanumeric))
print("Special characters removed:", sum(special_char_count))

In [None]:
def tokenize(text):
    """Tokenize ``text`` at word level and at sentence level with NLTK.

    Returns:
        A dict with two keys: ``word_tokens`` (result of ``word_tokenize``)
        and ``sentence_tokens`` (result of ``sent_tokenize``).
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    return dict(word_tokens=words, sentence_tokens=sentences)

# Store the word/sentence token dict of every cleaned document in a new column
cleaned_data['tokenized_content'] = cleaned_data['cleaned_content'].apply(tokenize)

In [None]:
# Filled on first use and shared by every later call, so the stopword set and
# the lemmatizer are built once instead of once per document.
_STOPWORD_RESOURCES = {}


def _stopword_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _STOPWORD_RESOURCES:
        _STOPWORD_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _STOPWORD_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _STOPWORD_RESOURCES['stop_words'], _STOPWORD_RESOURCES['lemmatizer']


def remove_stopwords_and_lemmatize(tokenized_content):
    """Drop English stopwords from the word tokens and lemmatize the rest.

    Args:
        tokenized_content: Dict with the keys ``word_tokens`` and
            ``sentence_tokens``. Only the word tokens are processed; the
            sentence tokens are passed through unchanged.

    Returns:
        A ``(result, removed_count)`` tuple. ``result`` is a dict with the
        same two keys, and ``removed_count`` is the number of word tokens
        that were dropped as stopwords.
    """
    stop_words, lemmatizer = _stopword_resources()

    word_tokens = tokenized_content["word_tokens"]
    # Stopword membership is checked case-insensitively.
    filtered_words = [word for word in word_tokens if word.lower() not in stop_words]
    # lemmatize() is called without a part-of-speech argument, so NLTK's
    # default part of speech applies to every token.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    result = {"word_tokens": lemmatized_words, "sentence_tokens": tokenized_content["sentence_tokens"]}
    return result, len(word_tokens) - len(filtered_words)

# Apply to every tokenized document; zip(*...) separates the result dicts from the per-document counts
cleaned_data['cleaned_tokenized_content'], stopwords_count = zip(*cleaned_data['tokenized_content'].apply(remove_stopwords_and_lemmatize))

print("Stopwords removed:", sum(stopwords_count))


In [None]:
# Print the cleaned token dict of the row labelled 1 as a spot check
print(cleaned_data['cleaned_tokenized_content'][1])

## Data Preprocessing

Preliminaries
- Sentence segmentation
- Word tokenization
- Normalization

Frequent preprocessing
- Stopword removal
- Stemming and/or lemmatization
- Digits/Punctuations removal
- Case normalization

Task-specific preprocessing
- Unicode normalization
- Language detection
- Code mixing