## Stage 1: Preprocessing & Cleaning

### Setup & Data Loading

In [1]:
# Imports
import pandas as pd
import re
import os
import string
import nltk
import spacy
import requests
import contractions
from langdetect import detect 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from unidecode import unidecode
from bs4 import BeautifulSoup
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

[nltk_data] Downloading package wordnet to /Users/tim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/tim/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tim/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /Users/tim/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# Download a spacy model, can also be adjusted (medium = en_core_web_md, large = en_core_web_lg)
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
# Load the data
raw_data = pd.read_csv('../data/esg_documents_for_dax_companies.csv', delimiter = '|', index_col = 0)

In [4]:
# Check loaded data and reset index
raw_data = raw_data.reset_index(drop=True)
raw_data.head(10)

Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title,url
0,Beiersdorf AG,Sustainability Highlight Report CARE BEYOND SK...,sustainability_report,2021-03-31,,"['CleanWater', 'GHGEmission', 'ProductLiabilit...",1,BEI,BeiersdorfAG Sustainability Report 2021,
1,Deutsche Telekom AG,Corporate Responsibility Report 2021 2 Content...,sustainability_report,2021-03-31,,"['DataSecurity', 'Iso50001', 'GlobalWarming', ...",1,DTE,DeutscheTelekomAG Sustainability Report 2021,
2,Vonovia SE,VONOVIA SE SUSTAINABILITY REPORT 2021 =For a S...,sustainability_report,2021-03-31,,"['Whistleblowing', 'DataSecurity', 'Vaccine', ...",1,VNA,VonoviaSE Sustainability Report 2021,
3,Merck KGaA,Sustainability Report 2021 TABLE OF CONTENTS S...,sustainability_report,2021-03-31,,"['DataSecurity', 'DataMisuse', 'DrugResistance...",1,MRK,MerckKGaA Sustainability Report 2021,
4,MTU,Our ideas and concepts FOR A SUSTAINABLE FUTUR...,sustainability_report,2020-03-31,,"['WorkLifeBalance', 'Corruption', 'AirQuality'...",1,MTX,MTUAeroEngines Sustainability Report 2020,
5,E ONSE,#StandWithUkraine Sustainability Report 2021 C...,sustainability_report,2021-03-31,,"['DataSecurity', 'Iso50001', 'GlobalWarming', ...",1,EOAN,E.ONSE Sustainability Report 2021,
6,RWE AG,Focus on tomorrow. Sustainability Report 2021 ...,sustainability_report,2021-03-31,,"['WorkLifeBalance', 'Corruption', 'Iso50001', ...",1,RWE,RWEAG Sustainability Report 2021,
7,Heidelberg Cement AG,Annual Report 2021 HeidelbergCement at a glanc...,annual_report,2021-03-31,,"['WorkLifeBalance', 'Vaccine', 'DataSecurity',...",1,HEI,HeidelbergCementAG Annual Report 2021,
8,Heidelberg Cement AG,Company Strategy & Business & Product & Produc...,sustainability_report,2020-03-31,,"['CleanWater', 'Corruption', 'Whistleblowing',...",1,HEI,HeidelbergCementAG Sustainability Report 2020,
9,Siemens AG,Sustainability 1 Siemens 2 Our 3 Governance – ...,sustainability_report,2020-03-31,,"['DataSecurity', 'Iso50001', 'EmployeeTurnover...",1,SIE,SiemensAG Sustainability Report 2020,


**Column descriptions**
- symbol: stock symbol of the company
- company: company name
- date: publication date of document
- title: document title
- content: document content
- datatype: document type
- internal: is this a report by company (1) or a third-party document (0)
- domain (optional): Web domain where the document was published
- url (optional): URL where the document can be accessed
- esg_topics (optional): ESG topics extracted from the data using our internal NLP

In [5]:
# Check shape (row and column amount)
raw_data.shape

(11188, 10)

In [6]:
# Check datatypes
raw_data.dtypes

company       object
content       object
datatype      object
date          object
domain        object
esg_topics    object
internal       int64
symbol        object
title         object
url           object
dtype: object

The data is loaded correctly.

In [7]:
# Small checkpoint function to save intermediary processing steps and enhance development
def csv_checkpoint(df, filename='checkpoint'):

    if not os.path.exists('../data/checkpoints/'):
        os.makedirs('../data/checkpoints/')

    # Save DataFrame to CSV
    df.to_csv(f'../data/checkpoints/{filename}.csv', index=False, sep = '|')
    print(f'Saved DataFrame to {filename}.csv')
    
    # Load CSV back into DataFrame
    df = pd.read_csv(f'../data/checkpoints/{filename}.csv', delimiter = '|')
    print(f'Loaded DataFrame from {filename}.csv')
    
    return df

## General Data Cleaning

As initial data cleaning steps, the following is conducted:
- Rows with missing "content" were dropped to prevent any missing data-related issues. Missing data can create gaps in the data and lead to errors or distortions in the analysis.
- The "URL" column was removed as the relevant information was available in the "domain" column. Removing redundant columns simplifies the data set and makes it easier to work with
- Duplicate entries were identified and removed, resulting in a cleaner and more concise dataset. Duplicates can distort the data and lead to biased analysis. 
- Language checking was conducted and all rows with non-English content were dropped to ensure consistent language. Language inconsistencies can create bias in the data and lead to inaccurate conclusions. Therefore, it is important to ensure that the data is consistent in language to prevent linguistic biases.
- "Date" is formatted as a date and wrong dates, e.g. "bayer-03-31" are replaced with a default date (2023-03-31).
- Remove company name parts like "AG" for clarity
- The "sample" method was used to check the data for representativeness and potential issues.

In [8]:
general_cleaned_data = raw_data.copy(deep=True)

In [9]:
# Delete all rows with no content, e.g. no report
general_cleaned_data = general_cleaned_data.dropna(subset=['content'])

In [10]:
# Drop the "url" column, since the most relevant information from an analysis perspective is already in the "domain" column (e.g. the source of the report)
general_cleaned_data = general_cleaned_data.drop(columns=['url'])

In [11]:
# Check for duplicates and delete them
duplicates = general_cleaned_data[general_cleaned_data.duplicated()]
print(f'Duplicated rows: {len(duplicates)}')
general_cleaned_data = general_cleaned_data.drop_duplicates()

Duplicated rows: 6


In [12]:
# Check for other languange than English
general_cleaned_data['language'] = general_cleaned_data['content'].apply(lambda x: detect(x))
not_english = len(general_cleaned_data) - len(general_cleaned_data.loc[general_cleaned_data['language'] == 'en'])

# Drop rows with other languange, since other languanges influences to quality of the later analysis
general_cleaned_data = general_cleaned_data.loc[general_cleaned_data['language'] == 'en']

print(f'Deleted amount of rows with language other ehan English: {not_english}')
general_cleaned_data.drop(['language'], axis=1, inplace=True)

Deleted amount of rows with language other ehan English: 107


In [13]:
# Correct the dates to ISO standard
def find_incorrect_dates(data):
    incorrect_dates = []

    for index, row in data.iterrows():
        try:
            pd.to_datetime(row['date'], format='%Y-%m-%d', errors='raise')
        except ValueError:
            incorrect_dates.append((index, row['date']))

    return incorrect_dates

incorrect_date_rows = find_incorrect_dates(general_cleaned_data)
print("Incorrectly formatted dates:")
for index, date in incorrect_date_rows:
    print(f"Row {index}: {date}")

Incorrectly formatted dates:
Row 13: p.DE-03-31
Row 18: p.DE-03-31
Row 20: bayer-03-31
Row 22: p.DE-03-31
Row 25: p.DE-03-31
Row 26: p.DE-03-31
Row 31: p.DE-03-31
Row 32: p.DE-03-31
Row 33: p.DE-03-31
Row 37: p.DE-03-31
Row 41: p.DE-03-31
Row 50: p.DE-03-31
Row 78: p.DE-03-31
Row 80: p.DE-03-31
Row 86: p.DE-03-31
Row 87: p.DE-03-31
Row 88: p.DE-03-31


In [14]:
# Correct the wrong formatted dates and set default date
def correct_date_format(data):
    data['date'] = pd.to_datetime(data['date'], errors='coerce').fillna('2022-03-31')
    return data

general_cleaned_data = correct_date_format(general_cleaned_data)

In [15]:
# Replace company name parts like "AG" to have a cleaner name
general_cleaned_data['company'] = general_cleaned_data['company'].str.replace(' AG', '')
general_cleaned_data['company'] = general_cleaned_data['company'].str.replace(' SE', '')
general_cleaned_data['company'] = general_cleaned_data['company'].str.replace(' KGaA', '')

In [16]:
# Drop rows with no content, e.g. no report
general_cleaned_data = general_cleaned_data.dropna(subset=['content'])

In [17]:
# Check the date with some samples
general_cleaned_data.sample(5)

Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title
2837,Beiersdorf,A group of 36 major personal care and cosmetic...,esg,2022-02-23,esgtoday,['Transparency'],0,BEI,Consumer Products Giants Partner to Launch Imp...
230,Adidas,Experience this story and others in the new is...,general,2022-03-25,highsnobiety,[],0,ADS,The Shade Is Real: Gear Testing the Best Cycli...
2457,Bayer,"CHICAGO, Nov. 29, 2021 ( GLOBE NEWSWIRE) -- Mo...",business,2021-11-29,marketscreener,['HumanCapital'],0,BAYN,Mondelēz International Appoints Ertharin Cousi...
6024,Infineon Technologies,Instilling a sense of confidence and trust amo...,business,2021-05-16,thefintechtimes,"['Fraud', 'DataSecurity', 'UnbankedPopulation'...",0,IFX,Financial Inclusion Is Nothing Without Securit...
5053,Deutsche Telekom,The information you requested is not available...,business,2022-01-27,bnnbloomberg,['Privacy'],0,DTE,Deutsche Telekom Weighs Tower Tie-Up With Voda...


In [18]:
# Change name of "Muenchener Rueckversicherungs Gesellschaft AGin Muenchen" to something more readable
general_cleaned_data['company'] = general_cleaned_data['company'].replace('Muenchener Rueckversicherungs Gesellschaftin Muenchen', 'Munich RE')

In [19]:
# Create checkpoint file
general_cleaned_data = csv_checkpoint(general_cleaned_data, 'general_cleaned_data')

Saved DataFrame to general_cleaned_data.csv
Loaded DataFrame from general_cleaned_data.csv


## Text Data Cleaning & Preprocessing

The "content" column, containing the text of the reports, undergoes a series of cleaning, normalization, and preprocessing steps to ensure accurate and efficient analysis. These steps include:

- **String conversion**: Converting the input to a string format ensures consistency and compatibility during subsequent processing tasks.
- **Lowercase conversion**: Transforming all text to lowercase serves as a simple normalization step, reducing the complexity and variability of the input data.
- **Unicode decoding**: Removing diacritics (e.g., accented characters) and normalizing the text encoding mitigates potential discrepancies arising from different encoding formats.
- **URL and email address removal**: Eliminating URLs and email addresses reduces noise in the dataset, as these elements do not contribute valuable information for the analysis.
- **Extra whitespace removal**: Eradicating extra whitespaces improves text analysis and tokenization by ensuring that only meaningful spaces are retained.
- **Contact detail removal**: Excluding phone numbers, contact person strings, and social media references further minimizes noise in the dataset, honing the focus on relevant text.
- **Table of contents removal**: Discarding the table of contents enhances the data quality by eliminating repetitive and non-essential information.
- **Named entity removal**: Employing the spaCy model to remove human names and other named entities optimizes the text for analysis and modeling by concentrating on pertinent content.
- **Abbreviation expansion**: Utilizing the contractions library and custom functions with regular expressions, common and uncommon abbreviations are expanded to improve text interpretation.
- **Special character elimination**: Excluding all special characters, except punctuation, refines the input data. Retaining punctuation is necessary for accurate sentence tokenization and removed after sentence tokenization..
- **Tokenization and lemmatization**: Tokenizing words and sentences, and subsequently lemmatizing words using the WordNetLemmatizer from nltk, streamlines the text and reduces morphological variations.
- **Stopword removal**: Customizing the nltk stopwords list by adding or removing specific stopwords enables more precise and tailored text analysis.
- **Part-of-speech (POS) tagging**: Assigning POS tags to words and sentences enhances the text representation by providing additional linguistic information, which may be beneficial for subsequent analysis and modeling tasks.  

Spellchecking was tested with TextBlob and PySpellChecker but deliverd not useful results

In [20]:
cleaned_data = general_cleaned_data.copy(deep=True)

In [21]:
# Since the spacy model shows better results on the "raw" text, the named entity removal is conducted before all normalization and cleaning steps
spacy_model = spacy.load('en_core_web_md')
spacy_model.max_length = 1800000 # Increase max text length

def remove_named_entities(text):
    doc = spacy_model(text)
    
    named_entities = set()
    for ent in doc.ents:
        if ent.label_ in ["PERSON"]:
            named_entities.add(ent.text)
    
    named_entities_count = len(named_entities)
    
    for named_entity in named_entities:
        text = text.replace(named_entity, '')
    
    return text, named_entities_count

# Assuming cleaned_data is a pandas DataFrame with a 'content' column
cleaned_data['cleaned_content'], name_entity_count = zip(*cleaned_data['content'].apply(remove_named_entities))
print("Name entities removed:", sum(name_entity_count))

Name entities removed: 94694


In [22]:
def remove_urls(text):
    urls = re.findall(r'http\S+|www\S+|https\S+', text, flags=re.MULTILINE)
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE), len(urls)

def remove_emails(text):
    mail_addresses = re.findall(r'\S+@\S+\s?', text, flags=re.MULTILINE)
    return re.sub(r'\S+@\S+\s?', '', text, flags=re.MULTILINE), len(mail_addresses)

def remove_extra_whitespace(text):
    extra_spaces = re.findall(r'\s{2,}', text)
    return re.sub(r'\s+', ' ', text).strip(), len(extra_spaces)

cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].astype(str) # Convert all texts to string
cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(lambda x: x.lower()) # Convert all texts to lower-case
cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(lambda x: unidecode(x, errors="preserve")) # Remove diacritics / accented characters and unicode normalization
cleaned_data['cleaned_content'], url_count = zip(*cleaned_data['cleaned_content'].apply(remove_urls)) # Remove URLs from texts
cleaned_data['cleaned_content'], email_count = zip(*cleaned_data['cleaned_content'].apply(remove_emails)) # Remove e-mail addresses from texts
cleaned_data['cleaned_content'], extra_space_count = zip(*cleaned_data['cleaned_content'].apply(remove_extra_whitespace)) # Remove extra whitespaces from texts

print("URLs removed:", sum(url_count))
print("Mail addresses removed:", sum(email_count))
print("Extra whitespaces removed:", sum(extra_space_count))

URLs removed: 7487
Mail addresses removed: 434
Extra whitespaces removed: 148978


In [23]:
def remove_contact_details(text):
    # Remove phone numbers
    phone_regex = r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]'
    phone_count = len(re.findall(phone_regex, text))
    text = re.sub(phone_regex, '', text)

    # Remove common contact-related phrases
    contact_phrases_regex = r'\b(?:Contact Person|Phone|Tel|Fax|Mobile|E?mail|Skype|Twitter|Facebook|LinkedIn|Website):\b'
    contact_phrases_count = len(re.findall(contact_phrases_regex, text, flags=re.IGNORECASE))
    text = re.sub(contact_phrases_regex, '', text, flags=re.IGNORECASE)

    total_count = phone_count + contact_phrases_count
    return text, total_count

def remove_table_of_contents(text):
    # Remove common table of contents phrases
    toc_phrases_regex = r'\b(?:Table of Contents|Contents)\b'
    toc_phrases_count = len(re.findall(toc_phrases_regex, text, flags=re.IGNORECASE))
    text = re.sub(toc_phrases_regex, '', text, flags=re.IGNORECASE)

    # Remove content with numbering like "1. Introduction", "1.1. Background", "A. Overview", etc.
    toc_entries_regex = r'(^|\n)\s*\w+(\.\w+)*\s+\w+([\w\s]+)?'
    toc_entries_count = len(re.findall(toc_entries_regex, text))
    text = re.sub(toc_entries_regex, '', text)

    total_count = toc_phrases_count + toc_entries_count
    return text, total_count

cleaned_data['cleaned_content'], contact_count = zip(*cleaned_data['cleaned_content'].apply(remove_contact_details))
cleaned_data['cleaned_content'], toc_count = zip(*cleaned_data['cleaned_content'].apply(remove_table_of_contents))
print("Contact information removed:", sum(contact_count))
print("TOCs removed:", sum(toc_count))

Contact information removed: 44160
TOCs removed: 9603


In [24]:
def expand_contractions(text):
    expanded_text = []
    for word in text.split():
        expanded_text.append(contractions.fix(word))
    expanded_text = ' '.join(expanded_text)
    return contractions.fix(expanded_text)

cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(expand_contractions)

In [25]:
# Expand custom abbreviations which are not captured by "contractions"
# Basic idea from: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Compile the regular expressions only once for efficiency
specific_patterns = [
    (re.compile(r"won['’]t"), "will not"),
    (re.compile(r"can['’]t"), "can not"),
]

def decontracted(phrase):
    count = 0

    # Replace specific patterns
    for pattern, replacement in specific_patterns:
        matches = len(pattern.findall(phrase))
        count += matches
        phrase = pattern.sub(replacement, phrase)

    return phrase, count

# Apply the function to expand abbreviations
cleaned_data['cleaned_content'], abbreviation_counts = zip(*cleaned_data['cleaned_content'].apply(decontracted))
print("Expanded custom abbreviations:", sum(abbreviation_counts))

Expanded custom abbreviations: 0


In [26]:
# Remove special characters excl. punctuation since this is needed by the sentence tokenization
def remove_non_alphanumeric(text):
    special_chars = re.findall(r'[^a-zA-Z0-9\s.,!?\'"]', text)
    return re.sub(r'[^a-zA-Z0-9\s.,!?\'"]', ' ', text), len(special_chars)

cleaned_data['cleaned_content'], special_char_count = zip(*cleaned_data['cleaned_content'].apply(remove_non_alphanumeric))
print("Special characters excl. punctuation removed:", sum(special_char_count))

Special characters excl. punctuation removed: 1387193


In [27]:
def tokenize_words(text):
    # Remove numbers, digits, and punctuation
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize words
    tokens = word_tokenize(text)

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens, len(tokens)

cleaned_data['word_tokens'], word_token_count = zip(*cleaned_data['cleaned_content'].apply(tokenize_words))
print("Generated word token amount:", sum(word_token_count))

Generated word token amount: 17274365


In [28]:
def tokenize_sentences(text):
    # Tokenize sentences
    tokens = sent_tokenize(text)
    
    return tokens, len(tokens)

cleaned_data['sentence_tokens'], sentence_token_count = zip(*cleaned_data['cleaned_content'].apply(tokenize_sentences))
print("Generated sentence token amount:", sum(sentence_token_count))

Generated sentence token amount: 712214


In [29]:
def remove_stopwords_from_word_tokens(tokens, custom_stopwords):

    # Remove stopwords from tokens
    filtered_tokens = [token for token in tokens if token.lower() not in custom_stopwords]
    
    return filtered_tokens, len(tokens) - len(filtered_tokens)

def remove_stopwords_from_sentence_tokens(sentences_list, custom_stopwords):
    filtered_sentences_list = []
    total_removed_items_count = 0

    for sentence in sentences_list:
        # Tokenize the sentence into words
        word_tokens = word_tokenize(sentence)

        # Remove stopwords, digits, numbers, dates, and punctuation from word tokens
        filtered_word_tokens = [
            token for token in word_tokens
            if token.lower() not in custom_stopwords
            and not re.search(r'\d', token)  # Remove tokens containing digits
            ]

        # Remove remaining special characters from sentences, i.e. punctuation
        filtered_word_tokens = [
            re.sub(rf"[{re.escape(string.punctuation)}]", '', token) for token in filtered_word_tokens
            ]

        # Reconstruct the sentence without the removed words and special characters
        filtered_sentence = ' '.join(filtered_word_tokens)
        removed_items_count = len(word_tokens) - len(filtered_word_tokens)

        filtered_sentences_list.append(filtered_sentence)
        total_removed_items_count += removed_items_count

    return filtered_sentences_list, total_removed_items_count

# Define custom stopwords to add or remove
custom_stopwords = {
    'add': ['said','company','companies','year','billion','million','siemens','linde','rwe','volkswagen','symrise','porsche','sap','adidas','puma','airbus','bmw','hannover','mtu','heiderbergcement','qiagen','benz','continental','bayer','fresenius'],
    'remove': [''] # Currently not needed anymore
}

# Combine stopwords to filter the content of the reports
all_stopwords = set(stopwords.words('english'))
all_stopwords |= set(custom_stopwords['add'])
all_stopwords -= set(custom_stopwords['remove'])

cleaned_data['word_tokens'], stopword_count_words = zip(*cleaned_data['word_tokens'].apply(remove_stopwords_from_word_tokens, custom_stopwords=all_stopwords))
cleaned_data['sentence_tokens'], stopword_count_sentences = zip(*cleaned_data['sentence_tokens'].apply(remove_stopwords_from_sentence_tokens, custom_stopwords=all_stopwords))

print("Removed stopwords in word tokens", sum(stopword_count_words))
print("Removed stopwords in sentence tokens", sum(stopword_count_sentences))

Removed stopwords in word tokens 6687477
Removed stopwords in sentence tokens 7701752


In [30]:
def pos_tagging_tokens(word_tokens, sentence_list):
    # POS tagging for word tokens
    pos_tagged_word_tokens = nltk.pos_tag(word_tokens)

    # Create a dictionary to map word tokens to their POS tags, this reduces the effort to call nltk.pos_tag twice
    pos_tags_dict = dict(pos_tagged_word_tokens)

    # POS tagging for sentence tokens
    pos_tagged_sentence_list = []
    for sentence in sentence_list:
        tokenized_sentence = nltk.word_tokenize(sentence)
        pos_tagged_sentence = [(token, pos_tags_dict[token]) for token in tokenized_sentence if token in pos_tags_dict]
        pos_tagged_sentence_list.append(pos_tagged_sentence)

    return pos_tagged_word_tokens, pos_tagged_sentence_list

# Apply POS tagging
pos_tags = cleaned_data.apply(lambda row: pos_tagging_tokens(row['word_tokens'], row['sentence_tokens']), axis=1)
cleaned_data['pos_tagged_word_tokens'], cleaned_data['pos_tagged_sentence_tokens'] = zip(*pos_tags)

In [31]:
cleaned_data.sample(5)

Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title,cleaned_content,word_tokens,sentence_tokens,pos_tagged_word_tokens,pos_tagged_sentence_tokens
3004,Continental,"CHICAGO, Aug. 25, 2022 /PRNewswire/ -- The glo...",general,2022-08-25,prnewswire.co,"['EMobility', 'ValueChain']",0,CON,Automotive Valves Market worth $ 28.2 billion ...,"chicago, august 25, 2022 prnewswire the g...","[chicago, august, prnewswire, global, automoti...",[chicago august prnewswire global automotive...,"[(chicago, RB), (august, JJ), (prnewswire, NN)...","[[(chicago, RB), (august, JJ), (prnewswire, NN..."
6173,Infineon Technologies,The Czech koruna rises as the country's centra...,business,2022-02-03,marketscreener,"['RenewableEnergy', 'RussianFederation']",0,IFX,EUROPEAN MIDDAY BRIEFING - Stocks Lower Ahead ...,'s central bank is expected to raise interest ...,"[central, bank, expected, raise, interest, rat...",[s central bank expected raise interest rates ...,"[(central, JJ), (bank, NN), (expected, VBD), (...","[[(central, JJ), (bank, NN), (expected, VBN), ..."
89,Beiersdorf,CARE BEYOND SKIN03 Foreword05 Interview07 | Ou...,sustainability_report,2020-03-31,,"['RoundtableOnSustainablePalmOil', 'CleanWater...",1,BEI,BeiersdorfAG Sustainability Report 2020,our commitments08 overview of the consumer b...,"[commitments08, overview, consumer, business, ...",[overview consumer business sustainability age...,"[(commitments08, NN), (overview, JJ), (consume...","[[(overview, VBZ), (consumer, NN), (business, ..."
2473,Bayer,Request Here Sample Report Buy This Complete B...,general,2021-10-22,ecochunk,['ValueChain'],0,BAYN,Fungal Otitis Externa Market Will Watch a Sens...,. the research covers a valuable source of per...,"[research, cover, valuable, source, perceptive...","[, research covers valuable source perceptive ...","[(research, NN), (cover, NN), (valuable, JJ), ...","[[], [(research, NN), (valuable, JJ), (source,..."
7611,Qiagen,BackgroundWe aimed to examine the immunogenici...,thinktank,2021-12-03,thelancet,"['Social', 'GenderDiversity', 'PatientSafety',...",0,QIA,Immunogenicity and safety of two doses of the ...,cov 2 vaccine coronavac sinovac life scienc...,"[cov, vaccine, coronavac, sinovac, life, scien...",[cov vaccine coronavac sinovac life sciences ...,"[(cov, NN), (vaccine, NN), (coronavac, NN), (s...","[[(cov, VBP), (vaccine, JJ), (coronavac, NN), ..."


In [32]:
# Create checkpoint file
cleaned_data = csv_checkpoint(cleaned_data, 'cleaned_data')

Saved DataFrame to cleaned_data.csv
Loaded DataFrame from cleaned_data.csv


## Data Enrichment

Several additional information could be helpful in the further analysis, which are not included in the dataset. Therefore a small scraper is used to enrich the the dataset with the sector, industry and market capitalization of the DAX companies.

In [33]:
url = 'https://disfold.com/stock-index/dax/companies/'
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table')
scraped_data = []
for row in table.find_all('tr'):
    cols = row.find_all('td')
    cols = [col.text.strip() for col in cols]
    scraped_data.append(cols)

def clean_scraped_data(data):
    cleaned_data = []
    
    for row in data:
        # Remove empty rows
        if len(row) > 0:
            # Remove the '$' and ',' signs from the market cap and convert it to float
            market_cap = float(row[3].replace('$', '').replace(',', '').replace('B', ''))
            cleaned_data.append([row[1], row[2], market_cap, row[4], row[5], row[6]])
    
    df = pd.DataFrame(cleaned_data, columns=['company_name', 'symbol', 'market_cap_in_usd_b', 'country', 'sector', 'industry'])
    
    return df

company_enrichments = clean_scraped_data(scraped_data)
company_enrichments.to_csv('../data/dax_company_sectors.csv', index=False)
company_enrichments.head()

Unnamed: 0,company_name,symbol,market_cap_in_usd_b,country,sector,industry
0,Linde plc,LIN,156.93,United Kingdom,Basic Materials,Specialty Chemicals
1,SAP SE,SAP,121.03,Germany,Technology,Software—Application
2,Siemens AG,SIE,110.13,Germany,Industrials,Specialty Industrial Machinery
3,Deutsche Telekom AG,DTE,101.78,Germany,Communication Services,Telecom Services
4,Airbus SE,AIR,96.87,Netherlands,Industrials,Aerospace & Defense


In [34]:
# Fix the ticker symbols to prevent NaN and ensure correct join conditions
company_enrichments['symbol'] = company_enrichments['symbol'].replace('SRT3', 'SRT')
company_enrichments['symbol'] = company_enrichments['symbol'].replace('HEN3', 'HNK')
company_enrichments.loc[company_enrichments['company_name'] == 'Mercedes-Benz Group AG', 'symbol'] = 'DAI'

In [35]:
cleaned_data['symbol'] = cleaned_data['symbol'].astype(pd.StringDtype())
company_enrichments['symbol'] = company_enrichments['symbol'].astype(pd.StringDtype())

# Merge the cleaned data with the enrichment
enriched_cleaned_data = pd.merge(cleaned_data, company_enrichments, how='left', on='symbol')

In [36]:
enriched_cleaned_data[enriched_cleaned_data['industry'].isnull()]

Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title,cleaned_content,word_tokens,sentence_tokens,pos_tagged_word_tokens,pos_tagged_sentence_tokens,company_name,market_cap_in_usd_b,country,sector,industry
48,Hannover R,Sustainability Report 2020 We face up to futur...,sustainability_report,2020-03-31,,"['Whistleblowing', 'Vaccine', 'Corruption', 'G...",1,HNR1,HannoverRückversicherungAG Sustainability Repo...,!somewhat different! approach. our purpose a...,"['somewhat', 'different', 'approach', 'purpose...","[' somewhat different ', 'approach ', 'purpose...","[('somewhat', 'RB'), ('different', 'JJ'), ('ap...","[[('somewhat', 'RB'), ('different', 'JJ')], [(...",,,,,
76,Hannover R,Annual Report An overview Gross premium E 01 i...,annual_report,2021-03-31,,"['Vaccine', 'Monopolization', 'Corruption', 'G...",1,HNR1,HannoverRückversicherungAG Annual Report 2021,",762.3 30,000 13,774.2 13,963.4 14,361.8 17,06...","['group', 'net', 'income', 'e', 'eur', 'policy...","[' group net income e eur policyholders ', 'su...","[('group', 'NN'), ('net', 'JJ'), ('income', 'N...","[[('group', 'NN'), ('net', 'JJ'), ('income', '...",,,,,


Hannover R AG cannot be matched, since it is not present in the scraped data. Since there are only 2 records this is negligible and will be fixed manually.

In [37]:
enriched_cleaned_data.loc[enriched_cleaned_data['company'] == 'Hannover R', 'sector'] = 'Financials'
enriched_cleaned_data.loc[enriched_cleaned_data['company'] == 'Hannover R', 'industry'] = 'Insurance—Reinsurance'

In [38]:
# Drop redundant columns/data
enriched_cleaned_data = enriched_cleaned_data.drop(columns=['content', 'company_name', 'country'])

In [39]:
# Check final dataframe
enriched_cleaned_data.sample(10)

Unnamed: 0,company,datatype,date,domain,esg_topics,internal,symbol,title,cleaned_content,word_tokens,sentence_tokens,pos_tagged_word_tokens,pos_tagged_sentence_tokens,market_cap_in_usd_b,sector,industry
1794,BMW,general,2021-09-06,autonews,"['Recycling', 'EMobility']",0,BMW,BMW concept shows'sustainable car of the futur...,bmw's i vision circular concept previews a ...,"['bmws', 'vision', 'circular', 'concept', 'pre...",['s vision circular concept previews sustainab...,"[('bmws', 'NN'), ('vision', 'NN'), ('circular'...","[[('vision', 'NN'), ('circular', 'JJ'), ('conc...",60.24,Consumer Discretionary,Auto Manufacturers
4484,Deutsche Post,business,2022-03-09,marketscreener,['Compliance'],0,DPW,Deutsche Post AG: Announcement pursuant to Art...,dgap news deutsche post ag key word s s...,"['dgap', 'news', 'deutsche', 'post', 'ag', 'ke...",['dgap news deutsche post ag key word share bu...,"[('dgap', 'NN'), ('news', 'NN'), ('deutsche', ...","[[('dgap', 'NN'), ('news', 'NN'), ('deutsche',...",45.4,Industrials,Integrated Freight & Logistics
4420,Deutsche Boerse,,2022-03-30,draeger,"['Diversity', 'StakeholderEngagement', 'Collus...",0,DB1,Dräger Sustainability Report 2021,foreword 3 about the report 6 sustainability...,"['foreword', 'report', 'sustainability', 'stak...",['foreword report sustainability stakeholders ...,"[('foreword', 'JJ'), ('report', 'NN'), ('susta...","[[('foreword', 'NN'), ('report', 'NN'), ('sust...",31.43,Financials,Financial Data & Stock Exchanges
9242,Siemens Energy,business,2022-02-18,marketscreener,['Environment'],0,ENR,Siemens Energy's CEO increases pressure on Sie...,the market environment in the onshore busin...,"['market', 'environment', 'onshore', 'business...",['market environment onshore business remain d...,"[('market', 'NN'), ('environment', 'NN'), ('on...","[[('market', 'NN'), ('environment', 'NN'), ('o...",13.7,Utilities,Utilities—Independent Power Producers
1106,Allianz,business,2022-10-09,bnnbloomberg,"['RussianFederation', 'Petroleum']",0,ALV,Oil Steadies After Weekly Jump on Concern Fed ...,", please check back again soon. a worker drill...","['please', 'check', 'back', 'soon', 'worker', ...","[' please check back soon ', 'worker drills oi...","[('please', 'VB'), ('check', 'VB'), ('back', '...","[[('please', 'VB'), ('check', 'VB'), ('back', ...",88.44,Financials,Insurance—Diversified
4738,Deutsche Telekom,general,2022-11-07,uniglobalunion,['Social'],0,DTE,UNI stands in solidarity with Crnogorski Telek...,"ct , a deutsche telekom subsidiary, in monte...","['ct', 'deutsche', 'telekom', 'subsidiary', 'm...",['ct deutsche telekom subsidiary montenegro ...,"[('ct', 'NN'), ('deutsche', 'NN'), ('telekom',...","[[('ct', 'NN'), ('deutsche', 'NN'), ('telekom'...",101.78,Communication Services,Telecom Services
5587,Heidelberg Cement,business,2021-03-15,worldcement,"['CarbonCaptureAndStorage', 'Recarbonation', '...",0,HEI,HeidelbergCement pioneers new carbon capture t...,. here are the instructions how to enable java...,"['instruction', 'enable', 'javascript', 'web',...","['', 'instructions enable javascript web brows...","[('instruction', 'NN'), ('enable', 'JJ'), ('ja...","[[], [('enable', 'JJ'), ('javascript', 'NN'), ...",11.91,Basic Materials,Building Materials
8563,SAP,general,2021-11-12,businesswire,"['ValueChain', 'EnergyManagement', 'WasteManag...",0,SAP,Global Smart Cities Markets Report 2021: Smart...,dublin business wire the smart cit...,"['dublin', 'business', 'wire', 'smart', 'city'...",['dublin business wire smart cities market sha...,"[('dublin', 'NN'), ('business', 'NN'), ('wire'...","[[('dublin', 'NN'), ('business', 'NN'), ('wire...",121.03,Technology,Software—Application
9762,Siemens Healthineers,general,2022-11-02,itnonline,['Social'],0,SHL,Blue Earth Diagnostics Announces Results on Cl...,18f rhpsma 7.3 pet image showing prostate canc...,"['18f', 'rhpsma', 'pet', 'image', 'showing', '...",['rhpsma pet image showing prostate cancer spr...,"[('18f', 'CD'), ('rhpsma', 'NN'), ('pet', 'JJ'...","[[('rhpsma', 'NN'), ('pet', 'NN'), ('image', '...",55.11,Healthcare,Diagnostics & Research
10055,Siemens Healthineers,general,2022-11-21,itnonline,"['Environment', 'Social', 'Toxicity']",0,SHL,Rice Refines Analysis of MRI Contrast Agents,". in this model, green gadolinium is surrounde...","['model', 'green', 'gadolinium', 'surrounded',...","['', 'model green gadolinium surrounded blue ...","[('model', 'NN'), ('green', 'JJ'), ('gadoliniu...","[[], [('model', 'NN'), ('green', 'JJ'), ('gado...",55.11,Healthcare,Diagnostics & Research


In [41]:
# Due to the processing, some rows (2) have no content anymore. These are dropped.
enriched_cleaned_data[enriched_cleaned_data['cleaned_content'].isna()]
enriched_cleaned_data = enriched_cleaned_data.dropna(subset=['cleaned_content'])

In [42]:
# Create checkpoint file for further analysis
enriched_cleaned_data = csv_checkpoint(enriched_cleaned_data, 'enriched_cleaned_data')

Saved DataFrame to enriched_cleaned_data.csv
Loaded DataFrame from enriched_cleaned_data.csv
