## Stage 1: Preprocessing & Cleaning

### Setup & Data Loading

In [116]:
# Imports
import pandas as pd
import re
import string
import nltk
import requests
import contractions
from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from unidecode import unidecode
from bs4 import BeautifulSoup
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

[nltk_data] Downloading package wordnet to /Users/tim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/tim/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tim/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /Users/tim/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [74]:
# Load the data
raw_data = pd.read_csv('../data/esg_documents_for_dax_companies.csv', delimiter = '|', index_col = 0)

In [75]:
# Check loaded data and reset index
raw_data = raw_data.reset_index(drop=True)
raw_data.head(15)

Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title,url
0,Beiersdorf AG,Sustainability Highlight Report CARE BEYOND SK...,sustainability_report,2021-03-31,,"['CleanWater', 'GHGEmission', 'ProductLiabilit...",1,BEI,BeiersdorfAG Sustainability Report 2021,
1,Deutsche Telekom AG,Corporate Responsibility Report 2021 2 Content...,sustainability_report,2021-03-31,,"['DataSecurity', 'Iso50001', 'GlobalWarming', ...",1,DTE,DeutscheTelekomAG Sustainability Report 2021,
2,Vonovia SE,VONOVIA SE SUSTAINABILITY REPORT 2021 =For a S...,sustainability_report,2021-03-31,,"['Whistleblowing', 'DataSecurity', 'Vaccine', ...",1,VNA,VonoviaSE Sustainability Report 2021,
3,Merck KGaA,Sustainability Report 2021 TABLE OF CONTENTS S...,sustainability_report,2021-03-31,,"['DataSecurity', 'DataMisuse', 'DrugResistance...",1,MRK,MerckKGaA Sustainability Report 2021,
4,MTU,Our ideas and concepts FOR A SUSTAINABLE FUTUR...,sustainability_report,2020-03-31,,"['WorkLifeBalance', 'Corruption', 'AirQuality'...",1,MTX,MTUAeroEngines Sustainability Report 2020,
5,E ONSE,#StandWithUkraine Sustainability Report 2021 C...,sustainability_report,2021-03-31,,"['DataSecurity', 'Iso50001', 'GlobalWarming', ...",1,EOAN,E.ONSE Sustainability Report 2021,
6,RWE AG,Focus on tomorrow. Sustainability Report 2021 ...,sustainability_report,2021-03-31,,"['WorkLifeBalance', 'Corruption', 'Iso50001', ...",1,RWE,RWEAG Sustainability Report 2021,
7,Heidelberg Cement AG,Annual Report 2021 HeidelbergCement at a glanc...,annual_report,2021-03-31,,"['WorkLifeBalance', 'Vaccine', 'DataSecurity',...",1,HEI,HeidelbergCementAG Annual Report 2021,
8,Heidelberg Cement AG,Company Strategy & Business & Product & Produc...,sustainability_report,2020-03-31,,"['CleanWater', 'Corruption', 'Whistleblowing',...",1,HEI,HeidelbergCementAG Sustainability Report 2020,
9,Siemens AG,Sustainability 1 Siemens 2 Our 3 Governance – ...,sustainability_report,2020-03-31,,"['DataSecurity', 'Iso50001', 'EmployeeTurnover...",1,SIE,SiemensAG Sustainability Report 2020,


**Column descriptions**
- symbol: stock symbol of the company
- company: company name
- date: publication date of document
- title: document title
- content: document content
- datatype: document type
- internal: is this a report by company (1) or a third-party document (0)
- domain (optional): Web domain where the document was published
- url (optional): URL where the document can be accessed
- esg_topics (optional): ESG topics extracted from the data using our internal NLP

In [76]:
# Check shape (row and column amount)
raw_data.shape

(11188, 10)

In [77]:
# Check datatypes
raw_data.dtypes

company       object
content       object
datatype      object
date          object
domain        object
esg_topics    object
internal       int64
symbol        object
title         object
url           object
dtype: object

## Data Normalization & Cleaning

First, several steps are conducted to normalize the text. These include lowercase conversion, expanding abbreviations, removing stopwords, applying lemmatization (dimensionality reduction), removing URLs and email addresses and extra whitespaces.

In [78]:
cleaned_data = raw_data.copy(deep=True)

In [79]:
def remove_urls(text):
    urls = re.findall(r'http\S+|www\S+|https\S+', text, flags=re.MULTILINE)
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE), len(urls)

def remove_emails(text):
    mail_addresses = re.findall(r'\S+@\S+\s?', text, flags=re.MULTILINE)
    return re.sub(r'\S+@\S+\s?', '', text, flags=re.MULTILINE), len(mail_addresses)

def remove_extra_whitespace(text):
    extra_spaces = re.findall(r'\s{2,}', text)
    return re.sub(r'\s+', ' ', text).strip(), len(extra_spaces)

cleaned_data['cleaned_content'] = cleaned_data['content'].astype(str) # Convert all texts to string
cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(lambda x: x.lower()) # Convert all texts to lower-case
cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(lambda x: unidecode(x, errors="preserve")) # Remove diacritics / accented characters and unicode normalization
cleaned_data['cleaned_content'], url_count = zip(*cleaned_data['cleaned_content'].apply(remove_urls)) # Remove URLs from texts
cleaned_data['cleaned_content'], email_count = zip(*cleaned_data['cleaned_content'].apply(remove_emails)) # Remove e-mail addresses from texts
cleaned_data['cleaned_content'], extra_space_count = zip(*cleaned_data['cleaned_content'].apply(remove_extra_whitespace)) # Remove extra whitespaces from texts

print("URLs removed:", sum(url_count))
print("Mail addresses removed:", sum(email_count))
print("Extra whitespaces removed:", sum(extra_space_count))

URLs removed: 13737
Mail addresses removed: 523
Extra whitespaces removed: 19457


In [80]:
def remove_contact_details(text):
    # Remove phone numbers
    phone_regex = r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]'
    phone_count = len(re.findall(phone_regex, text))
    text = re.sub(phone_regex, '', text)

    # Remove common contact-related phrases
    contact_phrases_regex = r'\b(?:Contact Person|Phone|Tel|Fax|Mobile|E?mail|Skype|Twitter|Facebook|LinkedIn|Website):\b'
    contact_phrases_count = len(re.findall(contact_phrases_regex, text, flags=re.IGNORECASE))
    text = re.sub(contact_phrases_regex, '', text, flags=re.IGNORECASE)

    total_count = phone_count + contact_phrases_count
    return text, total_count

def remove_table_of_contents(text):
    # Remove common table of contents phrases
    toc_phrases_regex = r'\b(?:Table of Contents|Contents)\b'
    toc_phrases_count = len(re.findall(toc_phrases_regex, text, flags=re.IGNORECASE))
    text = re.sub(toc_phrases_regex, '', text, flags=re.IGNORECASE)

    # Remove content with numbering like "1. Introduction", "1.1. Background", "A. Overview", etc.
    toc_entries_regex = r'(^|\n)\s*\w+(\.\w+)*\s+\w+([\w\s]+)?'
    toc_entries_count = len(re.findall(toc_entries_regex, text))
    text = re.sub(toc_entries_regex, '', text)

    total_count = toc_phrases_count + toc_entries_count
    return text, total_count

cleaned_data['cleaned_content'], contact_count = zip(*cleaned_data['cleaned_content'].apply(remove_contact_details))
cleaned_data['cleaned_content'], toc_count = zip(*cleaned_data['cleaned_content'].apply(remove_table_of_contents))
print("Contact information removed:", sum(contact_count))
print("TOCs removed:", sum(toc_count))

Contact information removed: 44477
TOCs removed: 9878


In [81]:
def expand_contractions(text):
    expanded_text = []
    for word in text.split():
        expanded_text.append(contractions.fix(word))
    expanded_text = ' '.join(expanded_text)
    return contractions.fix(expanded_text)

cleaned_data['cleaned_content'] = cleaned_data['cleaned_content'].apply(expand_contractions)

In [82]:
raw_data['content'][0]

'Sustainability Highlight Report CARE BEYOND SKIN 2021 03 Foreword OUR SUSTAINABILITY COMMITMENT 06 Our Values, our Brands, our Strategy 07 Our Sustainability Agenda CARE BEYOND SKIN 08 Our Partnerships 09 Our Promise toward Consumers MINIMIZING OUR ENVIRONMENTAL FOOTPRINT 11 Our Targets Climate Care: Our Holistic Approach to Climate Protection 12 14 16 17 18 People and Nature in Balance – Innovative NIVEA Products Eucerin: Dermocosmetics Meets Sustainability La Prairie Combines Sustainability and Luxury Climate-neutral Production in Leipzig and Berlin KEY FOR NAVIGATION Jump to the table MAXIMIZING OUR SOCIAL IMPACT 20 Our Targets Our Engagement for Sustainable Palm Oil Cultivation in Indonesia 21 23 24 25 27 The Power of Human Touch – NIVEA’s New Social Mission Eucerin’s Social Mission for Greater Social Participation We Stand Strong for Women and Girls Worldwide Diversity and Inclusion as Key to Success OUTLOOK 30 Continuing our Sustainability Journey in 2022 ANNEX 33 Key Figures at

In [83]:
cleaned_data['cleaned_content'][0]

', our brands, our strategy 07 our sustainability agenda care beyond skin 08 our partnerships 09 our promise toward consumers minimizing our environmental footprint 11 our targets climate care: our holistic approach to climate protection people and nature in balance - innovative nivea products eucerin: dermocosmetics meets sustainability la prairie combines sustainability and luxury climate-neutral production in leipzig and berlin key for navigation jump to the table maximizing our social impact 20 our targets our engagement for sustainable palm oil cultivation in indonesia the power of human touch - nivea\'s new social mission eucerin\'s social mission for greater social participation we stand strong for women and girls worldwide diversity and inclusion as key to success outlook 30 continuing our sustainability journey in 2022 annex 33 key figures at a glance 34 about this report 35 contact persons 36 imprint page reference web link highlights 2021 of foreword dear readers, at beiersd

In [84]:
# Expand abbreviations
# Basic idea from: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Compile the regular expressions only once for efficiency
specific_patterns = [
    (re.compile(r"won['’]t"), "will not"),
    (re.compile(r"can['’]t"), "can not"),
]

def decontracted(phrase):
    count = 0

    # Replace specific patterns
    for pattern, replacement in specific_patterns:
        matches = len(pattern.findall(phrase))
        count += matches
        phrase = pattern.sub(replacement, phrase)

    return phrase, count

# Apply the function to expand abbreviations
cleaned_data['cleaned_content'], abbreviation_counts = zip(*cleaned_data['cleaned_content'].apply(decontracted))
print("Expanded custom abbreviations:", sum(abbreviation_counts))

Expanded custom abbreviations: 0


In [85]:
# Remove special characters excl. punctuation since this is needed by the sentence tokenization
def remove_non_alphanumeric(text):
    special_chars = re.findall(r'[^a-zA-Z0-9\s.,!?\'"]', text)
    return re.sub(r'[^a-zA-Z0-9\s.,!?\'"]', ' ', text), len(special_chars)

cleaned_data['cleaned_content'], special_char_count = zip(*cleaned_data['cleaned_content'].apply(remove_non_alphanumeric))
print("Special characters removed:", sum(special_char_count))

Special characters removed: 1407434


In [86]:
def tokenize_words(text):
    # Remove numbers, digits, and punctuation
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize words
    tokens = word_tokenize(text)

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens, len(tokens)

cleaned_data['word_tokens'], word_token_count = zip(*cleaned_data['cleaned_content'].apply(tokenize_words))
print("Generated word token amount:", sum(word_token_count))

Generated word token amount: 17798685


In [87]:
def tokenize_sentences(text):
    # Tokenize sentences
    tokens = sent_tokenize(text)
    
    return tokens, len(tokens)

cleaned_data['sentence_tokens'], sentence_token_count = zip(*cleaned_data['cleaned_content'].apply(tokenize_sentences))
print("Generated sentence token amount:", sum(sentence_token_count))

Generated sentence token amount: 727222


In [88]:
def remove_stopwords_from_word_tokens(tokens, custom_stopwords):

    # Remove stopwords from tokens
    filtered_tokens = [token for token in tokens if token.lower() not in custom_stopwords]
    
    return filtered_tokens, len(tokens) - len(filtered_tokens)

def remove_stopwords_from_sentence_tokens(sentences_list, custom_stopwords):
    filtered_sentences_list = []
    total_removed_items_count = 0

    for sentence in sentences_list:
        # Tokenize the sentence into words
        word_tokens = word_tokenize(sentence)

        # Remove stopwords, digits, numbers, dates, and punctuation from word tokens
        filtered_word_tokens = [
            token for token in word_tokens
            if token.lower() not in custom_stopwords
            and not re.search(r'\d', token)  # Remove tokens containing digits
            and token not in string.punctuation  # Remove punctuation
        ]

        # Reconstruct the sentence without the removed words
        filtered_sentence = ' '.join(filtered_word_tokens)
        removed_items_count = len(word_tokens) - len(filtered_word_tokens)

        filtered_sentences_list.append(filtered_sentence)
        total_removed_items_count += removed_items_count

    return filtered_sentences_list, total_removed_items_count

# Define custom stopwords to add or remove
custom_stopwords = {
    'add': ['additional_stopword1', 'additional_stopword2'],
    'remove': ['stopword_to_remove1', 'stopword_to_remove2']
}

# Combine stopwords to filter the content of the reports
all_stopwords = set(stopwords.words('english'))
all_stopwords |= set(custom_stopwords['add'])
all_stopwords -= set(custom_stopwords['remove'])

cleaned_data['word_tokens'], stopword_count_words = zip(*cleaned_data['word_tokens'].apply(remove_stopwords_from_word_tokens, custom_stopwords=all_stopwords))
cleaned_data['sentence_tokens'], stopword_count_sentences = zip(*cleaned_data['sentence_tokens'].apply(remove_stopwords_from_sentence_tokens, custom_stopwords=all_stopwords))

print("Removed stopwords in word tokens", sum(stopword_count_words))
print("Removed stopwords in sentence tokens", sum(stopword_count_sentences))

Removed stopwords in word tokens 6459538
Removed stopwords in sentence tokens 9233671


In [93]:
def pos_tagging_tokens(word_tokens, sentence_list):
    # POS tagging for word tokens
    pos_tagged_word_tokens = nltk.pos_tag(word_tokens)

    # Create a dictionary to map word tokens to their POS tags
    pos_tags_dict = dict(pos_tagged_word_tokens)

    # POS tagging for sentence tokens
    pos_tagged_sentence_list = []
    for sentence in sentence_list:
        tokenized_sentence = nltk.word_tokenize(sentence)
        pos_tagged_sentence = [(token, pos_tags_dict[token]) for token in tokenized_sentence if token in pos_tags_dict]
        pos_tagged_sentence_list.append(pos_tagged_sentence)

    return pos_tagged_word_tokens, pos_tagged_sentence_list

# Apply POS tagging
pos_tags = cleaned_data.apply(lambda row: pos_tagging_tokens(row['word_tokens'], row['sentence_tokens']), axis=1)
cleaned_data['pos_tagged_word_tokens'], cleaned_data['pos_tagged_sentence_tokens'] = zip(*pos_tags)


In [97]:
print(cleaned_data['pos_tagged_sentence_tokens'].head())

0    [[(strategy, NN), (sustainability, NN), (agend...
1    [[(degmanagement, NN)], [(deutsche, NN), (tele...
2    [[(sustainable, JJ), (future, NN)], [(vonovia,...
3    [[(management, NN), (company, NN), (profile, N...
4    [[(sustainability, NN), (mtu, NN), (far, RB), ...
Name: pos_tagged_sentence_tokens, dtype: object


In [91]:
print(cleaned_data['pos_tagged_word_tokens'].head())

0    [(brand, NN), (strategy, NN), (sustainability,...
1    [(degmanagement, NN), (fact, NN), (deutsche, N...
2    [(sustainable, JJ), (future, NN), (vonovia, NN...
3    [(management, NN), (employee, NN), (company, N...
4    [(sustainability, NN), (mtu, NN), (go, VBP), (...
Name: pos_tagged_word_tokens, dtype: object


In [98]:
cleaned_data.head()

Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title,url,cleaned_content,word_tokens,sentence_tokens,pos_tagged_sentence_tokens,pos_tagged_word_tokens
0,Beiersdorf AG,Sustainability Highlight Report CARE BEYOND SK...,sustainability_report,2021-03-31,,"['CleanWater', 'GHGEmission', 'ProductLiabilit...",1,BEI,BeiersdorfAG Sustainability Report 2021,,", our brands, our strategy 07 our sustainabili...","[brand, strategy, sustainability, agenda, care...",[brands strategy sustainability agenda care be...,"[[(strategy, NN), (sustainability, NN), (agend...","[(brand, NN), (strategy, NN), (sustainability,..."
1,Deutsche Telekom AG,Corporate Responsibility Report 2021 2 Content...,sustainability_report,2021-03-31,,"['DataSecurity', 'Iso50001', 'GlobalWarming', ...",1,DTE,DeutscheTelekomAG Sustainability Report 2021,,!degmanagement facts! from deutsche tele...,"[degmanagement, fact, deutsche, telekom, cr, r...","[degmanagement facts, deutsche telekom, cr rep...","[[(degmanagement, NN)], [(deutsche, NN), (tele...","[(degmanagement, NN), (fact, NN), (deutsche, N..."
2,Vonovia SE,VONOVIA SE SUSTAINABILITY REPORT 2021 =For a S...,sustainability_report,2021-03-31,,"['Whistleblowing', 'DataSecurity', 'Vaccine', ...",1,VNA,VonoviaSE Sustainability Report 2021,,for a sustainable future. vonovia se sustaina...,"[sustainable, future, vonovia, se, sustainabil...","[sustainable future, vonovia se sustainability...","[[(sustainable, JJ), (future, NN)], [(vonovia,...","[(sustainable, JJ), (future, NN), (vonovia, NN..."
3,Merck KGaA,Sustainability Report 2021 TABLE OF CONTENTS S...,sustainability_report,2021-03-31,,"['DataSecurity', 'DataMisuse', 'DrugResistance...",1,MRK,MerckKGaA Sustainability Report 2021,,management employees 4 company profile 115 a...,"[management, employee, company, profile, attra...",[management employees company profile attracti...,"[[(management, NN), (company, NN), (profile, N...","[(management, NN), (employee, NN), (company, N..."
4,MTU,Our ideas and concepts FOR A SUSTAINABLE FUTUR...,sustainability_report,2020-03-31,,"['WorkLifeBalance', 'Corruption', 'AirQuality'...",1,MTX,MTUAeroEngines Sustainability Report 2020,,", but sustainability at mtu goes far beyond cl...","[sustainability, mtu, go, far, beyond, climate...",[sustainability mtu goes far beyond climate ac...,"[[(sustainability, NN), (mtu, NN), (far, RB), ...","[(sustainability, NN), (mtu, NN), (go, VBP), (..."


In [110]:
def extract_human_names(chunked_tokens):
    names = []
    for subtree in chunked_tokens.subtrees():
        if subtree.label() == 'PERSON':
            name = ' '.join([leaf[0] for leaf in subtree.leaves()])
            names.append(name)
    return names

def remove_human_names(row):
    pos_tagged_word_tokens = row['pos_tagged_word_tokens']
    pos_tagged_sentence_tokens = row['pos_tagged_sentence_tokens']
    
    chunked_tokens = nltk.ne_chunk(pos_tagged_word_tokens)
    human_names = extract_human_names(chunked_tokens)
    human_names = set(human_names)

    filtered_word_tokens = [token for token, _ in pos_tagged_word_tokens if token not in human_names]
    filtered_pos_tagged_word_tokens = [(token, pos) for token, pos in pos_tagged_word_tokens if token not in human_names]

    filtered_sentence_tokens = []
    filtered_pos_tagged_sentence_tokens = []
    
    for pos_tagged_sentence in pos_tagged_sentence_tokens:
        filtered_sentence = [token for token, _ in pos_tagged_sentence if token not in human_names]
        filtered_pos_sentence = [(token, pos) for token, pos in pos_tagged_sentence if token not in human_names]
        
        filtered_sentence_tokens.append(filtered_sentence)
        filtered_pos_tagged_sentence_tokens.append(filtered_pos_sentence)

    return pd.Series({
        'word_tokens': filtered_word_tokens,
        'sentence_tokens': filtered_sentence_tokens,
        'pos_tagged_word_tokens': filtered_pos_tagged_word_tokens,
        'pos_tagged_sentence_tokens': filtered_pos_tagged_sentence_tokens
    })

# Apply the optimized function to all four columns
cleaned_data.loc[:, ['word_tokens','sentence_tokens','pos_tagged_word_tokens','pos_tagged_sentence_tokens']] = cleaned_data.apply(remove_human_names, axis=1)

In [111]:
print(cleaned_data['word_tokens'][0])

['brand', 'strategy', 'sustainability', 'agenda', 'care', 'beyond', 'skin', 'partnership', 'promise', 'toward', 'consumer', 'minimizing', 'environmental', 'footprint', 'target', 'climate', 'care', 'holistic', 'approach', 'climate', 'protection', 'people', 'nature', 'balance', 'innovative', 'nivea', 'product', 'eucerin', 'dermocosmetics', 'meet', 'sustainability', 'la', 'prairie', 'combine', 'sustainability', 'luxury', 'climate', 'neutral', 'production', 'leipzig', 'berlin', 'key', 'navigation', 'jump', 'table', 'maximizing', 'social', 'impact', 'target', 'engagement', 'sustainable', 'palm', 'oil', 'cultivation', 'indonesia', 'power', 'human', 'touch', 'niveas', 'new', 'social', 'mission', 'eucerins', 'social', 'mission', 'greater', 'social', 'participation', 'stand', 'strong', 'woman', 'girl', 'worldwide', 'diversity', 'inclusion', 'key', 'success', 'outlook', 'continuing', 'sustainability', 'journey', 'annex', 'key', 'figure', 'glance', 'report', 'contact', 'person', 'imprint', 'page'

In [112]:
cleaned_data.head()

Unnamed: 0,word_tokens,sentence_tokens,pos_tagged_word_tokens,pos_tagged_sentence_tokens
0,"[brand, strategy, sustainability, agenda, care...","[[strategy, sustainability, agenda, care, beyo...","[(brand, NN), (strategy, NN), (sustainability,...","[[(strategy, NN), (sustainability, NN), (agend..."
1,"[degmanagement, fact, deutsche, telekom, cr, r...","[[degmanagement], [deutsche, telekom], [cr, re...","[(degmanagement, NN), (fact, NN), (deutsche, N...","[[(degmanagement, NN)], [(deutsche, NN), (tele..."
2,"[sustainable, future, vonovia, se, sustainabil...","[[sustainable, future], [vonovia, se, sustaina...","[(sustainable, JJ), (future, NN), (vonovia, NN...","[[(sustainable, JJ), (future, NN)], [(vonovia,..."
3,"[management, employee, company, profile, attra...","[[management, company, profile, attractive, em...","[(management, NN), (employee, NN), (company, N...","[[(management, NN), (company, NN), (profile, N..."
4,"[sustainability, mtu, go, far, beyond, climate...","[[sustainability, mtu, far, beyond, climate, a...","[(sustainability, NN), (mtu, NN), (go, VBP), (...","[[(sustainability, NN), (mtu, NN), (far, RB), ..."


## Data Enrichment

In [1]:
url = 'https://disfold.com/stock-index/dax/companies/'
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table')
data = []
for row in table.find_all('tr'):
    cols = row.find_all('td')
    cols = [col.text.strip() for col in cols]
    data.append(cols)

def clean_scraped_data(data):
    cleaned_data = []
    
    for row in data:
        # Remove empty rows
        if len(row) > 0:
            # Remove the '$' and ',' signs from the market cap and convert it to float
            market_cap = float(row[3].replace('$', '').replace(',', '').replace('B', ''))
            cleaned_data.append([row[1], row[2], market_cap, row[4], row[5], row[6]])
    
    df = pd.DataFrame(cleaned_data, columns=['Company Name', 'Ticker Symbol', 'Market Cap [in B$]', 'Country', 'Industry Sector', 'Industry'])
    
    return df

df = clean_scraped_data(data)
df.to_csv('../data/dax_company_sectors.csv', index=False)
df.head(10)

NameError: name 'requests' is not defined