In [1]:
import os
import csv
import re
import pandas as pd

In [2]:
def cleanText(text):
    '''
    Normalise a raw string so dictionary patterns can be matched against it.

    The following steps are applied in order:
    1) The string is lowercased.
    2) Every character that is not an ASCII letter, an em dash (—), a hyphen (-),
       a straight (') or curly (’) apostrophe, or a space is replaced by a space.
    3) Every run of whitespace is collapsed into a single space.

    The result is not stripped, so it may still start or end with one space.

    Parameters:
    text - String

    Output:
    Cleaned String
    '''

    # (pattern, replacement) pairs, applied one after the other:
    # first blank out disallowed characters, then squeeze repeated whitespace
    substitutions = (
        (r"[^A-Za-z—\-\'\’\ ]", ' '),
        (r"\s+", ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [3]:
#read the information in the excels into a dataframe
def _readDictionary(path):
    '''
    Read one dictionary workbook into a single-column dataframe.

    Parameters:
    path - String, location of the .xlsx file; the words are read from sheet 'Ark1',
           which is treated as having no header row

    Output:
    DataFrame with one column named 'words'
    '''
    return pd.read_excel(path, header=None, sheet_name='Ark1', names=['words'])


def _buildPattern(words, source):
    '''
    Turn a column of dictionary entries into one alternation pattern in which
    every entry is wrapped in word boundaries (\\b).

    Parameters:
    words - pandas Series holding the dictionary entries
    source - String used in the error message to identify the dictionary

    Output:
    Pattern String of the form "\\bentry1\\b|\\bentry2\\b|..."

    Raises:
    ValueError if no usable entry is left after cleaning
    '''
    # blank cells are read as NaN (a float) and numeric cells as numbers;
    # str.join would raise TypeError on either, so drop the former and cast the latter
    entries = [str(word).strip() for word in words.dropna().to_list()]
    # an empty entry would contribute "\b\b", which matches at every word boundary
    entries = [entry for entry in entries if entry]
    if not entries:
        # with no entries the pattern would be "\b\b" and inflate every count
        raise ValueError("No dictionary words found in %s" % source)
    # NOTE(review): entries are still inserted as regex syntax (not re.escape'd),
    # exactly as before. If the dictionaries only hold literal words, escaping
    # them would be safer - confirm against the workbooks.
    return "\\b" + "\\b|\\b".join(entries) + "\\b"


dfEco = _readDictionary("SustainabilityDictionaries/Economicdictionary.xlsx")
dfEnv = _readDictionary("SustainabilityDictionaries/Environmentaldictionary.xlsx")
dfSoc = _readDictionary("SustainabilityDictionaries/Socialdictionary.xlsx")
#extract the words in a list and then transform them into a pattern
ecoPattern = _buildPattern(dfEco['words'], "Economicdictionary.xlsx")
envPattern = _buildPattern(dfEnv['words'], "Environmentaldictionary.xlsx")
socPattern = _buildPattern(dfSoc['words'], "Socialdictionary.xlsx")
#prepare pattern
# the patterns are applied to lowercased text (see cleanText), so match
# case-insensitively; otherwise an entry such as "CSR" could never be found
ecoWORD = re.compile(ecoPattern, re.IGNORECASE)
envWORD = re.compile(envPattern, re.IGNORECASE)
socWORD = re.compile(socPattern, re.IGNORECASE)
# a token is a maximal run of letters, dashes and apostrophes
WORD = re.compile(r'[A-Za-z—\-\'\’]+')

In [4]:
%%time
#replace csv file if it already exists, otherwise create
OUTPUT_PATH = "Counts/CorporateSustainability.csv"
# scores are reported as dictionary hits per 500 word tokens
TOKENS_PER_UNIT = 500

# open() does not create missing folders, so make sure the target folder exists
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

with open(OUTPUT_PATH, "w", newline="", encoding='utf-8') as csv_file:
    # csv.writer quotes file names containing commas or quotes, which the
    # hand-formatted rows did not; "\n" keeps the line ending used so far
    writer = csv.writer(csv_file, lineterminator="\n")
    #headers
    writer.writerow(['file', 'Economic sustainability',
                     'Environmental sustainability', 'Social sustainability'])
    #go through files
    for root, dirs, files in os.walk("TextFiles"):
        # sort in place so folders and files are visited in a reproducible order
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.txt'):
                continue
            # os.walk also descends into subfolders, so the path must be built
            # from root; 'TextFiles/' + file only works for the top level
            with open(os.path.join(root, file), 'r', encoding='utf-8') as textFile:
                text = textFile.read()
            cleanedText = cleanText(text)
            #find the words and count them
            counts = [len(pattern.findall(cleanedText))
                      for pattern in (ecoWORD, envWORD, socWORD)]
            #normalization factor
            totalTokens = len(WORD.findall(text)) / TOKENS_PER_UNIT
            if totalTokens:
                scores = [round(count / totalTokens, 4) for count in counts]
            else:
                # no word tokens at all (e.g. an empty file): report zeros
                # instead of failing with ZeroDivisionError
                scores = [0.0 for _ in counts]
            writer.writerow([file] + ["%.4f" % score for score in scores])

Wall time: 24.9 s
