In [1]:
from nltk import tokenize
import pandas as pd
import re
import textract

In [11]:

# Extract the report's text; the result is handled as bytes below (bytes regex + .decode()).
report_bytes = textract.process('Data/apmm-sustainability-report-a4_2017.pdf')

# Split wherever four or more whitespace characters occur in a row.
# The pattern is a *raw* bytes literal so that `\s` reaches the regex engine
# untouched instead of being parsed as an (invalid) string escape sequence.
report_chunks = re.split(rb'\s{4,}', report_bytes)

# Keep chunks whose length is strictly between 70 and 500 bytes (measured
# before decoding), decode them with the default codec and join them with
# single spaces into one string for sentence tokenization.
data = ' '.join(
    chunk.decode()
    for chunk in report_chunks
    if 70 < len(chunk) < 500
)

In [12]:
# tested in transformers==4.18.0
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Topic classifier: tokenizer and 4-label model are read from the 'Models/' paths,
# then wrapped in a text-classification pipeline exposed as `nlp`.
tokenizer = AutoTokenizer.from_pretrained('Models/topic_tokenizer/')
finbert = AutoModelForSequenceClassification.from_pretrained(
    'Models/topic_model/',
    num_labels=4,
)
nlp = pipeline(
    "text-classification",
    model=finbert,
    tokenizer=tokenizer,
)



In [13]:
# Sentiment classifier: tokenizer and 3-label model are read from the 'Models/' paths,
# then wrapped in a text-classification pipeline exposed as `nlp_sentiment`.
tokenizer_sentiment = AutoTokenizer.from_pretrained('Models/sentiment_tokenizer/')
finbert_sentiment = AutoModelForSequenceClassification.from_pretrained(
    'Models/sentiment_model/',
    num_labels=3,
)
nlp_sentiment = pipeline(
    "text-classification",
    model=finbert_sentiment,
    tokenizer=tokenizer_sentiment,
)



In [14]:
# FLS classifier: tokenizer and 3-label model are read from the 'Models/' paths,
# then wrapped in a text-classification pipeline exposed as `nlp_fls`.
tokenizer_fls = AutoTokenizer.from_pretrained('Models/fls_tokenizer/')
finbert_fls = AutoModelForSequenceClassification.from_pretrained(
    'Models/fls_model/',
    num_labels=3,
)
nlp_fls = pipeline(
    "text-classification",
    model=finbert_fls,
    tokenizer=tokenizer_fls,
)



In [15]:
# Sub-category classifier: tokenizer and 9-label model are read from the 'Models/' paths,
# then wrapped in a text-classification pipeline exposed as `nlp_sub_cat`.
tokenizer_sub_cat = AutoTokenizer.from_pretrained('Models/subESG_tokenizer/')
finbert_sub_cat = AutoModelForSequenceClassification.from_pretrained(
    'Models/subESG_model/',
    num_labels=9,
)
nlp_sub_cat = pipeline(
    "text-classification",
    model=finbert_sub_cat,
    tokenizer=tokenizer_sub_cat,
)



In [16]:
# Split the joined report text into sentences with NLTK's sentence tokenizer.
list_all_sentences = tokenize.sent_tokenize(data)
# Last expression of the cell: shows how many sentences were found.
len(list_all_sentences)

376

In [17]:
# Drop sentences of 500 characters or more; only the shorter ones are classified.
list_all_sentences_short = [sentence for sentence in list_all_sentences if len(sentence)<500]
# Last expression of the cell: shows how many sentences remain after filtering.
len(list_all_sentences_short)

365

In [18]:
def _top_prediction(classifier, sentence):
    """Run ``classifier`` once on ``sentence`` and return ``(label, score)`` of its first prediction."""
    prediction = classifier(sentence)[0]
    return prediction['label'], prediction['score']


def build_dataset(lst):
    """Classify every sentence with the four pipelines and collect the results.

    Each sentence is passed exactly once to each of the module-level pipelines
    ``nlp`` (topic), ``nlp_sentiment``, ``nlp_fls`` and ``nlp_sub_cat``; the
    ``label`` and ``score`` of the first prediction are stored per classifier.

    Failures are handled per sentence: if any classifier raises for a sentence,
    the problem is printed, that sentence gets a placeholder row (labels
    ``'error'``, scores ``0``) and processing continues with the next sentence,
    so the result always has one row per input sentence.

    Args:
        lst: Iterable of sentences to classify.

    Returns:
        list[dict]: One dict per sentence, in input order, with the keys
        ``sentence``, ``topic_label``, ``topic_score``, ``sentiment_label``,
        ``sentiment_score``, ``fls_label``, ``fls_score``, ``sub_cat_label``
        and ``sub_cat_score``.
    """
    dataset_lst = []
    for sentence in lst:
        try:
            # One inference per model; label and score come from the same call.
            topic_label, topic_score = _top_prediction(nlp, sentence)
            sentiment_label, sentiment_score = _top_prediction(nlp_sentiment, sentence)
            fls_label, fls_score = _top_prediction(nlp_fls, sentence)
            sub_cat_label, sub_cat_score = _top_prediction(nlp_sub_cat, sentence)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts the run.
            # `!r:.25` truncates the repr, which is safe even for non-string input.
            print(f'Could not classify {sentence!r:.25}: {exc!r}')
            dataset_lst.append({'sentence': sentence,
                                'topic_label': 'error',
                                'topic_score': 0,
                                'sentiment_label': 'error',
                                'sentiment_score': 0,
                                'fls_label': 'error',
                                'fls_score': 0,
                                'sub_cat_label': 'error',
                                'sub_cat_score': 0})
            continue

        # Progress line; scores are printed as returned by the pipelines.
        print(f'{sentence[0:25]} belongs to {topic_label}({topic_score}%) with a sentiment {sentiment_label}({sentiment_score}% =>{sub_cat_label})')
        dataset_lst.append({'sentence': sentence,
                            'topic_label': topic_label,
                            'topic_score': topic_score,
                            'sentiment_label': sentiment_label,
                            'sentiment_score': sentiment_score,
                            'fls_label': fls_label,
                            'fls_score': fls_score,
                            'sub_cat_label': sub_cat_label,
                            'sub_cat_score': sub_cat_score})
    return dataset_lst

In [19]:
dataset_lst = build_dataset(list_all_sentences_short)

Companies can no longer s belongs to Social(0.9573908448219299%) with a sentiment Neutral(0.99234938621521% =>Non-ESG)
Our business
A.P. belongs to None(0.9767014980316162%) with a sentiment Neutral(0.9984109401702881% =>Non-ESG)
Moller - Maersk employs o belongs to Social(0.9824466705322266%) with a sentiment Neutral(0.999998927116394% =>Human Capital)
Our vision is to be the g belongs to Social(0.9548644423484802%) with a sentiment Positive(0.9310210943222046% =>Product Liability)
Examples of where some of belongs to Environmental(0.7603691816329956%) with a sentiment Neutral(0.9751501083374023% =>Pollution & Waste)
A world-leading provider  belongs to Social(0.843917965888977%) with a sentiment Neutral(0.5442354679107666% =>Community Relations)
Moller - Maersk has been  belongs to Social(0.9871279001235962%) with a sentiment Positive(0.9999998807907104% =>Community Relations)
We plan to still be doing belongs to Social(0.7782077789306641%) with a sentiment Neutral(0.9991421699523926

In [25]:
# Turn the list of per-sentence result dicts into a table (one row per sentence).
df_csv = pd.DataFrame(dataset_lst)

In [26]:
# Show the labelled-sentence table as the cell output.
df_csv

Unnamed: 0,sentence,topic_label,topic_score,sentiment_label,sentiment_score,fls_label,fls_score,sub_cat_label,sub_cat_score
0,Companies can no longer stay on the\r\nsidelin...,Social,0.957391,Neutral,0.992349,Non-specific FLS,0.510004,Non-ESG,0.401613
1,Our business\r\nA.P.,,0.976701,Neutral,0.998411,Not FLS,0.990726,Non-ESG,0.784374
2,"Moller - Maersk employs over 85,000\r\npeople ...",Social,0.982447,Neutral,0.999999,Not FLS,0.982055,Human Capital,0.828750
3,Our vision is to be the global integrator of\r...,Social,0.954864,Positive,0.931021,Not FLS,0.946253,Product Liability,0.499080
4,Examples of where some of our material\r\nissu...,Environmental,0.760369,Neutral,0.975150,Not FLS,0.978589,Pollution & Waste,0.637766
...,...,...,...,...,...,...,...,...,...
360,Please send your feedback to:\r\nA.P.,,0.926178,Neutral,0.999666,Not FLS,0.878980,Non-ESG,0.544031
361,Møller - Mærsk A/S\r\nEsplanaden 50\r\nDK–1098...,Social,0.814684,Neutral,0.999970,Not FLS,0.970404,Climate Change,0.347196
362,+45 33 63 33 63\r\nCompany reg.,,0.846329,Neutral,0.999593,Not FLS,0.981492,Non-ESG,0.797524
363,no.,,0.946030,Neutral,0.977230,Not FLS,0.919827,Non-ESG,0.932540


In [27]:
# Save the labelled sentences as CSV. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first, unnamed column.
df_csv.to_csv('Data/labeled_sentences_positive_G_365_extra_classes.csv')