In [6]:
from nltk import tokenize
import pandas as pd

In [7]:
# Read the whole article into memory as one string, decoding the file as UTF-8.
with open('Data/BHS_governance_scandal.txt', encoding='utf8') as source_file:
    data = source_file.read()

In [8]:
# tested in transformers==4.18.0 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Topic classifier: a 4-label sequence-classification model and its tokenizer,
# loaded from the 'Models/' paths and wrapped in a text-classification pipeline.
finbert = AutoModelForSequenceClassification.from_pretrained(
    'Models/topic_model/',
    num_labels=4,
)
tokenizer = AutoTokenizer.from_pretrained('Models/topic_tokenizer/')
nlp = pipeline(
    task="text-classification",
    model=finbert,
    tokenizer=tokenizer,
)



In [9]:
# Sentiment classifier: a 3-label sequence-classification model and its tokenizer,
# loaded from the 'Models/' paths and wrapped in a text-classification pipeline.
finbert_sentiment = AutoModelForSequenceClassification.from_pretrained(
    'Models/sentiment_model/',
    num_labels=3,
)
tokenizer_sentiment = AutoTokenizer.from_pretrained('Models/sentiment_tokenizer/')
nlp_sentiment = pipeline(
    task="text-classification",
    model=finbert_sentiment,
    tokenizer=tokenizer_sentiment,
)



In [10]:
# FLS classifier: a 3-label sequence-classification model and its tokenizer,
# loaded from the 'Models/' paths and wrapped in a text-classification pipeline.
finbert_fls = AutoModelForSequenceClassification.from_pretrained(
    'Models/fls_model/',
    num_labels=3,
)
tokenizer_fls = AutoTokenizer.from_pretrained('Models/fls_tokenizer/')
nlp_fls = pipeline(
    task="text-classification",
    model=finbert_fls,
    tokenizer=tokenizer_fls,
)



In [11]:
# Sub-category classifier: a 9-label sequence-classification model and its tokenizer,
# loaded from the 'Models/' paths and wrapped in a text-classification pipeline.
finbert_sub_cat = AutoModelForSequenceClassification.from_pretrained(
    'Models/subESG_model/',
    num_labels=9,
)
tokenizer_sub_cat = AutoTokenizer.from_pretrained('Models/subESG_tokenizer/')
nlp_sub_cat = pipeline(
    task="text-classification",
    model=finbert_sub_cat,
    tokenizer=tokenizer_sub_cat,
)



In [12]:
# Split the raw article text into sentences with NLTK's sentence tokenizer.
list_all_sentences = tokenize.sent_tokenize(data)
# Bare last expression so the notebook displays how many sentences were found.
len(list_all_sentences)

58

In [13]:
# Keep only sentences whose length is under 500 (characters, for str sentences).
# NOTE(review): presumably a guard against over-long model inputs; confirm the intent.
list_all_sentences_short = [
    candidate
    for candidate in list_all_sentences
    if len(candidate) < 500
]
# Bare last expression so the notebook displays how many sentences survived.
len(list_all_sentences_short)

58

In [14]:
def build_dataset(lst):
    """Classify every sentence with the four pipelines and return one row per sentence.

    Each sentence is passed once to each of the module-level pipelines ``nlp``,
    ``nlp_sentiment``, ``nlp_fls`` and ``nlp_sub_cat``; the first prediction of
    each is stored as ``<prefix>_label`` / ``<prefix>_score``.

    Errors are handled per sentence: if anything fails while processing a
    sentence, that sentence gets a placeholder row (labels ``'error'``,
    scores ``0``) and processing continues with the next sentence, so the
    result always has exactly one row per input sentence.

    Args:
        lst: Iterable of sentences to classify.

    Returns:
        list[dict]: One dict per sentence with the keys ``sentence``,
        ``topic_label``, ``topic_score``, ``sentiment_label``,
        ``sentiment_score``, ``fls_label``, ``fls_score``, ``sub_cat_label``
        and ``sub_cat_score``, in that order.
    """
    # (column prefix, pipeline) pairs, listed in output column order.
    classifiers = (
        ('topic', nlp),
        ('sentiment', nlp_sentiment),
        ('fls', nlp_fls),
        ('sub_cat', nlp_sub_cat),
    )

    dataset_lst = []
    for sentence in lst:
        row = {'sentence': sentence}
        try:
            for prefix, classifier in classifiers:
                # Run each pipeline once and reuse its top prediction for both
                # fields instead of running inference twice.
                top = classifier(sentence)[0]
                row[f'{prefix}_label'] = top['label']
                row[f'{prefix}_score'] = top['score']

            topic_label, topic_score = row['topic_label'], row['topic_score']
            sentiment_label, sentiment_score = row['sentiment_label'], row['sentiment_score']
            sub_cat_label = row['sub_cat_label']
            print(f'{sentence[0:25]} belongs to {topic_label}({topic_score}%) with a sentiment {sentiment_label}({sentiment_score}% =>{sub_cat_label})')
        except Exception as exc:
            # Report the failure, then discard any partial results so the row
            # is a clean placeholder; the remaining sentences are still processed.
            print(f'{str(sentence)[0:25]} could not be classified: {exc!r}')
            row = {'sentence': sentence}
            for prefix, _ in classifiers:
                row[f'{prefix}_label'] = 'error'
                row[f'{prefix}_score'] = 0
        dataset_lst.append(row)
    return dataset_lst

In [15]:
# Classify the length-filtered sentences; build_dataset prints a line per sentence as it goes.
dataset_lst = build_dataset(list_all_sentences_short)

Fallout from the scandal  belongs to Governance(0.4531234800815582%) with a sentiment Negative(0.9998595714569092% =>Business Ethics & Values)
The damning parliamentary belongs to Social(0.9786429405212402%) with a sentiment Negative(0.9997097849845886% =>Community Relations)
The BHS scandal has been  belongs to Environmental(0.9315072298049927%) with a sentiment Negative(0.9592201113700867% =>Pollution & Waste)
Dominic Chappell, the bus belongs to Social(0.9576088190078735%) with a sentiment Neutral(0.9999719858169556% =>Product Liability)
The fallout from the scan belongs to Governance(0.40840819478034973%) with a sentiment Negative(0.9984422326087952% =>Business Ethics & Values)
Around 11,000 jobs are at belongs to Social(0.9828281402587891%) with a sentiment Negative(0.7277733087539673% =>Human Capital)
Green controlled BHS for  belongs to Social(0.9706413745880127%) with a sentiment Neutral(0.9993252754211426% =>Community Relations)
Green and other sharehold belongs to Governance(

In [16]:
# Turn the list of per-sentence dicts into a DataFrame (dict keys become the columns).
df_csv = pd.DataFrame(dataset_lst)

In [17]:
# Bare expression so the notebook renders the labelled sentences.
df_csv

Unnamed: 0,sentence,topic_label,topic_score,sentiment_label,sentiment_score,fls_label,fls_score,sub_cat_label,sub_cat_score
0,Fallout from the scandal could lead to a lost ...,Governance,0.453123,Negative,0.99986,Non-specific FLS,0.937266,Business Ethics & Values,0.983647
1,The damning parliamentary report into the demi...,Social,0.978643,Negative,0.99971,Non-specific FLS,0.708357,Community Relations,0.77319
2,The BHS scandal has been described by MPs as t...,Environmental,0.931507,Negative,0.95922,Not FLS,0.990499,Pollution & Waste,0.551511
3,"Dominic Chappell, the businessman who bought B...",Social,0.957609,Neutral,0.999972,Not FLS,0.993661,Product Liability,0.327208
4,The fallout from the scandal could lead to a k...,Governance,0.408408,Negative,0.998442,Non-specific FLS,0.93579,Business Ethics & Values,0.983201
5,"Around 11,000 jobs are at risk after BHS colla...",Social,0.982828,Negative,0.727773,Not FLS,0.644578,Human Capital,0.94825
6,Green controlled BHS for 15 years until March ...,Social,0.970641,Neutral,0.999325,Not FLS,0.986811,Community Relations,0.530825
7,Green and other shareholders extracted £586m i...,Governance,0.641291,Neutral,0.999768,Not FLS,0.989532,Corporate Governance,0.984617
8,Pensions\nBHS has been left with a pension def...,,0.912101,Negative,0.881347,Not FLS,0.991116,Community Relations,0.335942
9,Green told MPs that problems facing the pensio...,Social,0.968416,Negative,0.97743,Not FLS,0.988535,Community Relations,0.538749


In [18]:
# Persist the labelled sentences. With to_csv's defaults the row index is also
# written, as a first column with an empty header.
df_csv.to_csv('Data/labeled_sentences_negative_G_58_extra_classes.csv')