In [237]:
import pandas as pd
import nltk
import re
from pathlib import Path
from transformers import TransfoXLTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import seaborn as sns
nltk.download('stopwords')
nltk.download('punkt')

# Tokenizer selection used further down in this cell:
#   1 -> Transformer (TransfoXL) tokenizer
#   2 -> NLTK word_tokenize
#   3 -> whitespace split
tokenization_method = 2

# Strip links from the raw text before tokenizing?
preprocess_links = False

# Token filters applied by lowercase_delete_special_characters().
# A token is kept only if at least one enabled filter accepts it.
lower_alpha = True   # keep purely alphabetic tokens, lowercased
numerical = True     # keep purely numeric tokens
spec_char = False    # keep tokens containing a currency or percentage symbol

# Optional steps applied after the token filters
remove_stopwords = False
lemmatize_tokens = False
stemm_tokens = False

# Lemmatization and stemming are mutually exclusive
if stemm_tokens and lemmatize_tokens:
    raise Exception("Sorry, no lemmatization and stemming at the same time")

# With every token filter disabled the tokens are passed through untouched;
# the baseline flag lets the filter function take that shortcut.
baseline = not (lower_alpha or numerical or spec_char)

# Create a TransfoXLTokenizer object with the add_special_tokens parameter set to False.
# The tokenizer is loaded unconditionally, but it is only used further down
# in this cell when tokenization_method == 1.
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103", add_special_tokens=False)

# Pre-compiled patterns for the special-character token filter.
# A single currency character anywhere in the token:
currency_symbol_pattern = re.compile(r'[$€£₹]')
# A percent sign, optionally preceded by digits:
percentage_pattern = re.compile(r'\d*%|\b%')
# Unused alternative that would match whole currency amounts:
# currency_pattern = re.compile(r'[$€£₹]?\s?\d+(?:[.,]\d{3})*(?:[.,]\d{2})?(?=[^\d.,]|$)')

# Print-once flags: each one flips to True after the matching log line
# has been printed by lowercase_delete_special_characters().
printed_mod_alpha = printed_mod_num = printed_mod_spec = printed_mod_base = False

# Token filter driven by the configuration flags at the top of the notebook
def lowercase_delete_special_characters(tokens):
    """Filter a token list according to the module-level preprocessing flags.

    When ``baseline`` is true the input list is returned as-is (same object).
    Otherwise each token is tested in this order and kept by the first
    check that accepts it:

    1. ``lower_alpha``: purely alphabetic token, appended lowercased.
    2. ``numerical``: purely numeric token, appended unchanged.
    3. ``spec_char``: token containing a currency symbol or a percentage
       match, appended unchanged.

    Tokens accepted by no check are dropped. The first time a branch is
    taken, a log line is printed and the matching ``printed_mod_*`` flag
    is set so the line is not printed again.
    """
    global printed_mod_alpha, printed_mod_num, printed_mod_spec, printed_mod_base

    # Shortcut: nothing to filter, hand the tokens back untouched
    if baseline:
        if not printed_mod_base:
            printed_mod_base = True
            print("Baseline modifications")
        return tokens

    kept = []
    for tok in tokens:
        # alphabetic tokens are lowercased
        if tok.isalpha() and lower_alpha:
            kept.append(tok.lower())
            if not printed_mod_alpha:
                printed_mod_alpha = True
                print("Adding Alphabetic symbols and lowercase")
            continue

        # numeric tokens are kept verbatim
        if tok.isnumeric() and numerical:
            kept.append(tok)
            if not printed_mod_num:
                printed_mod_num = True
                print("Adding Numerical symbols")
            continue

        # remaining tokens survive only via the special-character filter
        if not spec_char:
            continue
        if re.search(currency_symbol_pattern, tok) or re.search(percentage_pattern, tok):
            kept.append(tok)
            if not printed_mod_spec:
                printed_mod_spec = True
                print("Adding Currency and Percentage symbols")

    return kept

def remove_links(string):
    """Return ``string`` with link-like substrings removed.

    Four substitutions are applied in sequence:

    1. everything from ``http://`` or ``https://`` up to the next whitespace;
    2. slash-containing paths that end in ``.<lowercase letters>``
       (e.g. "about-us/investors/pages/report.pdf");
    3. domain-like runs: optional scheme and ``www.``, then word
       characters/dots followed by a dot and lowercase letters;
    4. runs of spaces are collapsed into a single space.
    """
    without_urls = re.sub(r'https?:\/\/\S+', '', string)
    without_paths = re.sub(r'(\S+\/|\/)(\/\S+\/?)*\S+\.[a-z]+', '', without_urls)
    without_domains = re.sub(
        r'(?:(?:https?://)?(?:www\.)?)?([\w\.]+\.[a-z]+)', '', without_paths
    )
    # Removing links leaves gaps; squeeze them back to single spaces
    return re.sub(' +', ' ', without_domains)

# English stop-word list from NLTK's 'stopwords' corpus (downloaded at the top
# of this cell); filter_stop_words() below removes tokens found in this list.
stop_words = nltk.corpus.stopwords.words('english')

def filter_stop_words(tokens):
    """Return the tokens that are not in the module-level ``stop_words``.

    The comparison is exact and case-sensitive, so a capitalised token such
    as "The" is kept even when "the" is a stop word. Token order is kept.
    """
    # Membership in a list costs a full scan per token; a set makes each
    # lookup O(1). It is built per call so it always reflects stop_words.
    excluded = set(stop_words)
    return [token for token in tokens if token not in excluded]

# Create a lemmatizer object; shared by lemmatize_words() below
lemmatizer = WordNetLemmatizer()

# Translate a POS tag into the WordNet POS constant the lemmatizer expects
# (the lemmatizer needs to know whether a word is e.g. a verb or a noun)
def get_wordnet_pos(treebank_pos):
    """Map a POS tag to a WordNet POS constant based on its first letter.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if treebank_pos.startswith(prefix):
            return wn_pos
    # no prefix matched: default to noun
    return wordnet.NOUN

# POS-tag a token list and lemmatize every token with its mapped WordNet POS
def lemmatize_words(words):
    """Return the lemmatized form of each word in ``words``.

    The words are POS-tagged with ``nltk.pos_tag``; each tag is converted
    with ``get_wordnet_pos`` and passed to the shared ``lemmatizer``.
    """
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos))
        for token, pos in nltk.pos_tag(words)
    ]

# Stemming: shared Porter stemmer, applied to the token columns further down
# in this cell when stemm_tokens is True
stemmer = PorterStemmer()


# ---------------------------------------------------------------------------
# Load the labelled training and validation data and build one small frame
# per classification task (climate, sentiment, commitment, specificity).
# ---------------------------------------------------------------------------
from io import StringIO

# Join with pathlib instead of a hand-written '\' separator: '\p' is an
# invalid escape sequence and the backslash only works as a separator on
# Windows. str() keeps `path` a plain string as before.
path = str(Path.cwd() / 'project_training.json')
# print(path)
# Read the JSON file
with open(path, 'r') as f:
    data = f.read()

# Load the JSON data into a dataframe. The text is wrapped in StringIO
# because passing a literal JSON string to read_json is deprecated in
# recent pandas versions.
df_train = pd.read_json(StringIO(data))

# Create dataframes for the text and the climate label
df_text = pd.DataFrame(df_train, columns=['text'])
df_climate = pd.DataFrame(df_train, columns=['climate'])
# Keep rows where the climate label is 'yes'; the other three tasks are
# only trained and evaluated on these rows
df_filtered = df_train.loc[df_climate['climate'] == 'yes']

# Split the filtered dataframe into separate dataframes
df_text_climate_yes = pd.DataFrame(df_filtered, columns=['text'])
df_sentiment = pd.DataFrame(df_filtered, columns=['sentiment'])
df_commitment = pd.DataFrame(df_filtered, columns=['commitment'])
df_specificity = pd.DataFrame(df_filtered, columns=['specificity'])

# Turn all labels into numerical labels
# (done after the 'yes' filter above, which compares against the raw string)
df_climate = df_climate.replace({'yes': 1, 'no': 0})
# opportunity/neutral/risk
df_sentiment = df_sentiment.replace({'opportunity': 0, 'neutral': 1, 'risk': 2})
# yes/no
df_commitment = df_commitment.replace({'yes': 1, 'no': 0})
# specific language/non-specific language
df_specificity = df_specificity.replace({'spec': 1, 'non': 0})

# Same procedure for the validation file, used as test data below
path = str(Path.cwd() / 'project_validation.json')
with open(path, 'r') as f_test:
    data_test = f_test.read()

# Load the JSON data into a dataframe
df_test = pd.read_json(StringIO(data_test))
df_text_test = pd.DataFrame(df_test, columns=['text'])
df_climate_test = pd.DataFrame(df_test, columns=['climate'])
# Keep rows where the climate label is 'yes'
df_filtered_test = df_test.loc[df_climate_test['climate'] == 'yes']
df_text_test_climate_yes = pd.DataFrame(df_filtered_test, columns=['text'])
df_sentiment_test = pd.DataFrame(df_filtered_test, columns=['sentiment'])
df_commitment_test = pd.DataFrame(df_filtered_test, columns=['commitment'])
df_specificity_test = pd.DataFrame(df_filtered_test, columns=['specificity'])
# Numerical labels, same mappings as for the training data
df_climate_test = df_climate_test.replace({'yes': 1, 'no': 0})
# opportunity/neutral/risk
df_sentiment_test = df_sentiment_test.replace({'opportunity': 0, 'neutral': 1, 'risk': 2})
# yes/no
df_commitment_test = df_commitment_test.replace({'yes': 1, 'no': 0})
# specific language/non-specific language
df_specificity_test = df_specificity_test.replace({'spec': 1, 'non': 0})

# ---------------------------------------------------------------------------
# Preprocessing pipeline. All four text frames go through the same steps.
# The frame order is fixed because the token filter prints its log lines
# only the first time a branch is hit.
# ---------------------------------------------------------------------------
text_frames = (df_text, df_text_climate_yes, df_text_test, df_text_test_climate_yes)

# remove links
if preprocess_links:
    print("Removing Links...")
    for frame in text_frames:
        frame['text'] = frame['text'].apply(remove_links)

# tokenization methods
if tokenization_method == 1:
    print("Transformer Tokenization")
    for frame in text_frames:
        frame['tokens'] = frame['text'].apply(tokenizer.tokenize)
elif tokenization_method == 2:
    print("NLTK Tokenization")
    for frame in text_frames:
        frame['tokens'] = frame['text'].apply(nltk.word_tokenize)
elif tokenization_method == 3:
    print("WhiteSpace Tokenization")
    for frame in text_frames:
        frame['tokens'] = frame['text'].str.split()
else:
    # Without this branch no 'tokens' column is created and the next step
    # fails with an unrelated-looking KeyError.
    raise ValueError(
        f"Unknown tokenization_method {tokenization_method!r}; expected 1, 2 or 3"
    )

# preprocessing (token filters selected by the configuration flags)
for frame in text_frames:
    frame['tokens'] = frame['tokens'].apply(lowercase_delete_special_characters)

# stopwords
if remove_stopwords:
    print("Removing Stopwords...")
    for frame in text_frames:
        frame['tokens'] = frame['tokens'].apply(filter_stop_words)

# lemmatization
if lemmatize_tokens:
    print("Lemmatizing...")
    for frame in text_frames:
        frame['tokens'] = frame['tokens'].apply(lemmatize_words)

# stemming
if stemm_tokens:
    print("Stemming...")
    for frame in text_frames:
        frame['tokens'] = frame['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])

# last expression of the cell: display the processed training frame
df_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ThreadTheRipper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ThreadTheRipper\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Transformer Tokenization
Adding Alphabetic symbols and lowercase
Adding Numerical symbols


Unnamed: 0,text,tokens
0,The accelerator programs have sub-portfolios o...,"[the, accelerator, programs, have, sub, portfo..."
1,"Also by means of BNDES Finem, we offer credit ...","[also, by, means, of, bndes, finem, we, offer,..."
2,Climate change Climate change exposes UPM to v...,"[climate, change, climate, change, exposes, up..."
3,Several tools and methodologies aimed at asses...,"[several, tools, and, methodologies, aimed, at..."
4,We worked with the UK government to accelerate...,"[we, worked, with, the, uk, government, to, ac..."
...,...,...
395,"At the beginning of 2019, VINCI Airports signe...","[at, the, beginning, of, 2019, vinci, airports..."
396,We have also signed up to the Partnership for ...,"[we, have, also, signed, up, to, the, partners..."
397,Suzano also is involved and spearheads externa...,"[suzano, also, is, involved, and, spearheads, ..."
398,Risks to the Group’s reputation Risks include ...,"[risks, to, the, group, reputation, risks, inc..."


In [222]:
# Self-trained word embedding, only run if self-trained word embedding needed.
# Trains Word2Vec on the unlabelled corpus plus the training texts and stores
# the resulting vectors in model_word2vec.

import gensim
# read the unlabelled data
path = Path.cwd() / 'NLP_project_unlabelled_slim.txt'
with open(path, 'r', encoding='utf-8') as file:
    text_unlabelled = file.read()

# split each paragraph into one position of a list
text_unlabelled_list = text_unlabelled.split('\n')
# save unlabelled paragraphs into dataframe
df_text_unlabelled = pd.DataFrame(text_unlabelled_list, columns=['text'])
# concat with training data
df_text_unlabelled = pd.DataFrame(pd.concat([df_text.text, df_text_unlabelled.text], ignore_index=True))
# drop duplicates
df_text_unlabelled = df_text_unlabelled.drop_duplicates()
# Drop empty rows (e.g. the one produced by a trailing newline) by content.
# A hard-coded index label raises KeyError as soon as the corpus size changes.
# .copy() so the column assignments below work on an independent frame.
df_text_unlabelled = df_text_unlabelled[df_text_unlabelled['text'].str.strip() != ''].copy()

# tokenize and preprocess using NLTK, preprocessing is still dependent on the variables set at the beginning of the notebook
df_text_unlabelled['tokens'] = df_text_unlabelled['text'].apply(nltk.word_tokenize)
df_text_unlabelled['tokens'] = df_text_unlabelled['tokens'].apply(lowercase_delete_special_characters)

# Convert the data to a list of lists of tokens
data_embedding = list(df_text_unlabelled['tokens'])

# Initialize a Word2Vec model with an embedding size of 300, and 5 epochs
own_embedding = gensim.models.Word2Vec(sentences=data_embedding, vector_size=300, min_count=1, window=5, epochs=5)
# self-trained word embedding (keyed vectors only)
model_word2vec = own_embedding.wv

In [230]:
import numpy as np
from gensim.models import KeyedVectors

# load GloVe word embedding
# NOTE: this rebinds model_word2vec, replacing any self-trained vectors
# assigned by the previous cell if that cell was run as well.
# no_header=True: the file is read as having no leading word2vec header line.
model_word2vec = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False, no_header=True)


In [231]:
def document_vector_func(doc):
    """Return the document vector for a list of tokens.

    The document vector is the element-wise mean of the word vectors of all
    tokens found in ``model_word2vec``; out-of-vocabulary tokens are ignored.
    If no token is in the vocabulary (including an empty ``doc``), a zero
    vector of length ``model_word2vec.vector_size`` is returned, so every
    document gets a vector of the same length.
    """
    # key_to_index is a dict, so each membership test is O(1); testing
    # against the index_to_key list scans the whole vocabulary per token.
    vocabulary = model_word2vec.key_to_index
    word_vectors = [model_word2vec.get_vector(word) for word in doc if word in vocabulary]

    if not word_vectors:
        # np.vstack([]) would raise; fall back to a neutral vector.
        # float32 is assumed to match the stored word vectors (TODO confirm).
        return np.zeros(model_word2vec.vector_size, dtype=np.float32)

    return np.mean(np.vstack(word_vectors), axis=0)

In [238]:
# calculate document vectors for every text frame using document_vector_func
for frame in (df_text, df_text_test, df_text_climate_yes, df_text_test_climate_yes):
    frame['document_vectors'] = frame['tokens'].apply(document_vector_func)

# convert to list
document_vectors = df_text.document_vectors.tolist()
document_vectors_test = df_text_test.document_vectors.tolist()
document_vectors_climate_yes = df_text_climate_yes.document_vectors.tolist()
document_vectors_test_climate_yes = df_text_test_climate_yes.document_vectors.tolist()

# save document vectors in the matching label dataframe, so that they can be
# used when classifying (sentiment/commitment/specificity use the climate == 'yes' rows)
df_climate['document_vector'] = document_vectors
df_climate_test['document_vector'] = document_vectors_test
df_sentiment['document_vector'] = document_vectors_climate_yes
df_sentiment_test['document_vector'] = document_vectors_test_climate_yes
df_commitment['document_vector'] = document_vectors_climate_yes
df_commitment_test['document_vector'] = document_vectors_test_climate_yes
df_specificity['document_vector'] = document_vectors_climate_yes
df_specificity_test['document_vector'] = document_vectors_test_climate_yes

# update the dataframe column order (document_vector, label).
# Columns are selected by name rather than swapped by position, so running
# this cell a second time does not flip the order back.
df_climate = df_climate[['document_vector', 'climate']]
df_climate_test = df_climate_test[['document_vector', 'climate']]
df_sentiment = df_sentiment[['document_vector', 'sentiment']]
df_sentiment_test = df_sentiment_test[['document_vector', 'sentiment']]
df_commitment = df_commitment[['document_vector', 'commitment']]
df_commitment_test = df_commitment_test[['document_vector', 'commitment']]
df_specificity = df_specificity[['document_vector', 'specificity']]
df_specificity_test = df_specificity_test[['document_vector', 'specificity']]


# last expression of the cell: display the climate test frame
df_climate_test

Unnamed: 0,document_vector,climate
0,"[-0.091688745, 0.07141805, -0.12825042, -0.115...",0
1,"[-0.10824829, 0.11942579, -0.011191578, -0.206...",1
2,"[-0.078438394, 0.058795307, 0.0060408427, -0.1...",1
3,"[-0.09102161, 0.16441008, 0.043849185, -0.1823...",1
4,"[-0.030263793, 0.057995267, -0.07471928, -0.19...",1
...,...,...
395,"[-0.11305666, 0.10444998, 0.05659384, -0.19879...",0
396,"[-0.092985146, 0.12477428, 0.0066316435, -0.20...",1
397,"[-0.01266073, 0.009422414, -0.082624316, -0.18...",1
398,"[-0.13721633, 0.1194861, 0.040766872, -0.19293...",1


In [239]:
# climate classification: logistic regression without regularisation on the
# document vectors, evaluated on the validation data

m = LogisticRegression(penalty=None, max_iter=300)
# fit training data
m.fit(df_climate.document_vector.values.tolist(), df_climate.climate)

# optional diagnostic: distribution of the learned coefficients
# sns.histplot(m.coef_[0], kde=True, binwidth=0.1)

# predict test data
pred_class = m.predict(df_climate_test.document_vector.values.tolist())

# compare result of prediction for test data with actual labels of test data
print(classification_report(df_climate_test.climate, pred_class))
# same report as a table (one row per class/average) for the Excel export
result_climate = pd.DataFrame(classification_report(df_climate_test.climate, pred_class, output_dict=True)).transpose()

# Write the result into an Excel file. Append mode (which replaces a sheet of
# the same name) needs an existing workbook, so fall back to write mode when
# the file is missing; if_sheet_exists is only accepted in append mode.
metrics_path = Path("metrics_new.xlsx")
if metrics_path.exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter(metrics_path, engine="openpyxl", **writer_options) as writer:
    result_climate.to_excel(writer, sheet_name="glove_climate_LR")

              precision    recall  f1-score   support

           0       0.74      0.59      0.66        71
           1       0.92      0.95      0.93       329

    accuracy                           0.89       400
   macro avg       0.83      0.77      0.80       400
weighted avg       0.88      0.89      0.89       400



In [240]:
# sentiment classification (0 = opportunity, 1 = neutral, 2 = risk):
# logistic regression without regularisation on the document vectors of the
# climate == 'yes' rows, evaluated on the validation data

m = LogisticRegression(penalty=None, max_iter=300)
# fit training data
m.fit(df_sentiment.document_vector.values.tolist(), df_sentiment.sentiment)

# optional diagnostic: distribution of the learned coefficients
# sns.histplot(m.coef_[0], kde=True, binwidth=0.1)

# predict test data
pred_class = m.predict(df_sentiment_test.document_vector.values.tolist())

# compare result of prediction for test data with actual labels of test data
print(classification_report(df_sentiment_test.sentiment, pred_class))
# same report as a table for the Excel export
# (the variable keeps the name result_climate used by the other cells)
result_climate = pd.DataFrame(classification_report(df_sentiment_test.sentiment, pred_class, output_dict=True)).transpose()

# Write the result into an Excel file. Append mode (which replaces a sheet of
# the same name) needs an existing workbook, so fall back to write mode when
# the file is missing; if_sheet_exists is only accepted in append mode.
metrics_path = Path("metrics_new.xlsx")
if metrics_path.exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter(metrics_path, engine="openpyxl", **writer_options) as writer:
    result_climate.to_excel(writer, sheet_name="glove_sentiment_LR")

              precision    recall  f1-score   support

           0       0.65      0.78      0.71        81
           1       0.73      0.62      0.67       136
           2       0.81      0.84      0.82       112

    accuracy                           0.74       329
   macro avg       0.73      0.75      0.74       329
weighted avg       0.74      0.74      0.73       329



In [241]:
# commitment classification (1 = yes, 0 = no): logistic regression without
# regularisation on the document vectors of the climate == 'yes' rows,
# evaluated on the validation data

m = LogisticRegression(penalty=None, max_iter=300)
# fit training data
m.fit(df_commitment.document_vector.values.tolist(), df_commitment.commitment)

# optional diagnostic: distribution of the learned coefficients
# sns.histplot(m.coef_[0], kde=True, binwidth=0.1)

# predict test data
pred_class = m.predict(df_commitment_test.document_vector.values.tolist())

# compare result of prediction for test data with actual labels of test data
print(classification_report(df_commitment_test.commitment, pred_class))
# same report as a table for the Excel export
# (the variable keeps the name result_climate used by the other cells)
result_climate = pd.DataFrame(classification_report(df_commitment_test.commitment, pred_class, output_dict=True)).transpose()

# Write the result into an Excel file. Append mode (which replaces a sheet of
# the same name) needs an existing workbook, so fall back to write mode when
# the file is missing; if_sheet_exists is only accepted in append mode.
metrics_path = Path("metrics_new.xlsx")
if metrics_path.exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter(metrics_path, engine="openpyxl", **writer_options) as writer:
    result_climate.to_excel(writer, sheet_name="glove_commitment_LR")

              precision    recall  f1-score   support

           0       0.84      0.74      0.79       190
           1       0.69      0.81      0.75       139

    accuracy                           0.77       329
   macro avg       0.77      0.77      0.77       329
weighted avg       0.78      0.77      0.77       329



In [242]:
# specificity classification (1 = specific, 0 = non-specific language):
# logistic regression without regularisation on the document vectors of the
# climate == 'yes' rows, evaluated on the validation data

m = LogisticRegression(penalty=None, max_iter=300)
# fit training data
m.fit(df_specificity.document_vector.values.tolist(), df_specificity.specificity)

# optional diagnostic: distribution of the learned coefficients
# sns.histplot(m.coef_[0], kde=True, binwidth=0.1)

# predict test data
pred_class = m.predict(df_specificity_test.document_vector.values.tolist())

# compare result of prediction for test data with actual labels of test data
print(classification_report(df_specificity_test.specificity, pred_class))
# same report as a table for the Excel export
# (the variable keeps the name result_climate used by the other cells)
result_climate = pd.DataFrame(classification_report(df_specificity_test.specificity, pred_class, output_dict=True)).transpose()

# Write the result into an Excel file. Append mode (which replaces a sheet of
# the same name) needs an existing workbook, so fall back to write mode when
# the file is missing; if_sheet_exists is only accepted in append mode.
metrics_path = Path("metrics_new.xlsx")
if metrics_path.exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter(metrics_path, engine="openpyxl", **writer_options) as writer:
    result_climate.to_excel(writer, sheet_name="glove_specificity_LR")

              precision    recall  f1-score   support

           0       0.87      0.78      0.82       197
           1       0.72      0.82      0.76       132

    accuracy                           0.80       329
   macro avg       0.79      0.80      0.79       329
weighted avg       0.81      0.80      0.80       329

