In [66]:
import pandas as pd
import nltk
from pathlib import Path
nltk.download('punkt')

# ---------------------------------------------------------------------------
# Training split
# ---------------------------------------------------------------------------
# Build the path with pathlib rather than string concatenation: the previous
# '\project_training.json' literal only worked with the Windows separator and
# relied on '\p' not being a recognised escape sequence.
path = Path.cwd() / 'project_training.json'

# Let pandas open and decode the file itself. Passing the raw JSON text to
# read_json is deprecated, and a manual open() without an explicit encoding
# falls back to the platform's locale encoding.
df_train = pd.read_json(path)

# One-column frames with the text and the climate label of every row.
df_text = pd.DataFrame(df_train, columns=['text'])
df_climate = pd.DataFrame(df_train, columns=['climate'])

# The remaining tasks only use rows whose climate label is 'yes'.
# The filter must run before the label is converted to 0/1 below.
df_filtered = df_train.loc[df_climate['climate'] == 'yes']

# One-column frames (text and one per task label) for the climate == 'yes' rows.
df_text_climate_yes = pd.DataFrame(df_filtered, columns=['text'])
df_sentiment = pd.DataFrame(df_filtered, columns=['sentiment'])
df_commitment = pd.DataFrame(df_filtered, columns=['commitment'])
df_specificity = pd.DataFrame(df_filtered, columns=['specificity'])

# Turn all string labels into numerical labels.
# climate: yes/no
df_climate = df_climate.replace({'yes': 1, 'no': 0})
# sentiment: opportunity/neutral/risk
df_sentiment = df_sentiment.replace({'opportunity': 0, 'neutral': 1, 'risk': 2})
# commitment: yes/no
df_commitment = df_commitment.replace({'yes': 1, 'no': 0})
# specificity: specific language/non-specific language
df_specificity = df_specificity.replace({'spec': 1, 'non': 0})

# ---------------------------------------------------------------------------
# Validation split (same processing as the training split)
# ---------------------------------------------------------------------------
path = Path.cwd() / 'project_validation.json'
df_test = pd.read_json(path)

df_text_test = pd.DataFrame(df_test, columns=['text'])
df_climate_test = pd.DataFrame(df_test, columns=['climate'])

# Keep rows whose climate label is 'yes' (again before the 0/1 conversion).
df_filtered_test = df_test.loc[df_climate_test['climate'] == 'yes']
df_text_test_climate_yes = pd.DataFrame(df_filtered_test, columns=['text'])
df_sentiment_test = pd.DataFrame(df_filtered_test, columns=['sentiment'])
df_commitment_test = pd.DataFrame(df_filtered_test, columns=['commitment'])
df_specificity_test = pd.DataFrame(df_filtered_test, columns=['specificity'])

# Same label encodings as for the training split.
df_climate_test = df_climate_test.replace({'yes': 1, 'no': 0})
df_sentiment_test = df_sentiment_test.replace({'opportunity': 0, 'neutral': 1, 'risk': 2})
df_commitment_test = df_commitment_test.replace({'yes': 1, 'no': 0})
df_specificity_test = df_specificity_test.replace({'spec': 1, 'non': 0})

def lowercase_delete_special_characters(tokens):
    """Normalise a list of tokens.

    Purely alphabetic tokens are lowercased, purely numeric tokens are kept
    unchanged, and every other token (punctuation, hyphenated words, mixed
    alphanumerics, empty strings, ...) is dropped.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [
        token.lower() if token.isalpha() else token
        for token in tokens
        if token.isalpha() or token.isnumeric()
    ]


# Tokenise the 'text' column of every text frame with NLTK, then normalise the
# tokens (lowercase alphabetic tokens, keep numbers, drop everything else).
# The result is stored in a new 'tokens' column of each frame.
for frame in (df_text, df_text_climate_yes, df_text_test, df_text_test_climate_yes):
    raw_tokens = frame['text'].apply(nltk.word_tokenize)
    frame['tokens'] = raw_tokens.apply(lowercase_delete_special_characters)

df_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ThreadTheRipper\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,tokens
0,The accelerator programs have sub-portfolios o...,"[the, accelerator, programs, have, of, focused..."
1,"Also by means of BNDES Finem, we offer credit ...","[also, by, means, of, bndes, finem, we, offer,..."
2,Climate change Climate change exposes UPM to v...,"[climate, change, climate, change, exposes, up..."
3,Several tools and methodologies aimed at asses...,"[several, tools, and, methodologies, aimed, at..."
4,We worked with the UK government to accelerate...,"[we, worked, with, the, uk, government, to, ac..."
...,...,...
395,"At the beginning of 2019, VINCI Airports signe...","[at, the, beginning, of, 2019, vinci, airports..."
396,We have also signed up to the Partnership for ...,"[we, have, also, signed, up, to, the, partners..."
397,Suzano also is involved and spearheads externa...,"[suzano, also, is, involved, and, spearheads, ..."
398,Risks to the Group’s reputation Risks include ...,"[risks, to, the, group, s, reputation, risks, ..."


In [68]:
import numpy as np
from gensim.models import KeyedVectors

# Load the pre-trained embeddings from a plain-text file (binary=False) that
# has no word2vec-style header line (no_header=True). The file name suggests
# 300-dimensional GloVe 6B vectors.
model_word2vec = KeyedVectors.load_word2vec_format(
    'glove.6B.300d.txt',
    binary=False,
    no_header=True,
)
# Quick sanity check, if wanted:
# print(model_word2vec.most_similar(positive=['sustainability']))

In [69]:
def document_vector_func(doc):
    """Average the embedding vectors of a document's in-vocabulary tokens.

    Args:
        doc: iterable of token strings.

    Returns:
        A 1-D numpy array holding the element-wise mean of the vectors of all
        tokens known to ``model_word2vec``. If none of the tokens is in the
        vocabulary (or ``doc`` is empty), an all-zero vector of length
        ``model_word2vec.vector_size`` is returned.
    """
    # `key_to_index` is a dict, so membership is a hash lookup; testing against
    # the `index_to_key` list scanned the whole vocabulary for every token.
    vocabulary = model_word2vec.key_to_index
    known_vectors = [model_word2vec.get_vector(word) for word in doc if word in vocabulary]

    if not known_vectors:
        # np.vstack raises on an empty list, which would abort the whole
        # DataFrame.apply for a single out-of-vocabulary document.
        # float32 is assumed to match the loaded vectors' dtype - TODO confirm.
        return np.zeros(model_word2vec.vector_size, dtype=np.float32)

    return np.mean(np.vstack(known_vectors), axis=0)

# Add a 'document_vectors' column (one averaged embedding per row) to every
# tokenised frame.
for frame in (df_text, df_text_test, df_text_climate_yes, df_text_test_climate_yes):
    frame['document_vectors'] = frame['tokens'].apply(document_vector_func)

# Plain Python lists of the per-document vectors, one list per frame.
document_vectors = df_text['document_vectors'].tolist()
document_vectors_test = df_text_test['document_vectors'].tolist()
document_vectors_climate_yes = df_text_climate_yes['document_vectors'].tolist()
document_vectors_test_climate_yes = df_text_test_climate_yes['document_vectors'].tolist()

In [70]:
# Build one dataframe per task that pairs every label with its document vector.
# The models are evaluated with cross-validation, so the training and
# validation splits are pooled instead of being kept separate.


def _labels_with_vectors(train_labels, test_labels, vectors):
    """Stack two one-column label frames and attach one document vector per row.

    Returns a frame whose first column is 'document_vector' and whose second
    column is the label column of the inputs.
    """
    pooled = pd.concat([train_labels, test_labels])
    pooled['document_vector'] = vectors
    # Swap the two columns so that the vector comes first, the label second.
    return pooled.iloc[:, [1, 0]]


# Vectors in the same row order as the concatenated label frames
# (training rows first, validation rows second).
document_vectors_all = document_vectors + document_vectors_test
document_vectors_climate_yes_all = document_vectors_climate_yes + document_vectors_test_climate_yes

df_climate_all = _labels_with_vectors(df_climate, df_climate_test, document_vectors_all)
df_sentiment_all = _labels_with_vectors(df_sentiment, df_sentiment_test, document_vectors_climate_yes_all)
df_commitment_all = _labels_with_vectors(df_commitment, df_commitment_test, document_vectors_climate_yes_all)
df_specificity_all = _labels_with_vectors(df_specificity, df_specificity_test, document_vectors_climate_yes_all)

In [71]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# climate classification

# Per-fold macro-averaged (precision, recall, f1, support) tuples and accuracies.
results = []
accuracy = []

# A single 5-fold cross-validation; shuffling with a fixed seed keeps the
# folds identical between runs.
n = 5
kf = KFold(n_splits=n, random_state=72, shuffle=True)

# KFold yields positional row indices for each training / test partition.
for train_index, test_index in kf.split(df_climate_all):
    train_documents = df_climate_all.iloc[train_index]
    test_documents = df_climate_all.iloc[test_index]

    # Tuned logistic regression. The untuned baseline it was compared against
    # was LogisticRegression(penalty=None, max_iter=300).
    m = LogisticRegression(penalty='l2', max_iter=300, class_weight='balanced', C=8.0)
    m.fit(train_documents['document_vector'].values.tolist(), train_documents.climate)

    pred_class = m.predict(test_documents['document_vector'].values.tolist())

    print(classification_report(test_documents.climate, pred_class))
    # Collect the fold's metrics for the averages computed below.
    results.append(precision_recall_fscore_support(test_documents.climate, pred_class, average='macro'))
    accuracy.append(accuracy_score(test_documents.climate, pred_class))

# Average every metric over all folds that were actually run, so the
# computation stays correct when `n` is changed.
avg_precision = np.mean([fold_metrics[0] for fold_metrics in results])
avg_recall = np.mean([fold_metrics[1] for fold_metrics in results])
avg_f = np.mean([fold_metrics[2] for fold_metrics in results])
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

# Shape the averages as a one-row table so pandas can store them in Excel.
result_climate = {"macro avg":{"average precision" : avg_precision,
                               "average recall" : avg_recall,
                               "average f1" : avg_f,
                               "average accuracy" : avg_acc
                               }}
result_climate = pd.DataFrame(result_climate).transpose()

# Append mode needs an existing workbook and `if_sheet_exists` is only accepted
# in append mode, so fall back to creating the file on the first run.
if Path("metrics_new.xlsx").exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter("metrics_new.xlsx", engine="openpyxl", **writer_options) as writer:
    result_climate.to_excel(writer, sheet_name="glove_climate_LR")

              precision    recall  f1-score   support

           0       0.76      0.87      0.81        30
           1       0.97      0.94      0.95       130

    accuracy                           0.93       160
   macro avg       0.87      0.90      0.88       160
weighted avg       0.93      0.93      0.93       160

              precision    recall  f1-score   support

           0       0.72      0.85      0.78        27
           1       0.97      0.93      0.95       133

    accuracy                           0.92       160
   macro avg       0.84      0.89      0.86       160
weighted avg       0.93      0.92      0.92       160

              precision    recall  f1-score   support

           0       0.61      0.85      0.71        26
           1       0.97      0.90      0.93       134

    accuracy                           0.89       160
   macro avg       0.79      0.87      0.82       160
weighted avg       0.91      0.89      0.89       160

              preci

In [72]:
# sentiment classification

# Per-fold macro-averaged (precision, recall, f1, support) tuples and accuracies.
results = []
accuracy = []

# A single 5-fold cross-validation with a fixed shuffle seed.
n = 5
kf = KFold(n_splits=n, random_state=72, shuffle=True)

for train_index, test_index in kf.split(df_sentiment_all):
    train_documents = df_sentiment_all.iloc[train_index]
    test_documents = df_sentiment_all.iloc[test_index]

    # Tuned logistic regression. The untuned baseline it was compared against
    # was LogisticRegression(penalty=None, max_iter=300).
    m_sent = LogisticRegression(penalty='l2', max_iter=300)
    m_sent.fit(train_documents['document_vector'].values.tolist(), train_documents.sentiment)

    pred_class = m_sent.predict(test_documents['document_vector'].values.tolist())

    print(classification_report(test_documents.sentiment, pred_class))
    # Collect the fold's metrics for the averages computed below.
    results.append(precision_recall_fscore_support(test_documents.sentiment, pred_class, average='macro'))
    accuracy.append(accuracy_score(test_documents.sentiment, pred_class))

# Average every metric over all folds that were actually run, so the
# computation stays correct when `n` is changed.
avg_precision = np.mean([fold_metrics[0] for fold_metrics in results])
avg_recall = np.mean([fold_metrics[1] for fold_metrics in results])
avg_f = np.mean([fold_metrics[2] for fold_metrics in results])
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

# Shape the averages as a one-row table so pandas can store them in Excel.
result_sent = {"macro avg":{"average precision" : avg_precision,
                               "average recall" : avg_recall,
                               "average f1" : avg_f,
                               "average accuracy" : avg_acc
                               }}
result_sent = pd.DataFrame(result_sent).transpose()

# Append mode needs an existing workbook and `if_sheet_exists` is only accepted
# in append mode, so fall back to creating the file on the first run.
if Path("metrics_new.xlsx").exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter("metrics_new.xlsx", engine="openpyxl", **writer_options) as writer:
    result_sent.to_excel(writer, sheet_name="glove_sentiment_LR")

              precision    recall  f1-score   support

           0       0.65      0.65      0.65        40
           1       0.70      0.73      0.71        55
           2       0.92      0.87      0.89        38

    accuracy                           0.74       133
   macro avg       0.76      0.75      0.75       133
weighted avg       0.75      0.74      0.75       133

              precision    recall  f1-score   support

           0       0.79      0.59      0.68        32
           1       0.73      0.88      0.80        51
           2       0.96      0.90      0.93        49

    accuracy                           0.82       132
   macro avg       0.82      0.79      0.80       132
weighted avg       0.83      0.82      0.82       132

              precision    recall  f1-score   support

           0       0.92      0.66      0.77        35
           1       0.66      0.87      0.75        47
           2       0.91      0.82      0.86        50

    accuracy        

In [73]:
# commitment classification

# Per-fold macro-averaged (precision, recall, f1, support) tuples and accuracies.
results = []
accuracy = []

# A single 5-fold cross-validation with a fixed shuffle seed.
n = 5
kf = KFold(n_splits=n, random_state=72, shuffle=True)

# Other settings noted during tuning:
#   penalty='l1', C=1.0, solver='liblinear'
#   penalty='none'
#   penalty='l2', C=1.0, max_iter=100
for train_index, test_index in kf.split(df_commitment_all):
    train_documents = df_commitment_all.iloc[train_index]
    test_documents = df_commitment_all.iloc[test_index]

    # Tuned logistic regression. The untuned baseline it was compared against
    # was LogisticRegression(penalty=None, max_iter=300).
    m_com = LogisticRegression(penalty='l2', max_iter=300, C=5.0)
    m_com.fit(train_documents['document_vector'].values.tolist(), train_documents.commitment)

    pred_class = m_com.predict(test_documents['document_vector'].values.tolist())

    print(classification_report(test_documents.commitment, pred_class))
    # Collect the fold's metrics for the averages computed below.
    results.append(precision_recall_fscore_support(test_documents.commitment, pred_class, average='macro'))
    accuracy.append(accuracy_score(test_documents.commitment, pred_class))

# Average every metric over all folds that were actually run, so the
# computation stays correct when `n` is changed.
avg_precision = np.mean([fold_metrics[0] for fold_metrics in results])
avg_recall = np.mean([fold_metrics[1] for fold_metrics in results])
avg_f = np.mean([fold_metrics[2] for fold_metrics in results])
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

# Shape the averages as a one-row table so pandas can store them in Excel.
result_commitment = {"macro avg":{"average precision" : avg_precision,
                            "average recall" : avg_recall,
                            "average f1" : avg_f,
                            "average accuracy" : avg_acc
                            }}
result_commitment = pd.DataFrame(result_commitment).transpose()

# Append mode needs an existing workbook and `if_sheet_exists` is only accepted
# in append mode, so fall back to creating the file on the first run.
if Path("metrics_new.xlsx").exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter("metrics_new.xlsx", engine="openpyxl", **writer_options) as writer:
    result_commitment.to_excel(writer, sheet_name="glove_commitment_LR")

              precision    recall  f1-score   support

           0       0.88      0.78      0.83        74
           1       0.76      0.86      0.81        59

    accuracy                           0.82       133
   macro avg       0.82      0.82      0.82       133
weighted avg       0.83      0.82      0.82       133

              precision    recall  f1-score   support

           0       0.84      0.88      0.86        72
           1       0.84      0.80      0.82        60

    accuracy                           0.84       132
   macro avg       0.84      0.84      0.84       132
weighted avg       0.84      0.84      0.84       132

              precision    recall  f1-score   support

           0       0.79      0.91      0.85        70
           1       0.88      0.73      0.80        62

    accuracy                           0.83       132
   macro avg       0.84      0.82      0.82       132
weighted avg       0.83      0.83      0.82       132

              preci

In [74]:
# specificity classification

# Per-fold macro-averaged (precision, recall, f1, support) tuples and accuracies.
results = []
accuracy = []

# A single 5-fold cross-validation with a fixed shuffle seed.
n = 5
kf = KFold(n_splits=n, random_state=72, shuffle=True)

# Other settings noted during tuning:
#   penalty='l1', C=1.0, solver='liblinear'
#   penalty='none'
#   penalty='l2', C=1.0, max_iter=100
for train_index, test_index in kf.split(df_specificity_all):
    train_documents = df_specificity_all.iloc[train_index]
    test_documents = df_specificity_all.iloc[test_index]

    # Tuned logistic regression (L1 penalty, hence the liblinear solver). The
    # untuned baseline it was compared against was
    # LogisticRegression(penalty=None, max_iter=300).
    m_spec = LogisticRegression(penalty='l1', C=5.0, solver='liblinear', max_iter=300)
    m_spec.fit(train_documents['document_vector'].values.tolist(), train_documents.specificity)

    pred_class = m_spec.predict(test_documents['document_vector'].values.tolist())

    print(classification_report(test_documents.specificity, pred_class))
    # Collect the fold's metrics for the averages computed below.
    results.append(precision_recall_fscore_support(test_documents.specificity, pred_class, average='macro'))
    accuracy.append(accuracy_score(test_documents.specificity, pred_class))

# Average every metric over all folds that were actually run, so the
# computation stays correct when `n` is changed.
avg_precision = np.mean([fold_metrics[0] for fold_metrics in results])
avg_recall = np.mean([fold_metrics[1] for fold_metrics in results])
avg_f = np.mean([fold_metrics[2] for fold_metrics in results])
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

# Shape the averages as a one-row table so pandas can store them in Excel.
result_specificity = {"macro avg":{"average precision" : avg_precision,
                                  "average recall" : avg_recall,
                                  "average f1" : avg_f,
                                  "average accuracy" : avg_acc
                                  }}
result_specificity = pd.DataFrame(result_specificity).transpose()

# Append mode needs an existing workbook and `if_sheet_exists` is only accepted
# in append mode, so fall back to creating the file on the first run.
if Path("metrics_new.xlsx").exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter("metrics_new.xlsx", engine="openpyxl", **writer_options) as writer:
    result_specificity.to_excel(writer, sheet_name="glove_specificity_LR")

              precision    recall  f1-score   support

           0       0.87      0.83      0.85        72
           1       0.81      0.85      0.83        61

    accuracy                           0.84       133
   macro avg       0.84      0.84      0.84       133
weighted avg       0.84      0.84      0.84       133

              precision    recall  f1-score   support

           0       0.78      0.88      0.83        74
           1       0.82      0.69      0.75        58

    accuracy                           0.80       132
   macro avg       0.80      0.78      0.79       132
weighted avg       0.80      0.80      0.79       132

              precision    recall  f1-score   support

           0       0.86      0.91      0.89        82
           1       0.84      0.76      0.80        50

    accuracy                           0.86       132
   macro avg       0.85      0.84      0.84       132
weighted avg       0.86      0.86      0.85       132

              preci