In [63]:
from io import StringIO
from pathlib import Path

import nltk
import pandas as pd
nltk.download('punkt')

# Build the file location with pathlib. The previous '\project_training.json'
# literal relied on the invalid escape sequence '\p' and on a Windows-only
# separator, so the notebook could not run on other platforms.
path = str(Path.cwd() / 'project_training.json')
# Read the JSON file
with open(path, 'r') as f:
    data = f.read()

# Load the JSON data into a dataframe. The text is wrapped in StringIO because
# passing a literal JSON string to read_json is deprecated in recent pandas.
df_train = pd.read_json(StringIO(data))

# Create dataframes for the text and the climate label
df_text = pd.DataFrame(df_train, columns=['text'])
df_climate = pd.DataFrame(df_train, columns=['climate'])
# Keep rows where df_climate is 'yes'; the remaining three labels are only
# modelled on these climate-related paragraphs
df_filtered = df_train.loc[df_climate['climate'] == 'yes']

# Split the filtered dataframe into separate dataframes
df_text_climate_yes = pd.DataFrame(df_filtered, columns=['text'])
df_sentiment = pd.DataFrame(df_filtered, columns=['sentiment'])
df_commitment = pd.DataFrame(df_filtered, columns=['commitment'])
df_specificity = pd.DataFrame(df_filtered, columns=['specificity'])

# turn all labels into numerical labels
df_climate = df_climate.replace({'yes': 1, 'no': 0})
# opportunity/neutral/risk
df_sentiment = df_sentiment.replace({'opportunity': 0, 'neutral': 1, 'risk': 2})
# yes/no
df_commitment = df_commitment.replace({'yes': 1, 'no': 0})
# specific language/non-specific language
df_specificity = df_specificity.replace({'spec': 1, 'non': 0})

# Same procedure for the validation file
path = str(Path.cwd() / 'project_validation.json')
with open(path, 'r') as f_test:
    data_test = f_test.read()

# Load the JSON data into a dataframe
df_test = pd.read_json(StringIO(data_test))
df_text_test = pd.DataFrame(df_test, columns=['text'])
df_climate_test = pd.DataFrame(df_test, columns=['climate'])
# Keep rows where df_climate_test is 'yes'
df_filtered_test = df_test.loc[df_climate_test['climate'] == 'yes']
df_text_test_climate_yes = pd.DataFrame(df_filtered_test, columns=['text'])
df_sentiment_test = pd.DataFrame(df_filtered_test, columns=['sentiment'])
df_commitment_test = pd.DataFrame(df_filtered_test, columns=['commitment'])
df_specificity_test = pd.DataFrame(df_filtered_test, columns=['specificity'])
# same numerical encoding as for the training labels
df_climate_test = df_climate_test.replace({'yes': 1, 'no': 0})
# opportunity/neutral/risk
df_sentiment_test = df_sentiment_test.replace({'opportunity': 0, 'neutral': 1, 'risk': 2})
# yes/no
df_commitment_test = df_commitment_test.replace({'yes': 1, 'no': 0})
# specific language/non-specific language
df_specificity_test = df_specificity_test.replace({'spec': 1, 'non': 0})

def lowercase_delete_special_characters(tokens):
    """Normalise a list of tokens.

    Purely alphabetic tokens are lower-cased, purely numeric tokens are kept
    unchanged, and every other token (punctuation, hyphenated or otherwise
    mixed tokens) is dropped.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [
        token.lower() if token.isalpha() else token
        for token in tokens
        if token.isalpha() or token.isnumeric()
    ]


# Tokenise the paragraphs of every text frame with NLTK and normalise the
# resulting tokens (lower-case words, keep numbers, drop everything else).
for frame in (df_text, df_text_climate_yes, df_text_test, df_text_test_climate_yes):
    raw_tokens = frame['text'].apply(nltk.word_tokenize)
    frame['tokens'] = raw_tokens.apply(lowercase_delete_special_characters)

df_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ThreadTheRipper\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,tokens
0,The accelerator programs have sub-portfolios o...,"[the, accelerator, programs, have, of, focused..."
1,"Also by means of BNDES Finem, we offer credit ...","[also, by, means, of, bndes, finem, we, offer,..."
2,Climate change Climate change exposes UPM to v...,"[climate, change, climate, change, exposes, up..."
3,Several tools and methodologies aimed at asses...,"[several, tools, and, methodologies, aimed, at..."
4,We worked with the UK government to accelerate...,"[we, worked, with, the, uk, government, to, ac..."
...,...,...
395,"At the beginning of 2019, VINCI Airports signe...","[at, the, beginning, of, 2019, vinci, airports..."
396,We have also signed up to the Partnership for ...,"[we, have, also, signed, up, to, the, partners..."
397,Suzano also is involved and spearheads externa...,"[suzano, also, is, involved, and, spearheads, ..."
398,Risks to the Group’s reputation Risks include ...,"[risks, to, the, group, s, reputation, risks, ..."


In [64]:
import numpy as np
from tensorflow.keras.utils import pad_sequences
from gensim.models import KeyedVectors

# Load the pretrained 300-dimensional GloVe vectors as gensim KeyedVectors.
# The file is plain text (binary=False); no_header=True is passed because,
# presumably, the GloVe file lacks the "<vocab size> <dimensions>" first line
# that the word2vec text format expects -- TODO confirm against the file.
# NOTE: despite the variable name, these are GloVe embeddings, not word2vec.
model_word2vec = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False, no_header=True)
# Optional sanity check of the loaded vectors:
# print(model_word2vec.most_similar(positive=['sustainability']))

In [66]:
# Get the number of dimensions in the Word2Vec model
num_dimensions = model_word2vec.vector_size

# Vocabulary lookup as a dict (word -> row index). Membership tests on the
# index_to_key list scan the whole vocabulary for every single token; the dict
# gives the same answer in constant time.
key_to_index = model_word2vec.key_to_index


def tokens_to_embedding_ids(tokens):
    """Map tokens to embedding indices, shifted by one.

    Tokens that are not in the embedding vocabulary are dropped. The +1 shift
    keeps index 0 free, because 0 is the value the padding step fills in.

    Args:
        tokens: list of normalised string tokens of one paragraph.

    Returns:
        List of integer indices (vocabulary index + 1) of the known tokens.
    """
    return [key_to_index[token] + 1 for token in tokens if token in key_to_index]


# define maximum number of tokens for each paragraph, number is based on preprocessing and can change if preprocessing is modified
# (pad_sequences truncates longer paragraphs, so keep this >= the maxima printed below)
max_len_for_padding = 420
# pad lists of tokens at the end to make them uniform in length
padding_type = 'post'


def pad_token_ids(token_ids):
    """Pad (or truncate) one list of indices to max_len_for_padding entries."""
    return pad_sequences([token_ids], maxlen=max_len_for_padding, padding=padding_type)[0]


text_frames = (df_text, df_text_test, df_text_climate_yes, df_text_test_climate_yes)

for frame in text_frames:
    # replace the tokens with their numerical representations
    frame['tokens_num'] = frame['tokens'].apply(tokens_to_embedding_ids)
    # calculate number of tokens per paragraph
    frame['num_of_tokens'] = frame['tokens_num'].apply(len)
    # print the highest number of tokens for a paragraph (needed for padding to make paragraphs the same length)
    print(frame['num_of_tokens'].max())

# padding of token lists
for frame in text_frames:
    frame['tokens_num'] = frame['tokens_num'].apply(pad_token_ids)

df_text

418
269
418
269


Unnamed: 0,text,tokens,tokens_num,num_of_tokens
0,The accelerator programs have sub-portfolios o...,"[the, accelerator, programs, have, of, focused...","[1, 21736, 1010, 34, 4, 2336, 9161, 1808, 22, ...",28
1,"Also by means of BNDES Finem, we offer credit ...","[also, by, means, of, bndes, finem, we, offer,...","[53, 22, 890, 4, 107047, 54, 902, 1165, 26, 1,...",31
2,Climate change Climate change exposes UPM to v...,"[climate, change, climate, change, exposes, up...","[1949, 512, 1949, 512, 24518, 49056, 5, 2166, ...",129
3,Several tools and methodologies aimed at asses...,"[several, tools, and, methodologies, aimed, at...","[202, 4316, 6, 40100, 1637, 23, 13228, 1, 4764...",289
4,We worked with the UK government to accelerate...,"[we, worked, with, the, uk, government, to, ac...","[54, 763, 18, 1, 2047, 79, 5, 8710, 1, 3670, 5...",61
...,...,...,...,...
395,"At the beginning of 2019, VINCI Airports signe...","[at, the, beginning, of, 2019, vinci, airports...","[23, 1, 1085, 4, 40469, 17581, 4949, 759, 8, 9...",66
396,We have also signed up to the Partnership for ...,"[we, have, also, signed, up, to, the, partners...","[54, 34, 53, 759, 61, 5, 1, 2884, 11, 4137, 35...",60
397,Suzano also is involved and spearheads externa...,"[suzano, also, is, involved, and, spearheads, ...","[219104, 53, 15, 792, 6, 59691, 3752, 1219, 12...",28
398,Risks to the Group’s reputation Risks include ...,"[risks, to, the, group, s, reputation, risks, ...","[3344, 5, 1, 130, 1535, 3148, 3344, 489, 2538,...",56


In [67]:
# Stack training and validation rows into one data set per classification task.
# Each resulting frame has the padded index vector first and the label second.

# padded index vectors: all paragraphs / climate-related paragraphs only
df_text_all = pd.concat([df_text['tokens_num'], df_text_test['tokens_num']])
df_text_climate_yes_all = pd.concat([df_text_climate_yes['tokens_num'], df_text_test_climate_yes['tokens_num']])

# The label frames and the vector series are concatenated in the same order,
# so their indexes are identical and the vectors line up with their labels.
df_climate_all = (
    pd.concat([df_climate, df_climate_test])
    .assign(document_vector=df_text_all)
    .iloc[:, [1, 0]]
)

df_sentiment_all = (
    pd.concat([df_sentiment, df_sentiment_test])
    .assign(document_vector=df_text_climate_yes_all)
    .iloc[:, [1, 0]]
)

df_commitment_all = (
    pd.concat([df_commitment, df_commitment_test])
    .assign(document_vector=df_text_climate_yes_all)
    .iloc[:, [1, 0]]
)

df_specificity_all = (
    pd.concat([df_specificity, df_specificity_test])
    .assign(document_vector=df_text_climate_yes_all)
    .iloc[:, [1, 0]]
)

df_climate_all

Unnamed: 0,document_vector,climate
0,"[1, 21736, 1010, 34, 4, 2336, 9161, 1808, 22, ...",1
1,"[53, 22, 890, 4, 107047, 54, 902, 1165, 26, 1,...",0
2,"[1949, 512, 1949, 512, 24518, 49056, 5, 2166, ...",1
3,"[202, 4316, 6, 40100, 1637, 23, 13228, 1, 4764...",1
4,"[54, 763, 18, 1, 2047, 79, 5, 8710, 1, 3670, 5...",1
...,...,...
395,"[28155, 5920, 1089, 845, 7, 110, 1421, 145, 62...",0
396,"[207, 2044, 5679, 20740, 94834, 13160, 33, 565...",1
397,"[448, 14, 163, 300, 4, 635, 5325, 5473, 54, 75...",1
398,"[7, 40469, 203475, 15800, 1, 43010, 1765, 3360...",1


In [68]:
# necessary to check whether GPU was detected:
# prints every device TensorFlow reports; a "/device:GPU:0" entry in the
# output means training can run on the GPU.
# NOTE(review): device_lib is imported from the tensorflow.python namespace;
# tf.config.list_physical_devices('GPU') looks like the documented
# alternative -- confirm before switching.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5527922439205444394
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6277824512
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9287549024288082359
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2070, pci bus id: 0000:42:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [73]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# 5-fold cross-validation of the CNN for the climate label.
# results: one (precision, recall, f1, support) macro tuple per fold
# accuracy: one accuracy value per fold
results = []
accuracy = []

n = 5
kf = KFold(n_splits=n, random_state=72, shuffle=True)

# Width of the softmax layer. Taken from the complete label column so every
# fold builds the same output layer, even if a training split misses a class.
num_classes = df_climate_all['climate'].nunique()

# Embedding matrix used to initialise the first layer. It is identical for all
# folds, so it is built once instead of once per fold.
# Row 0 is all zeros and is what the padding value 0 looks up; the pretrained
# vectors follow, shifted by one row to match the +1 offset applied when the
# tokens were converted to indices. The zero row uses the vectors' own dtype
# so the stacked matrix is not silently widened.
pretrained_vectors = model_word2vec.vectors
weights = np.vstack((
    np.zeros((1, pretrained_vectors.shape[1]), dtype=pretrained_vectors.dtype),
    pretrained_vectors,
))

# batch size depends on the classification task, here 50 was determined as well performing
batch_size = 50

for train_index, test_index in kf.split(df_climate_all):
    # Release the previous fold's model: clear_session() resets the global
    # Keras state, gc.collect() then frees objects that are no longer referenced.
    tf.keras.backend.clear_session()
    gc.collect()

    train_documents = df_climate_all.iloc[train_index]
    test_documents = df_climate_all.iloc[test_index]

    # Hold out 20% of the fold's training rows as validation data for early
    # stopping. The split is seeded like KFold so the partition is repeatable
    # (weight initialisation and dropout remain unseeded).
    X_train, X_val, y_train, y_val = train_test_split(
        train_documents['document_vector'], train_documents['climate'],
        test_size=0.2, random_state=72)

    # the numerical representations for each text are saved as one list per row;
    # the network needs one numerical word representation per column instead
    # (rows still represent the documents, zeros are the padding)
    X_train = pd.DataFrame(X_train.tolist())
    X_val = pd.DataFrame(X_val.tolist())
    y_train = pd.DataFrame(y_train.tolist())
    y_val = pd.DataFrame(y_val.tolist())

    # avoid overfitting: stop once val_loss has not improved for 3 epochs
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3)]

    # choose Adam optimizer with a learning rate of 1e-3 (fresh instance per model)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    # model definition
    model = Sequential()
    # embedding layer, make it trainable to achieve better results
    model.add(layers.Embedding(input_dim=weights.shape[0],
                               output_dim=weights.shape[1],
                               input_length=max_len_for_padding,
                               weights=[weights],
                               trainable=True))
    # dropout is a form of regularization
    model.add(layers.Dropout(0.5))
    # convolutional 1D layers as we have text, more dimensions needed for e.g. images
    # numbers represent no. of filters and kernel size
    model.add(layers.Conv1D(256, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.Conv1D(256, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.MaxPooling1D(pool_size=3))
    model.add(layers.Conv1D(512, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.Conv1D(512, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))
    # print summary of model
    model.summary()

    # compile the model
    model.compile(
        loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["acc"]
    )

    # train model and save metrics acc & val_acc
    history = model.fit(X_train, y_train, callbacks=callbacks, batch_size=batch_size, epochs=1000, validation_data=(X_val, y_val))

    # print acc & loss of the last epoch that was run
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # convert testing data to be able to feed it to the model to predict
    X_test = pd.DataFrame(test_documents.document_vector.to_list())

    # Predict the probabilities for each class
    y_probs = model.predict(X_test)
    # Get the class with the highest probability for each input data
    y_pred = np.argmax(y_probs, axis=1)
    print(classification_report(test_documents.climate, y_pred))

    # append the results to the arrays
    results.append(precision_recall_fscore_support(test_documents.climate, y_pred, average='macro'))
    accuracy.append(accuracy_score(test_documents.climate, y_pred))

# calculate averages over all evaluated folds; each entry of `results` is a
# (precision, recall, f1, support) tuple, so the code no longer assumes that
# exactly five folds were run
avg_precision = np.mean([fold_scores[0] for fold_scores in results])
avg_recall = np.mean([fold_scores[1] for fold_scores in results])
avg_f = np.mean([fold_scores[2] for fold_scores in results])
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

result_climate = {"macro avg":{"average precision" : avg_precision,
                               "average recall" : avg_recall,
                               "average f1" : avg_f,
                               "average accuracy" : avg_acc
                               }}
result_climate = pd.DataFrame(result_climate).transpose()

# Append to the shared workbook and replace this task's sheet if it already
# exists. Append mode requires an existing file, so the first run creates the
# workbook in write mode instead (if_sheet_exists is only valid when appending).
metrics_file = "metrics_new.xlsx"
if Path(metrics_file).exists():
    excel_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    excel_options = {"mode": "w"}
with pd.ExcelWriter(metrics_file, engine="openpyxl", **excel_options) as writer:
    result_climate.to_excel(writer, sheet_name="glove_climate_NNwEmb")

Model: "sequential_50"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_50 (Embedding)    (None, 420, 300)          120000300 
                                                                 
 dropout_100 (Dropout)       (None, 420, 300)          0         
                                                                 
 conv1d_200 (Conv1D)         (None, 420, 256)          230656    
                                                                 
 conv1d_201 (Conv1D)         (None, 420, 256)          196864    
                                                                 
 max_pooling1d_50 (MaxPoolin  (None, 140, 256)         0         
 g1D)                                                            
                                                                 
 conv1d_202 (Conv1D)         (None, 140, 512)          393728    
                                                     

In [74]:
# Store the climate classifier in the current working directory. `model` is
# the network built in the last cross-validation fold of the cell above.
model.save(f"{Path.cwd()}/Climate_model")
# To load it again for further training or prediction:
# reconstructed_model = tf.keras.models.load_model("Climate_model")



INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Climate_model\assets


INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Climate_model\assets


In [9]:
# Next classification task - Sentiment

In [75]:
# 5-fold cross-validation of the CNN for the sentiment label
# (climate-related paragraphs only).
# results: one (precision, recall, f1, support) macro tuple per fold
# accuracy: one accuracy value per fold
results = []
accuracy = []

n = 5
kf = KFold(n_splits=n, random_state=72, shuffle=True)

# Width of the softmax layer. Taken from the complete label column so every
# fold builds the same output layer, even if a training split misses a class.
num_classes = df_sentiment_all['sentiment'].nunique()

# Embedding matrix used to initialise the first layer. It is identical for all
# folds, so it is built once instead of once per fold.
# Row 0 is all zeros and is what the padding value 0 looks up; the pretrained
# vectors follow, shifted by one row to match the +1 offset applied when the
# tokens were converted to indices. The zero row uses the vectors' own dtype
# so the stacked matrix is not silently widened.
pretrained_vectors = model_word2vec.vectors
weights = np.vstack((
    np.zeros((1, pretrained_vectors.shape[1]), dtype=pretrained_vectors.dtype),
    pretrained_vectors,
))

# batch size chosen for this classification task
batch_size = 5

for train_index, test_index in kf.split(df_sentiment_all):
    # Release the previous fold's model: clear_session() resets the global
    # Keras state, gc.collect() then frees objects that are no longer referenced.
    tf.keras.backend.clear_session()
    gc.collect()

    train_documents = df_sentiment_all.iloc[train_index]
    test_documents = df_sentiment_all.iloc[test_index]

    # Hold out 20% of the fold's training rows as validation data for early
    # stopping. The split is seeded like KFold so the partition is repeatable
    # (weight initialisation and dropout remain unseeded).
    X_train, X_val, y_train, y_val = train_test_split(
        train_documents['document_vector'], train_documents.sentiment,
        test_size=0.2, random_state=72)
    # one column per token position, one row per document
    X_train = pd.DataFrame(X_train.tolist())
    X_val = pd.DataFrame(X_val.tolist())
    y_train = pd.DataFrame(y_train.tolist())
    y_val = pd.DataFrame(y_val.tolist())

    # stop once val_loss has not improved for 5 epochs
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5)]

    # choose Adam optimizer with a learning rate of 1e-3 (fresh instance per model)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    model = Sequential()
    # trainable embedding layer initialised with the pretrained vectors
    model.add(layers.Embedding(input_dim=weights.shape[0],
                               output_dim=weights.shape[1],
                               input_length=max_len_for_padding,
                               weights=[weights],
                               trainable=True))
    model.add(layers.Dropout(0.5))
    # Conv1D arguments are no. of filters and kernel size
    model.add(layers.Conv1D(128, 5,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.Conv1D(128, 5,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.MaxPooling1D(pool_size=3))
    model.add(layers.Conv1D(256, 5,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.Conv1D(256, 5,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.summary()

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer=optimizer, metrics=["acc"]
    )

    # train model and save metrics acc & val_acc
    history = model.fit(X_train, y_train, callbacks=callbacks, batch_size=batch_size, epochs=1000, validation_data=(X_val, y_val))

    # print acc & loss of the last epoch that was run
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # convert testing data to be able to feed it to the model to predict
    X_test = pd.DataFrame(test_documents.document_vector.to_list())
    # Predict the probabilities for each class
    y_probs = model.predict(X_test)
    # Get the class with the highest probability for each input data
    y_pred = np.argmax(y_probs, axis=1)
    print(classification_report(test_documents.sentiment, y_pred))

    # append the results to the arrays
    results.append(precision_recall_fscore_support(test_documents.sentiment, y_pred, average='macro'))
    accuracy.append(accuracy_score(test_documents.sentiment, y_pred))

# calculate averages over all evaluated folds; each entry of `results` is a
# (precision, recall, f1, support) tuple, so the code no longer assumes that
# exactly five folds were run
avg_precision = np.mean([fold_scores[0] for fold_scores in results])
avg_recall = np.mean([fold_scores[1] for fold_scores in results])
avg_f = np.mean([fold_scores[2] for fold_scores in results])
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

result_sentiment = {"macro avg":{"average precision" : avg_precision,
                               "average recall" : avg_recall,
                               "average f1" : avg_f,
                               "average accuracy" : avg_acc
                               }}
result_sentiment = pd.DataFrame(result_sentiment).transpose()

# Append to the shared workbook and replace this task's sheet if it already
# exists. Append mode requires an existing file, so the first run creates the
# workbook in write mode instead (if_sheet_exists is only valid when appending).
metrics_file = "metrics_new.xlsx"
if Path(metrics_file).exists():
    excel_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    excel_options = {"mode": "w"}
with pd.ExcelWriter(metrics_file, engine="openpyxl", **excel_options) as writer:
    result_sentiment.to_excel(writer, sheet_name="glove_sentiment_NNwEmb")

Model: "sequential_55"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_55 (Embedding)    (None, 420, 300)          120000300 
                                                                 
 dropout_110 (Dropout)       (None, 420, 300)          0         
                                                                 
 conv1d_220 (Conv1D)         (None, 420, 128)          192128    
                                                                 
 conv1d_221 (Conv1D)         (None, 420, 128)          82048     
                                                                 
 max_pooling1d_55 (MaxPoolin  (None, 140, 128)         0         
 g1D)                                                            
                                                                 
 conv1d_222 (Conv1D)         (None, 140, 256)          164096    
                                                     

In [76]:
# Store the sentiment classifier in the current working directory. `model` is
# the network built in the last cross-validation fold of the cell above.
model.save(f"{Path.cwd()}/Sentiment_model")
# To load and inspect it again:
# reconstructed_model = tf.keras.models.load_model("Sentiment_model")
# reconstructed_model.summary()



INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Sentiment_model\assets


INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Sentiment_model\assets


In [18]:
# Next classification task - Commitment

In [78]:
# 5-fold cross-validation of the CNN for the commitment label
# (climate-related paragraphs only).
# results: one (precision, recall, f1, support) macro tuple per fold
# accuracy: one accuracy value per fold
results = []
accuracy = []

n = 5
kf = KFold(n_splits=n, random_state=72, shuffle=True)

# Width of the softmax layer. Taken from the complete label column so every
# fold builds the same output layer, even if a training split misses a class.
num_classes = df_commitment_all['commitment'].nunique()

# Embedding matrix used to initialise the first layer. It is identical for all
# folds, so it is built once instead of once per fold.
# Row 0 is all zeros and is what the padding value 0 looks up; the pretrained
# vectors follow, shifted by one row to match the +1 offset applied when the
# tokens were converted to indices. The zero row uses the vectors' own dtype
# so the stacked matrix is not silently widened.
pretrained_vectors = model_word2vec.vectors
weights = np.vstack((
    np.zeros((1, pretrained_vectors.shape[1]), dtype=pretrained_vectors.dtype),
    pretrained_vectors,
))

# batch size chosen for this classification task
batch_size = 20

for train_index, test_index in kf.split(df_commitment_all):
    # Release the previous fold's model: clear_session() resets the global
    # Keras state, gc.collect() then frees objects that are no longer referenced.
    tf.keras.backend.clear_session()
    gc.collect()

    train_documents = df_commitment_all.iloc[train_index]
    test_documents = df_commitment_all.iloc[test_index]

    # Hold out 20% of the fold's training rows as validation data for early
    # stopping. The split is seeded like KFold so the partition is repeatable
    # (weight initialisation and dropout remain unseeded).
    X_train, X_val, y_train, y_val = train_test_split(
        train_documents.document_vector, train_documents.commitment,
        test_size=0.2, random_state=72)
    # one column per token position, one row per document
    X_train = pd.DataFrame(X_train.tolist())
    X_val = pd.DataFrame(X_val.tolist())
    y_train = pd.DataFrame(y_train.tolist())
    y_val = pd.DataFrame(y_val.tolist())

    # stop once val_loss has not improved for 5 epochs
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5)]

    # choose Adam optimizer with a learning rate of 1e-3 (fresh instance per model)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    model = Sequential()
    # trainable embedding layer initialised with the pretrained vectors
    model.add(layers.Embedding(input_dim=weights.shape[0],
                               output_dim=weights.shape[1],
                               input_length=max_len_for_padding,
                               weights=[weights],
                               trainable=True))
    model.add(layers.Dropout(0.5))
    # Conv1D arguments are no. of filters and kernel size
    model.add(layers.Conv1D(256, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.Conv1D(256, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.MaxPooling1D(pool_size=3))
    model.add(layers.Conv1D(512, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.Conv1D(512, 3,
                            activation='relu',
                            bias_initializer='random_uniform',
                            padding='same'))
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.summary()

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer=optimizer, metrics=["acc"]
    )

    # train model and save metrics acc & val_acc
    history = model.fit(X_train, y_train, callbacks=callbacks, batch_size=batch_size, epochs=1000, validation_data=(X_val, y_val))

    # print acc & loss of the last epoch that was run
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # convert testing data to be able to feed it to the model to predict
    X_test = pd.DataFrame(test_documents.document_vector.to_list())
    # Predict the probabilities for each class
    y_probs = model.predict(X_test)
    # Get the class with the highest probability for each input data
    y_pred = np.argmax(y_probs, axis=1)
    print(classification_report(test_documents.commitment, y_pred))

    # append the results to the arrays
    results.append(precision_recall_fscore_support(test_documents.commitment, y_pred, average='macro'))
    accuracy.append(accuracy_score(test_documents.commitment, y_pred))


# calculate averages over all evaluated folds; each entry of `results` is a
# (precision, recall, f1, support) tuple, so the code no longer assumes that
# exactly five folds were run
avg_precision = np.mean([fold_scores[0] for fold_scores in results])
avg_recall = np.mean([fold_scores[1] for fold_scores in results])
avg_f = np.mean([fold_scores[2] for fold_scores in results])
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

result_commitment = {"macro avg":{"average precision" : avg_precision,
                                 "average recall" : avg_recall,
                                 "average f1" : avg_f,
                                 "average accuracy" : avg_acc
                                 }}
result_commitment = pd.DataFrame(result_commitment).transpose()

# Append to the shared workbook and replace this task's sheet if it already
# exists. Append mode requires an existing file, so the first run creates the
# workbook in write mode instead (if_sheet_exists is only valid when appending).
metrics_file = "metrics_new.xlsx"
if Path(metrics_file).exists():
    excel_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    excel_options = {"mode": "w"}
with pd.ExcelWriter(metrics_file, engine="openpyxl", **excel_options) as writer:
    result_commitment.to_excel(writer, sheet_name="glove_commitment_NNwEmb")

Model: "sequential_65"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_65 (Embedding)    (None, 420, 300)          120000300 
                                                                 
 dropout_130 (Dropout)       (None, 420, 300)          0         
                                                                 
 conv1d_260 (Conv1D)         (None, 420, 256)          230656    
                                                                 
 conv1d_261 (Conv1D)         (None, 420, 256)          196864    
                                                                 
 max_pooling1d_65 (MaxPoolin  (None, 140, 256)         0         
 g1D)                                                            
                                                                 
 conv1d_262 (Conv1D)         (None, 140, 512)          393728    
                                                     

In [79]:
# Store the commitment classifier in the current working directory. `model` is
# the network built in the last cross-validation fold of the cell above.
model.save(f"{Path.cwd()}/Commitment_model")
# To load and inspect it again:
# reconstructed_model = tf.keras.models.load_model("Commitment_model")
# reconstructed_model.summary()



INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Commitment_model\assets


INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Commitment_model\assets


In [23]:
# Next classification task - Specificity

In [83]:
# K-fold cross-validation of the CNN-with-embedding classifier on the
# specificity labels. For every fold a fresh network is trained (with an
# internal 80/20 train/validation split for early stopping), evaluated on the
# held-out fold, and the macro precision/recall/F1 plus accuracy are averaged
# over all folds and written to the "glove_specificity_NNwEmb" sheet of
# metrics_new.xlsx. After the cell has run, `model` holds the network of the
# last fold.


def _build_specificity_cnn(embedding_matrix, n_classes):
    """Build the (uncompiled) Conv1D text classifier used for every fold.

    Args:
        embedding_matrix: 2D array (vocabulary rows x embedding dim) used to
            initialise the trainable embedding layer.
        n_classes: size of the softmax output layer.

    Returns:
        A fresh Keras ``Sequential`` model.
    """
    conv_kwargs = dict(activation='relu',
                       bias_initializer='random_uniform',
                       padding='same')

    cnn = Sequential()
    cnn.add(layers.Embedding(input_dim=embedding_matrix.shape[0],
                             output_dim=embedding_matrix.shape[1],
                             input_length=max_len_for_padding,
                             weights=[embedding_matrix],
                             trainable=True))
    cnn.add(layers.Dropout(0.5))
    # two 256-filter convolutions, pooling, then two 512-filter convolutions
    for n_filters in (256, 256):
        cnn.add(layers.Conv1D(n_filters, 3, **conv_kwargs))
    cnn.add(layers.MaxPooling1D(pool_size=3))
    for n_filters in (512, 512):
        cnn.add(layers.Conv1D(n_filters, 3, **conv_kwargs))
    cnn.add(layers.GlobalAveragePooling1D())
    cnn.add(layers.Dropout(0.5))
    cnn.add(layers.Dense(n_classes, activation='softmax'))
    return cnn


results = []   # per fold: macro (precision, recall, f1, support) tuple
accuracy = []  # per fold: accuracy on the held-out fold

n = 5
cv_seed = 72
kf = KFold(n_splits=n, random_state=cv_seed, shuffle=True)

# The embedding matrix does not depend on the fold, so build it once.
# A leading all-zero row is prepended to the pretrained vectors; this is
# needed so the model can handle OOV words.
# NOTE(review): presumably id 0 in `document_vector` is the padding/OOV id
# and real word ids are shifted by one — confirm against the vectorisation.
pretrained_vectors = np.array(model_word2vec.vectors)  # vectors as 2D numpy array
weights = np.vstack((np.zeros((1, pretrained_vectors.shape[1])), pretrained_vectors))

# Size the output layer from the complete label set so that a label which
# happens to be missing from one training fold is still representable.
num_classes = df_specificity_all.specificity.nunique()

batch_size = 8

for train_index, test_index in kf.split(df_specificity_all):
    # Release the previous fold's model: clear_session() resets Keras' global
    # state (otherwise layers/graphs accumulate across folds), gc.collect()
    # then frees the Python-side objects.
    tf.keras.backend.clear_session()
    gc.collect()

    train_documents = df_specificity_all.iloc[train_index]
    test_documents = df_specificity_all.iloc[test_index]

    # Inner split used only for early stopping; seeded so reruns are reproducible.
    X_train, X_val, y_train, y_val = train_test_split(
        train_documents.document_vector,
        train_documents.specificity,
        test_size=0.2,
        random_state=cv_seed,
    )
    X_train = pd.DataFrame(X_train.tolist())
    X_val = pd.DataFrame(X_val.tolist())
    y_train = pd.DataFrame(y_train.tolist())
    y_val = pd.DataFrame(y_val.tolist())

    # stop once val_loss has not improved for 3 consecutive epochs
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3)]

    # choose Adam optimizer with a learning rate of 1e-3
    # (created per fold: optimizer state must not leak between models)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    model = _build_specificity_cnn(weights, num_classes)
    model.summary()

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer=optimizer, metrics=["acc"]
    )

    # train model and save metrics acc & val_acc
    history = model.fit(X_train, y_train, callbacks=callbacks, batch_size=batch_size, epochs=1000, validation_data=(X_val, y_val))

    # print last acc & val_acc after training
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    X_test = pd.DataFrame(test_documents.document_vector.to_list())
    y_test = test_documents.specificity
    # Predict the probabilities for each class
    y_probs = model.predict(X_test)
    # Get the class with the highest probability for each input data
    y_pred = np.argmax(y_probs, axis=1)
    print(classification_report(y_test, y_pred))

    results.append(precision_recall_fscore_support(y_test, y_pred, average='macro'))
    accuracy.append(accuracy_score(y_test, y_pred))

# Average over however many folds were actually run (no hard-coded fold count).
# Only the first three entries are numeric; the macro "support" entry is None.
avg_precision, avg_recall, avg_f = np.mean([fold[:3] for fold in results], axis=0)
avg_acc = np.mean(accuracy)

print(f"average precision: {avg_precision}")
print(f"average recall: {avg_recall}")
print(f"average f1: {avg_f}")
print(f"average accuracy: {avg_acc}")

result_specificity = {"macro avg":{"average precision" : avg_precision,
                                  "average recall" : avg_recall,
                                  "average f1" : avg_f,
                                  "average accuracy" : avg_acc
                                  }}
result_specificity = pd.DataFrame(result_specificity).transpose()

# Append to (and replace the sheet in) an existing workbook; if the workbook
# does not exist yet, create it instead of failing — append mode requires an
# existing file, and `if_sheet_exists` is only accepted in append mode.
if Path("metrics_new.xlsx").exists():
    excel_writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    excel_writer_options = {"mode": "w"}
with pd.ExcelWriter("metrics_new.xlsx", engine="openpyxl", **excel_writer_options) as writer:
    result_specificity.to_excel(writer, sheet_name="glove_specificity_NNwEmb")

Model: "sequential_85"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_85 (Embedding)    (None, 420, 300)          120000300 
                                                                 
 dropout_170 (Dropout)       (None, 420, 300)          0         
                                                                 
 conv1d_340 (Conv1D)         (None, 420, 256)          230656    
                                                                 
 conv1d_341 (Conv1D)         (None, 420, 256)          196864    
                                                                 
 max_pooling1d_85 (MaxPoolin  (None, 140, 256)         0         
 g1D)                                                            
                                                                 
 conv1d_342 (Conv1D)         (None, 140, 512)          393728    
                                                     

In [84]:
# Persist the current `model` as a "Specificity_model" directory inside the
# working directory. `model` is the network left behind by the specificity
# cross-validation cell, i.e. the one trained on its last fold.
specificity_model_dir = str(Path.cwd()) + '/Specificity_model'
model.save(specificity_model_dir)



INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Specificity_model\assets


INFO:tensorflow:Assets written to: C:\Users\ThreadTheRipper\FAUbox\WS22_23\NLP\Project/Specificity_model\assets
