In [1]:
# suppress tensorflow's native log output; this is set here, ahead of the tensorflow-related imports in the following cells
# NOTE(review): level '3' is understood to also hide ERROR messages, not only info and warnings -- confirm, or use '2' if errors should stay visible
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
import multiprocessing as mp
import shutil
import time

import numpy as np
import pandas as pd
import tensorflow_text

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from tqdm import tqdm

In [3]:
# enables tqdm for pandas (registers the progress_* methods, e.g. progress_apply, on pandas objects)
# NOTE(review): no progress_* call appears in the cells reviewed here -- confirm this is still needed
tqdm.pandas()

In [4]:
# read the cdp dataset csv into a dataframe (a comma is already read_csv's default separator) and display it
df_cdp = pd.read_csv('csvs/cdp_final.csv')
df_cdp

Unnamed: 0,area,title+desc
0,energy efficiency (including public lighting),intensity control of led light points for high...
1,waste management (including waste recycling),installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management (including waste recycling),strengthening inclusive recycling program. thi...
4,waste management (including waste recycling),waste to energy project for blantyre city. reh...
...,...,...
1300,energy efficiency (including public lighting),decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy efficiency (including public lighting),bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [5]:
# show sample count for each area (some need to be joined together)
# the value_counts series is named explicitly before it becomes a frame: on pandas 1.x the series is unnamed (column 0),
# on pandas 2.x it is named 'count', so renaming column 0 afterwards would silently leave the column called 'count'
print(df_cdp.value_counts(subset='area').rename('sample_count').to_frame().to_markdown())

| area                                          |   sample_count |
|:----------------------------------------------|---------------:|
| transport                                     |            261 |
| waste management (including waste recycling)  |            199 |
| renewable energy                              |            196 |
| energy efficiency (including public lighting) |            178 |
| water management                              |            169 |
| buildings                                     |            155 |
| public and green spaces                       |             65 |
| nature-based solutions                        |             50 |
| sustainable food consumption/production       |             19 |
| land-use                                      |             13 |


In [6]:
# simplify area names; the edits are made on a copy so that df_cdp keeps the original labels
df_corrected = df_cdp.copy()

# both energy efficiency spellings collapse into a single short label
area_energy = df_corrected['area'].isin(['energy efficiency / retrofit',
                                         'energy efficiency (including public lighting)'])
df_corrected.loc[area_energy, 'area'] = 'energy efficiency'

# same for the long waste management label
area_waste = df_corrected['area'].eq('waste management (including waste recycling)')
df_corrected.loc[area_waste, 'area'] = 'waste management'

df_corrected

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management,strengthening inclusive recycling program. thi...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
1300,energy effiency,decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy effiency,bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [7]:
# show sample count again
# area_count is reused further down to decide which areas have enough samples
# the series is named before to_frame() so the column is 'sample_count' on every pandas version
# (renaming column 0 is a silent no-op on pandas 2.x, where the value_counts series is already named 'count')
area_count = df_corrected.value_counts(subset='area').rename('sample_count').to_frame()
print(area_count.to_markdown())

| area                                    |   sample_count |
|:----------------------------------------|---------------:|
| transport                               |            261 |
| waste management                        |            199 |
| renewable energy                        |            196 |
| energy effiency                         |            178 |
| water management                        |            169 |
| buildings                               |            155 |
| public and green spaces                 |             65 |
| nature-based solutions                  |             50 |
| sustainable food consumption/production |             19 |
| land-use                                |             13 |


In [8]:
# rich display of the same per-area sample counts
# the series is named before to_frame() because renaming column 0 does nothing on pandas 2.x (series already named 'count')
df_corrected.value_counts(subset='area').rename('sample_count').to_frame()

Unnamed: 0_level_0,sample_count
area,Unnamed: 1_level_1
transport,261
waste management,199
renewable energy,196
energy effiency,178
water management,169
buildings,155
public and green spaces,65
nature-based solutions,50
sustainable food consumption/production,19
land-use,13


In [9]:
# remove categories with few samples: only areas with strictly more than min_sample_count samples are kept
min_sample_count = 100
# area_count has a single column, so its first column holds the per-area sample counts
has_enough_samples = area_count.iloc[:, 0] > min_sample_count
areas_to_keep = area_count.index[has_enough_samples].to_list()
area_filter = df_corrected['area'].isin(areas_to_keep)
df_clean = df_corrected.loc[area_filter]
df_clean

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management,strengthening inclusive recycling program. thi...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
1300,energy effiency,decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy effiency,bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [10]:
# show new sample count
# the series is named before to_frame() so the column is 'sample_count' on every pandas version
# (renaming column 0 is a silent no-op on pandas 2.x, where the value_counts series is already named 'count')
print(df_clean.value_counts(subset='area').rename('sample_count').to_frame().to_markdown())

| area             |   sample_count |
|:-----------------|---------------:|
| transport        |            261 |
| waste management |            199 |
| renewable energy |            196 |
| energy effiency  |            178 |
| water management |            169 |
| buildings        |            155 |


In [11]:
# display the filtered dataframe (only the areas listed in areas_to_keep remain)
df_clean

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management,strengthening inclusive recycling program. thi...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
1300,energy effiency,decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy effiency,bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [12]:
# run the same routine for the untranslated dataset
# .copy() turns the column selection into an independent frame, so the .loc assignments below modify it directly
df_cdp_multi = pd.read_csv('csvs/cdp_clean.csv', sep=',')[['area', 'title+desc']].copy()

# the masks are computed on this frame's own 'area' column instead of reusing area_energy / area_waste from the
# other dataset: a boolean series built from a different frame only selects the right rows when both frames
# happen to share the same index and row order
# NOTE(review): assumes cdp_clean.csv carries the same raw area labels as cdp_final.csv -- confirm
area_energy_multi = df_cdp_multi['area'].isin(['energy efficiency / retrofit',
                                               'energy efficiency (including public lighting)'])
df_cdp_multi.loc[area_energy_multi, 'area'] = 'energy efficiency'

area_waste_multi = df_cdp_multi['area'] == 'waste management (including waste recycling)'
df_cdp_multi.loc[area_waste_multi, 'area'] = 'waste management'

# keep the same areas that were kept for the first dataset (areas_to_keep is derived from its sample counts)
area_filter = df_cdp_multi['area'].isin(areas_to_keep)
df_clean_multi = df_cdp_multi[area_filter]
df_clean_multi

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,instalación de planta trituradora de llantas. ...
2,transport,seguimiento y control ambiental a tecnologías ...
3,waste management,fortalecimiento programa de reciclaje inclusiv...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
1300,energy effiency,decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy effiency,bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [13]:
# function that implements the model creation, training and deletion
def fit_keras(x_train,
              y_train,
              x_val,
              y_val,
              classes,
              model_type,
              hidden_neurons,
              epochs,
              queue):
    """Build one classifier on top of a pre-trained tf hub text embedding, train it and report its metrics.

    The model is the hub layer, optionally followed by dropout + a relu hidden layer, then dropout + a softmax
    layer with one unit per class. It is compiled with Adam (learning rate 1e-3) and categorical cross-entropy
    and trained with a batch size of 32.

    Args:
        x_train, y_train: training inputs and targets handed to model.fit. The hub layers are created with
            dtype=tf.string, and the categorical loss expects one-hot targets.
        x_val, y_val: validation inputs and targets, same format as the training data.
        classes: collection of class labels; only its length is used (size of the softmax layer).
        model_type: one of 'nnlm50', 'nnlm128', 'use', 'use-multi'.
        hidden_neurons: size of the hidden layer; 0 (or less) means no hidden layer.
        epochs: number of training epochs.
        queue: object with a put() method; receives the tuple
            (last validation accuracy, training time in seconds, parameter count of the model).

    Raises:
        ValueError: if model_type is not one of the supported names.
    """
    hub_urls = {
        'nnlm50': 'https://tfhub.dev/google/nnlm-en-dim50-with-normalization/2',
        'nnlm128': 'https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2',
        'use': 'https://tfhub.dev/google/universal-sentence-encoder/4',
        'use-multi': 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3',
    }

    # checked before the tensorflow import so that a typo fails immediately with a clear message
    # (previously an unknown name left `hub_layer` unbound and failed later with an unrelated error)
    if model_type not in hub_urls:
        raise ValueError(f'unknown model_type {model_type!r}, expected one of {sorted(hub_urls)}')

    # tensorflow is imported inside the function because of how it handles gpu memory: running each training in its
    # own process with its own import makes sure the memory allocated for a model is released before the next
    # training starts. otherwise, out of memory errors occur
    import tensorflow as tf

    from tensorflow.python.util import deprecation
    deprecation._PRINT_DEPRECATION_WARNINGS = False

    import tensorflow_hub as hub

    # enable memory growth on every visible gpu; iterating also covers machines without a gpu
    # (indexing the first device raised an IndexError there)
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    # bring pre-trained model from tf hub as a layer
    hub_layer = hub.KerasLayer(hub_urls[model_type], input_shape=[], dtype=tf.string)

    # create the model with the parameters given
    model = tf.keras.Sequential()
    model.add(hub_layer)
    if hidden_neurons > 0:
        model.add(tf.keras.layers.Dropout(0.2))
        model.add(tf.keras.layers.Dense(hidden_neurons, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(len(classes), activation='softmax'))

    # compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss=tf.losses.CategoricalCrossentropy(),
                  metrics=[tf.metrics.CategoricalAccuracy()])

    t0 = time.time()

    # finally, fit the model on the data given (validation_data is passed as the documented (x, y) tuple)
    history = model.fit(x_train,
                        y_train,
                        epochs=epochs,
                        batch_size=32,
                        validation_data=(x_val, y_val),
                        verbose=0)

    train_time = time.time() - t0

    # hand the last validation accuracy, the training time and the model size back to the caller
    val_acc = history.history['val_categorical_accuracy'][-1]
    queue.put((val_acc, train_time, model.count_params()))

In [14]:
# this function implements cross validation for keras classifiers (which are not supported by scikit-learn)
def cross_validate_keras(df_dataset,
                         num_folds,
                         input_column,
                         output_column,
                         model_type,
                         hidden_neurons,
                         epochs):
    """Run stratified k-fold cross validation, training one model per fold in a separate process.

    Args:
        df_dataset: dataframe holding the inputs and the labels.
        num_folds: number of stratified folds (shuffled with random_state=42).
        input_column: name of the column used as model input.
        output_column: name of the column holding the class labels.
        model_type, hidden_neurons, epochs: forwarded unchanged to fit_keras.

    Returns:
        dict with the keys 'score', 'time' and 'params', each a list with one entry per fold
        (validation accuracy, training time in seconds, model parameter count).

    Raises:
        RuntimeError: if the training process of a fold exits with a non-zero exit code.

    NOTE(review): with exactly two classes LabelBinarizer is expected to produce a single target column while
    fit_keras builds one output unit per class -- confirm before using this on a binary problem.
    """

    # multiprocessing queue for retrieving the fit_keras result
    queue = mp.Queue()

    # x is input, y is output
    x = df_dataset[input_column].to_numpy()
    y = df_dataset[output_column].to_numpy()

    # one hot encode the output
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(y)

    # the set of classes does not change between folds, so it is computed once
    classes = np.unique(y)

    # manually create the folds and iterate through them
    cv_metrics = {
        'score': [],
        'time': [],
        'params': []
    }
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for k, (train, val) in enumerate(skf.split(x, y)):

        # the function is called in another process so that the vram tensorflow allocated can be deallocated after each model is trained
        p = mp.Process(target=fit_keras, args=(x[train],
                                               label_binarizer.transform(y[train]),
                                               x[val],
                                               label_binarizer.transform(y[val]),
                                               classes,
                                               model_type,
                                               hidden_neurons,
                                               epochs,
                                               queue))
        p.start()
        p.join()

        # a child that died (exception, out of memory, ...) never puts a result on the queue,
        # so reading from the queue without this check would block forever
        if p.exitcode != 0:
            raise RuntimeError(f'training process for fold {k} exited with code {p.exitcode} '
                               f'(model: {model_type}, neurons: {hidden_neurons}, epochs: {epochs})')

        # retrieve the validation accuracy from the queue
        val_acc, train_time, model_params = queue.get()
        cv_metrics['score'].append(val_acc)
        cv_metrics['time'].append(train_time)
        cv_metrics['params'].append(model_params)
        print(f'col: {input_column}, model: {model_type}, neurons: {hidden_neurons}, epochs: {epochs}, fold: {k}, acc: {val_acc:.4f}, time: {int(train_time)}')

    return cv_metrics


In [18]:
# start every run from an empty results folder:
# delete the folder (and everything in it) if it already exists, then create it again
# note that this discards the result files of any previous grid search
results_folder = 'results_tl'
if os.path.exists(results_folder):
    shutil.rmtree(results_folder)
os.makedirs(results_folder)

In [15]:
# parameter sets evaluated by the grid search
# every entry of param_sets is a tuple (multilingual, model_type, hidden_neurons, epochs);
# all sets share the same number of epochs, the variants without a hidden layer are kept commented out
param_sets = [
    (multilingual, model_type, hidden_neurons, 50)
    for multilingual, model_type, hidden_neurons in (
        # (False, 'nnlm128', 0),
        (False, 'nnlm128', 12),
        # (False, 'use', 0),
        (False, 'use', 24),
        # (True, 'use-multi', 0),
        (True, 'use-multi', 24),
    )
]

In [19]:
# the loop implements the grid search: every parameter set is cross validated and its mean metrics are written
# to one file per parameter set inside results_folder
# (the queue that used to be created here was never used; cross_validate_keras creates its own)
for multilingual, model_type, hidden_neurons, epochs in param_sets:

    t0 = time.time()

    # multilingual parameter sets run on df_clean_multi (the untranslated dataset), the others on df_clean
    df_dataset = df_clean_multi if multilingual else df_clean

    cv_metrics = cross_validate_keras(df_dataset=df_dataset,
                                      num_folds=5,
                                      input_column='title+desc',
                                      output_column='area',
                                      model_type=model_type,
                                      hidden_neurons=hidden_neurons,
                                      epochs=epochs)

    cv_total_time = time.time() - t0

    print(f'total time: {int(cv_total_time)}\n')

    # average the per-fold metrics
    mean_cv_score = np.mean(cv_metrics['score'])
    mean_cv_time = np.mean(cv_metrics['time'])
    cv_params = np.mean(cv_metrics['params'])

    # the file name encodes the parameters (split on '_' when the results are read back),
    # the content is "<mean score> <mean time> <mean parameter count>"
    file_path = os.path.join(results_folder, f'{model_type}_{hidden_neurons}_{epochs}')
    with open(file_path, 'w') as f:
        f.write(f'{mean_cv_score:.4f} {int(mean_cv_time)} {cv_params}')

col: title+desc, model: nnlm128, neurons: 12, epochs: 50, fold: 0, acc: 0.7629, time: 9
col: title+desc, model: nnlm128, neurons: 12, epochs: 50, fold: 1, acc: 0.7241, time: 9
col: title+desc, model: nnlm128, neurons: 12, epochs: 50, fold: 2, acc: 0.7112, time: 8
col: title+desc, model: nnlm128, neurons: 12, epochs: 50, fold: 3, acc: 0.7056, time: 8
col: title+desc, model: nnlm128, neurons: 12, epochs: 50, fold: 4, acc: 0.7489, time: 8
total time: 55

col: title+desc, model: use, neurons: 24, epochs: 50, fold: 0, acc: 0.7672, time: 34
col: title+desc, model: use, neurons: 24, epochs: 50, fold: 1, acc: 0.8017, time: 34
col: title+desc, model: use, neurons: 24, epochs: 50, fold: 2, acc: 0.6853, time: 34
col: title+desc, model: use, neurons: 24, epochs: 50, fold: 3, acc: 0.7619, time: 34
col: title+desc, model: use, neurons: 24, epochs: 50, fold: 4, acc: 0.7359, time: 34
total time: 213

col: title+desc, model: use-multi, neurons: 24, epochs: 50, fold: 0, acc: 0.8190, time: 36
col: title+

In [20]:
# read the files generated by the grid search and put results on a table
keras_columns = ['model_type', 'hidden_neurons', 'epochs', 'cv_score', 'cv_time', 'params']

# the rows are collected in a list and the dataframe is built once at the end: concatenating onto an initially
# empty dataframe inside the loop copies the table on every iteration, relies on deprecated dtype handling for
# empty frames and gives every row the same index label
result_rows = []
for file_name in sorted(os.listdir(results_folder)):
    with open(os.path.join(results_folder, file_name), 'r') as f:
        cv_score, cv_time, trainable_params = f.read().split()

    # the file name is "<model_type>_<hidden_neurons>_<epochs>"; these three values stay strings
    parameters = file_name.split('_')
    result_rows.append(parameters + [float(cv_score), float(cv_time), float(trainable_params)])

df_results_keras = pd.DataFrame(result_rows, columns=keras_columns)

In [21]:
def format_params(row):
    """Replace the row's 'params' value with a string giving the count in whole millions, e.g. '124 M'.

    The number is divided by 1e6 and truncated with int(). The row is modified in place and also returned,
    which makes the function usable with DataFrame.apply(..., axis=1).
    """
    millions = int(row['params'] / 1e6)
    row['params'] = f'{millions} M'
    return row

In [22]:
# group by each parameter: mean score and formatted parameter count per (model_type, hidden_neurons) pair
summary_by_model_and_neurons = (
    df_results_keras
    .groupby(['model_type', 'hidden_neurons'])[['cv_score', 'params']]
    .mean()
    .apply(format_params, axis=1)
    .reset_index()
)
print(summary_by_model_and_neurons.to_markdown(index=False))

| model_type   |   hidden_neurons |   cv_score | params   |
|:-------------|-----------------:|-----------:|:---------|
| nnlm128      |               12 |     0.7306 | 124 M    |
| use          |               24 |     0.7504 | 256 M    |
| use-multi    |               24 |     0.7685 | 68 M     |


In [23]:
# mean score and formatted parameter count per model type
summary_by_model = (
    df_results_keras
    .groupby(['model_type'])[['cv_score', 'params']]
    .mean()
    .apply(format_params, axis=1)
)
print(summary_by_model.to_markdown())

| model_type   |   cv_score | params   |
|:-------------|-----------:|:---------|
| nnlm128      |     0.7306 | 124 M    |
| use          |     0.7504 | 256 M    |
| use-multi    |     0.7685 | 68 M     |
