In [1]:
import multiprocessing as mp
import os
import shutil
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from tqdm import tqdm

In [2]:
# enables tqdm for pandas
tqdm.pandas()

In [3]:
# supress tensorflow warnings and info
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [4]:
# load cdp dataset csv as dataframe
df_cdp = pd.read_csv('csvs/cdp_final.csv', sep=',')
df_cdp

Unnamed: 0,area,title+desc
0,energy efficiency (including public lighting),intensity control of led light points for high...
1,waste management (including waste recycling),installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management (including waste recycling),strengthening inclusive recycling program. thi...
4,waste management (including waste recycling),waste to energy project for blantyre city. reh...
...,...,...
1300,energy efficiency (including public lighting),decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy efficiency (including public lighting),bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [5]:
# load es dataset csv as dataframe
df_es = pd.read_csv('csvs/es_final.csv', sep=',')
df_es

Unnamed: 0,area,title+desc
0,water management,arroyo medrano restoration in parque saavedra....
1,water management,cildañez stream reservoir activation. the poss...
2,water management,water interpretation center school visits to a...
3,water management,hydraulic adaptation plan to climate change. p...
4,transport,expansion of the bike path network. the work c...
...,...,...
485,outdoor lighting,led lighting. the purpose of changing the ligh...
486,waste management,implementation of the sanitary landfill in the...
487,waste recycling,implementation of selective collection in the ...
488,water management,protection and restoration of springs in the s...


In [6]:
# join two datasets
df_joined = pd.concat([df_cdp, df_es], axis=0) 
df_joined

Unnamed: 0,area,title+desc
0,energy efficiency (including public lighting),intensity control of led light points for high...
1,waste management (including waste recycling),installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management (including waste recycling),strengthening inclusive recycling program. thi...
4,waste management (including waste recycling),waste to energy project for blantyre city. reh...
...,...,...
485,outdoor lighting,led lighting. the purpose of changing the ligh...
486,waste management,implementation of the sanitary landfill in the...
487,waste recycling,implementation of selective collection in the ...
488,water management,protection and restoration of springs in the s...


In [7]:
# show sample count for each area (some need to be joined together)
print(pd.DataFrame(df_joined.value_counts(subset='area')).rename(columns={0: 'sample_count'}).to_markdown())

| area                                          |   sample_count |
|:----------------------------------------------|---------------:|
| transport                                     |            347 |
| water management                              |            269 |
| renewable energy                              |            224 |
| waste management (including waste recycling)  |            199 |
| energy efficiency (including public lighting) |            178 |
| buildings                                     |            177 |
| waste management                              |            116 |
| public and green spaces                       |             65 |
| waste recycling                               |             63 |
| energy efficiency / retrofit                  |             50 |
| nature-based solutions                        |             50 |
| outdoor lighting                              |             25 |
| sustainable food consumption/production       |             

In [8]:
# simplify area names
df_corrected = df_joined.copy()

area_energy = (df_corrected['area'] == 'energy efficiency / retrofit') | (df_corrected['area'] == 'energy efficiency (including public lighting)')
df_corrected.loc[area_energy, 'area'] = 'energy effiency'

area_waste = df_corrected['area'] == 'waste management (including waste recycling)'
df_corrected.loc[area_waste, 'area'] = 'waste management'

df_corrected

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management,strengthening inclusive recycling program. thi...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
485,outdoor lighting,led lighting. the purpose of changing the ligh...
486,waste management,implementation of the sanitary landfill in the...
487,waste recycling,implementation of selective collection in the ...
488,water management,protection and restoration of springs in the s...


In [9]:
# show sample count again
area_count = pd.DataFrame(df_corrected.value_counts(subset='area')).rename(columns={0: 'sample_count'})
print(area_count.to_markdown())

| area                                    |   sample_count |
|:----------------------------------------|---------------:|
| transport                               |            347 |
| waste management                        |            315 |
| water management                        |            269 |
| energy effiency                         |            228 |
| renewable energy                        |            224 |
| buildings                               |            177 |
| public and green spaces                 |             65 |
| waste recycling                         |             63 |
| nature-based solutions                  |             50 |
| outdoor lighting                        |             25 |
| sustainable food consumption/production |             19 |
| land-use                                |             13 |


In [18]:
# remove categories with few samples
min_sample_count = 100
area_count = area_count.rename({0: 'count'}, axis='columns')
areas_to_keep = area_count[area_count > min_sample_count].dropna().index.to_list()
area_filter = df_corrected['area'].isin(areas_to_keep)
df_clean = df_corrected[area_filter]
df_clean

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management,strengthening inclusive recycling program. thi...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
482,water management,restoration of water basins. it is intended to...
483,energy effiency,sustainable tourism development program in iga...
484,water management,water path. the project provides for the holdi...
486,waste management,implementation of the sanitary landfill in the...


In [19]:
# show new sample count
print(pd.DataFrame(df_clean.value_counts(subset='area')).rename(columns={0: 'sample_count'}).to_markdown())

| area             |   sample_count |
|:-----------------|---------------:|
| transport        |            347 |
| waste management |            315 |
| water management |            269 |
| energy effiency  |            228 |
| renewable energy |            224 |
| buildings        |            177 |


In [20]:
# function that implements the model creation, training and deletion
def fit_keras(x_train,
              y_train,
              x_val,
              y_val,
              classes,
              model_type,
              hidden_neurons,
              epochs,
              queue):
    
    # tensorflow is imported inside the function because tensorflow is an absolute nightmare with regards to gpu memmory use. doing it like this makes sure the memmory allocated by tensorflow for each model is always deallocated before the next training. otherwise, out of memmory errors occur
    import tensorflow as tf

    from tensorflow.python.util import deprecation
    deprecation._PRINT_DEPRECATION_WARNINGS = False

    import tensorflow_hub as hub

    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # bring pre-trained model from tf hub as a layer
    if model_type == 'nnlm128':
        model_link = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
    elif model_type == 'use':
        model_link = "https://tfhub.dev/google/universal-sentence-encoder/4"
    hub_layer = hub.KerasLayer(model_link, input_shape=[], dtype=tf.string, trainable=True)

    # create the model with the parameters given
    model = tf.keras.Sequential()
    model.add(hub_layer)
    if hidden_neurons > 0:
        model.add(tf.keras.layers.Dropout(0.2))
        model.add(tf.keras.layers.Dense(hidden_neurons, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(len(classes), activation='softmax'))

    # compile the model
    model.compile(optimizer='adam',
                  loss=tf.losses.CategoricalCrossentropy(),
                  metrics=[tf.metrics.CategoricalAccuracy()])

    # finally, fit the model on the data given
    history = model.fit(x_train,
                        y_train,
                        epochs=epochs,
                        batch_size=32,
                        validation_data=[x_val, y_val],
                        verbose=0)

    # return the last validation accuracy
    val_acc = history.history['val_categorical_accuracy'][-1]
    queue.put(val_acc)

In [21]:
# this function implements cross validation for keras classifiers (which are not supported by scikit-learn)
def cross_validate_keras(df_dataset, 
                         num_folds,
                         input_column,
                         output_column,
                         model_type,
                         hidden_neurons,
                         epochs):
    
    # multiprocessing queue for retrieving the fit_keras result
    queue = mp.Queue()
    
    # x is input, y is output
    x = df_dataset[input_column].to_numpy()
    y = df_dataset[output_column].to_numpy()

    # one hot encode the output
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(y)

    # manually create the folds and iterate through them
    cv_score = []
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for k, (train, val) in enumerate(skf.split(x, y)):

        # the function is called in another process so that the vram tensorflow allocated can be deallocated after each model is trained
        p = mp.Process(target=fit_keras, args=(x[train],
                                               label_binarizer.transform(y[train]),
                                               x[val],
                                               label_binarizer.transform(y[val]),
                                               np.unique(y),
                                               model_type,
                                               hidden_neurons,
                                               epochs,
                                               queue))
        p.start()
        p.join()

        # retrieve the validation accuracy from the queue
        val_acc = queue.get()
        cv_score.append(val_acc)
        print(f'col: {input_column}, model: {model_type}, neurons: {hidden_neurons}, epochs: {epochs}, fold: {k}, val_acc: {val_acc:.4f}')

    return cv_score


In [22]:
# delete results folder if existent
# create new results folder
if os.path.exists('results'):
    shutil.rmtree('results')
os.makedirs('results')

In [23]:
# parameter sets (smaller this time)
param_sets = [
    # input_column, model_type, hidden_neurons, epochs
    ('title+desc', 'nnlm128', 0, 10),
    ('title+desc', 'nnlm128', 64, 10),
    ('title+desc', 'use', 0, 10),
    ('title+desc', 'use', 64, 10)
]

In [24]:
# the loop implement the grid search to find the best parameters for our solution
queue = mp.Queue()

for input_column, model_type, hidden_neurons, epochs in param_sets:

    t0 = time.time()

    cv_score = cross_validate_keras(df_dataset=df_clean,
                                    num_folds=5,
                                    input_column=input_column,
                                    output_column='area',
                                    model_type=model_type,
                                    hidden_neurons=hidden_neurons,
                                    epochs=epochs)
    
    cv_time = time.time() - t0
    
    print(f'time: {int(cv_time)}\n')
    
    mean_cv_score = np.mean(cv_score)
    
    file_path = os.path.join('results', f'{input_column}_{model_type}_{hidden_neurons}_{epochs}')
    with open(file_path, 'w') as f:
        f.write(f'{mean_cv_score:.4f} {int(cv_time)}')

col: title+desc, model: nnlm128, neurons: 0, epochs: 10, fold: 0, val_acc: 0.8237
col: title+desc, model: nnlm128, neurons: 0, epochs: 10, fold: 1, val_acc: 0.8333
col: title+desc, model: nnlm128, neurons: 0, epochs: 10, fold: 2, val_acc: 0.8526
col: title+desc, model: nnlm128, neurons: 0, epochs: 10, fold: 3, val_acc: 0.8141
col: title+desc, model: nnlm128, neurons: 0, epochs: 10, fold: 4, val_acc: 0.8333
time: 224

col: title+desc, model: nnlm128, neurons: 64, epochs: 10, fold: 0, val_acc: 0.8141
col: title+desc, model: nnlm128, neurons: 64, epochs: 10, fold: 1, val_acc: 0.8237
col: title+desc, model: nnlm128, neurons: 64, epochs: 10, fold: 2, val_acc: 0.8429
col: title+desc, model: nnlm128, neurons: 64, epochs: 10, fold: 3, val_acc: 0.8109
col: title+desc, model: nnlm128, neurons: 64, epochs: 10, fold: 4, val_acc: 0.8269
time: 223

col: title+desc, model: use, neurons: 0, epochs: 10, fold: 0, val_acc: 0.8205
col: title+desc, model: use, neurons: 0, epochs: 10, fold: 1, val_acc: 0.83

In [32]:
# read the files generated by the grid search and put results on a table
keras_columns = ['input_column', 'model_type', 'hidden_neurons', 'epochs', 'cv_score', 'cv_time']
df_results_keras = pd.DataFrame(columns=keras_columns)

for file_path in sorted(os.listdir('results')):
    with open(os.path.join('results', file_path), 'r') as f:
        parameters = file_path.split('_')
        results = f.read()
        cv_score, cv_time = results.split(' ')
        data = parameters + [float(cv_score), float(cv_time)]
        df_row = pd.DataFrame(data=[data], columns=keras_columns)
        df_results_keras = pd.concat([df_results_keras, df_row])

In [34]:
# group by each parameter
print(df_results_keras.groupby(['input_column', 'model_type', 'hidden_neurons'])[['cv_score', 'cv_time']].mean().reset_index().to_markdown(index=False))

| input_column   | model_type   |   hidden_neurons |   cv_score |   cv_time |
|:---------------|:-------------|-----------------:|-----------:|----------:|
| title+desc     | nnlm128      |                0 |     0.8314 |       224 |
| title+desc     | nnlm128      |               64 |     0.8237 |       223 |
| title+desc     | use          |                0 |     0.8282 |      1209 |
| title+desc     | use          |               64 |     0.8128 |      1242 |


In [35]:
print(df_results_keras.groupby(['model_type'])[['cv_score', 'cv_time']].mean().to_markdown())

| model_type   |   cv_score |   cv_time |
|:-------------|-----------:|----------:|
| nnlm128      |    0.82755 |     223.5 |
| use          |    0.8205  |    1225.5 |


In [36]:
print(df_results_keras.groupby(['hidden_neurons'])[['cv_score', 'cv_time']].mean().to_markdown())

|   hidden_neurons |   cv_score |   cv_time |
|-----------------:|-----------:|----------:|
|                0 |    0.8298  |     716.5 |
|               64 |    0.81825 |     732.5 |
