In [1]:
import multiprocessing as mp
import os
import shutil
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelBinarizer
from tqdm import tqdm

In [2]:
# supress tensorflow warnings and info
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [3]:
import tensorflow as tf

from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

import tensorflow_hub as hub

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [4]:
# enables tqdm for pandas
tqdm.pandas()

In [5]:
# load cdp dataset csv as dataframe
df_cdp = pd.read_csv('csvs/cdp_final.csv', sep=',')
df_cdp

Unnamed: 0,area,title+desc
0,energy efficiency (including public lighting),intensity control of led light points for high...
1,waste management (including waste recycling),installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management (including waste recycling),strengthening inclusive recycling program. thi...
4,waste management (including waste recycling),waste to energy project for blantyre city. reh...
...,...,...
1300,energy efficiency (including public lighting),decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy efficiency (including public lighting),bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [6]:
# load es dataset csv as dataframe
df_es = pd.read_csv('csvs/es_final.csv', sep=',')
df_es

Unnamed: 0,area,title+desc
0,water management,arroyo medrano restoration in parque saavedra....
1,water management,cildañez stream reservoir activation. the poss...
2,water management,water interpretation center school visits to a...
3,water management,hydraulic adaptation plan to climate change. p...
4,transport,expansion of the bike path network. the work c...
...,...,...
485,outdoor lighting,led lighting. the purpose of changing the ligh...
486,waste management,implementation of the sanitary landfill in the...
487,waste recycling,implementation of selective collection in the ...
488,water management,protection and restoration of springs in the s...


In [7]:
# join two datasets
df_joined = pd.concat([df_cdp, df_es], axis=0) 
df_joined

Unnamed: 0,area,title+desc
0,energy efficiency (including public lighting),intensity control of led light points for high...
1,waste management (including waste recycling),installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management (including waste recycling),strengthening inclusive recycling program. thi...
4,waste management (including waste recycling),waste to energy project for blantyre city. reh...
...,...,...
485,outdoor lighting,led lighting. the purpose of changing the ligh...
486,waste management,implementation of the sanitary landfill in the...
487,waste recycling,implementation of selective collection in the ...
488,water management,protection and restoration of springs in the s...


In [8]:
# show sample count for each area (some need to be joined together)
print(pd.DataFrame(df_joined.value_counts(subset='area')).rename(columns={0: 'sample_count'}).to_markdown())

| area                                          |   sample_count |
|:----------------------------------------------|---------------:|
| transport                                     |            347 |
| water management                              |            269 |
| renewable energy                              |            224 |
| waste management (including waste recycling)  |            199 |
| energy efficiency (including public lighting) |            178 |
| buildings                                     |            177 |
| waste management                              |            116 |
| public and green spaces                       |             65 |
| waste recycling                               |             63 |
| energy efficiency / retrofit                  |             50 |
| nature-based solutions                        |             50 |
| outdoor lighting                              |             25 |
| sustainable food consumption/production       |             

In [9]:
# simplify area names
df_corrected = df_joined.copy()

area_energy = (df_corrected['area'] == 'energy efficiency / retrofit') | (df_corrected['area'] == 'energy efficiency (including public lighting)')
df_corrected.loc[area_energy, 'area'] = 'energy effiency'

area_waste = df_corrected['area'] == 'waste management (including waste recycling)'
df_corrected.loc[area_waste, 'area'] = 'waste management'

df_corrected

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management,strengthening inclusive recycling program. thi...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
485,outdoor lighting,led lighting. the purpose of changing the ligh...
486,waste management,implementation of the sanitary landfill in the...
487,waste recycling,implementation of selective collection in the ...
488,water management,protection and restoration of springs in the s...


In [10]:
# show sample count again
area_count = pd.DataFrame(df_corrected.value_counts(subset='area')).rename(columns={0: 'sample_count'})
print(area_count.to_markdown())

| area                                    |   sample_count |
|:----------------------------------------|---------------:|
| transport                               |            347 |
| waste management                        |            315 |
| water management                        |            269 |
| energy effiency                         |            228 |
| renewable energy                        |            224 |
| buildings                               |            177 |
| public and green spaces                 |             65 |
| waste recycling                         |             63 |
| nature-based solutions                  |             50 |
| outdoor lighting                        |             25 |
| sustainable food consumption/production |             19 |
| land-use                                |             13 |


In [11]:
# remove categories with few samples
min_sample_count = 100
area_count = area_count.rename({0: 'count'}, axis='columns')
areas_to_keep = area_count[area_count > min_sample_count].dropna().index.to_list()
area_filter = df_corrected['area'].isin(areas_to_keep)
df_dataset = df_corrected[area_filter]
df_dataset

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,installation of tire shredding plant. as a per...
2,transport,environmental monitoring and control of new an...
3,waste management,strengthening inclusive recycling program. thi...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
482,water management,restoration of water basins. it is intended to...
483,energy effiency,sustainable tourism development program in iga...
484,water management,water path. the project provides for the holdi...
486,waste management,implementation of the sanitary landfill in the...


In [12]:
# show new sample count
print(pd.DataFrame(df_dataset.value_counts(subset='area')).rename(columns={0: 'sample_count'}).to_markdown())

| area             |   sample_count |
|:-----------------|---------------:|
| transport        |            347 |
| waste management |            315 |
| water management |            269 |
| energy effiency  |            228 |
| renewable energy |            224 |
| buildings        |            177 |


In [13]:
# function that implements the model creation, training and deletion
def fit_keras(x_train,
              y_train,
              x_val,
              y_val,
              classes,
              model_type,
              hidden_neurons,
              epochs):
    
    # bring pre-trained model from tf hub as a layer
    if model_type == 'nnlm128':
        model_link = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
    elif model_type == 'use':
        model_link = "https://tfhub.dev/google/universal-sentence-encoder/4"
    hub_layer = hub.KerasLayer(model_link, input_shape=[], dtype=tf.string, trainable=True)

    # create the model with the parameters given
    model = tf.keras.Sequential()
    model.add(hub_layer)
    if hidden_neurons > 0:
        model.add(tf.keras.layers.Dropout(0.2))
        model.add(tf.keras.layers.Dense(hidden_neurons, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(len(classes), activation='softmax'))

    # compile the model
    model.compile(optimizer='adam',
                  loss=tf.losses.CategoricalCrossentropy(),
                  metrics=[tf.metrics.CategoricalAccuracy()])

    # finally, fit the model on the data given
    history = model.fit(x_train,
                        y_train,
                        epochs=epochs,
                        batch_size=32,
                        validation_data=[x_val, y_val],
                        verbose=1)

    # return model
    return model

In [82]:
# x is input, y is output
x = df_dataset['title+desc'].to_numpy()
y = df_dataset['area'].to_numpy()

# one hot encode the output
label_binarizer = LabelBinarizer()
y_e = label_binarizer.fit_transform(y)

In [15]:
param_sets = [
    # model_type, hidden_neurons, epochs
    ('title+desc', 'nnlm128', 0, 10),
    # ('title+desc', 'nnlm128', 32, 10),
    # ('title+desc', 'nnlm128', 64, 10),
    # ('title+desc', 'use', 0, 10),
    # ('title+desc', 'use', 32, 10),
    # ('title+desc', 'use', 64, 10)
]

In [16]:
# split the dataset into train, test and val
x_train, x_valtest, y_train, y_valtest = train_test_split(x, y_e, test_size=0.15, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, test_size=0.15, random_state=42)

In [17]:
# selecting the parameters
input_column, model_type, hidden_neurons, epochs = param_sets[0]

In [18]:
# train a model
model = fit_keras(x_train=x_train,
                  y_train=y_train,
                  x_val=x_val,
                  y_val=y_val,
                  classes=np.unique(y),
                  model_type=model_type,
                  hidden_neurons=hidden_neurons,
                  epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
# get train accuracy
train_loss, train_acc = model.evaluate(x_train, y_train, batch_size=32)
train_acc



0.9954751133918762

In [37]:
# get val accuracy
val_loss, val_acc = model.evaluate(x_val, y_val, batch_size=32)
val_acc



0.8232323527336121

In [38]:
# get test accuracy
test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=32)
test_acc



0.7777777910232544

In [44]:
# put accuracy results on a table
df_acc = pd.DataFrame({'train': [train_acc], 'val': [val_acc], 'test': [test_acc]}, index=['acc']).T
df_acc

Unnamed: 0,acc
train,0.995475
val,0.823232
test,0.777778


In [56]:
# print table
print(df_acc.apply(lambda r: f'{r["acc"]:.4f}', axis=1).to_markdown())

|       |      0 |
|:------|-------:|
| train | 0.9955 |
| val   | 0.8232 |
| test  | 0.7778 |
