In [1]:
import multiprocessing as mp
import os
import shutil
import time

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from tqdm import tqdm

In [2]:
# enables tqdm for pandas
tqdm.pandas()

In [3]:
# run the same routine for the untranslated dataset
df_cdp_multi = pd.read_csv('csvs/cdp_clean.csv', sep=',').drop(columns=['Unnamed: 0'])[['area', 'title+desc']]

area_energy = (df_cdp_multi['area'] == 'energy efficiency / retrofit') | (df_cdp_multi['area'] == 'energy efficiency (including public lighting)')
df_cdp_multi.loc[area_energy, 'area'] = 'energy efficiency'

area_waste = df_cdp_multi['area'] == 'waste management (including waste recycling)'
df_cdp_multi.loc[area_waste, 'area'] = 'waste management'

min_sample_count = 100
area_count = pd.DataFrame(df_cdp_multi.value_counts(subset='area')).rename(columns={0: 'sample_count'})
area_count = area_count.rename({0: 'count'}, axis='columns')
areas_to_keep = area_count[area_count > min_sample_count].dropna().index.to_list()
area_filter = df_cdp_multi['area'].isin(areas_to_keep)
df_clean_multi = df_cdp_multi[area_filter]
df_clean_multi

Unnamed: 0,area,title+desc
0,energy effiency,intensity control of led light points for high...
1,waste management,instalación de planta trituradora de llantas. ...
2,transport,seguimiento y control ambiental a tecnologías ...
3,waste management,fortalecimiento programa de reciclaje inclusiv...
4,waste management,waste to energy project for blantyre city. reh...
...,...,...
1300,energy effiency,decarbonisation of local authority maintained ...
1301,buildings,charlotte & william bloomberg public library. ...
1302,energy effiency,bloomington green home improvement program. th...
1303,water management,greater amman municipality (gam) - saqef al se...


In [4]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = torch.from_numpy(y.astype(float))

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [5]:
class CustomNet(torch.nn.Module):
    def __init__(self, encoder, hidden_neurons, num_classes, encoder_trainable=False):
        super().__init__()
        self.encoder = encoder
        for param in self.encoder.parameters():
            param.requires_grad = encoder_trainable
        self.encoder
        self.dropout1 = torch.nn.Dropout(0.2)
        self.fc1 = torch.nn.Linear(self.encoder.get_sentence_embedding_dimension(), hidden_neurons)
        self.dropout2 = torch.nn.Dropout(0.2)
        self.fc2 = torch.nn.Linear(hidden_neurons, num_classes)

    def forward(self, x):
        x = torch.from_numpy(self.encoder.encode(x))
        x = F.relu(self.fc1(self.dropout1(x)))
        x = F.softmax(self.fc2(self.dropout2(x)), 1)
        return x

In [6]:
# function that implements the model creation, training and deletion
def fit_torch(x_train,
              y_train,
              x_val,
              y_val,
              classes,
              model_type,
              hidden_neurons,
              epochs,
              queue):
    
    # bring pre-trained model from tf hub as a layer
    if model_type == 'labse':
        encoder = SentenceTransformer('sentence-transformers/LaBSE')
    elif model_type == 'duse':
        encoder = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')
    elif model_type == 'minilm':
        encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    elif model_type == 'mpnet':
        encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    # create the model with the parameters given
    # model.max_seq_length = 512
    model = CustomNet(encoder, hidden_neurons, len(classes))

    # setup loss
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    
    # create dataloader
    train_set = CustomDataset(x_train, y_train)
    val_set = CustomDataset(x_val, y_val)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=32, shuffle=True)

    t0 = time.time()

    # train the model
    best_vloss = 1_000_000.0
    for epoch in range(epochs):

        # print(f'Epoch {epoch + 1}/{epochs}')

        model.train(True)
        correct = 0
        running_loss = 0.0
        for i, data in enumerate(train_dataloader):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, pred_idx = outputs.max(1)
            _, true_idx = labels.max(1)
            correct += (pred_idx == true_idx).sum().item()
        avg_loss = running_loss / (i + 1)
        accuracy = correct / len(train_set)

        model.train(False)
        vcorrect = 0
        running_vloss = 0.0
        for i, vdata in enumerate(val_dataloader):
            vinputs, vlabels = vdata
            voutputs = model(vinputs)
            vloss = criterion(voutputs, vlabels)
            running_vloss += vloss.item()
            _, vpred_idx = voutputs.max(1)
            _, vtrue_idx = vlabels.max(1)
            vcorrect += (vpred_idx == vtrue_idx).sum().item()
        avg_vloss = running_vloss / (i + 1)
        vaccuracy = vcorrect / len(val_set)

        # print(f'loss train {avg_loss:.4f} - acc train {accuracy:.4f} - loss validation {avg_vloss:.4f} - acc validation {vaccuracy:.4f}')

        # Track best performance, and save the model's state
        # if avg_vloss < best_vloss:
        #     best_vloss = avg_vloss
        #     model_path = f'models/model_{epoch}'
        #     torch.save(model.state_dict(), model_path)
    
    train_time = time.time() - t0

    # return the last validation accuracy
    val_acc = vaccuracy
    num_params = sum(p.numel() for p in model.parameters())
    queue.put((val_acc, train_time, num_params))

In [7]:
# this function implements cross validation for keras classifiers (which are not supported by scikit-learn)
def cross_validate_torch(df_dataset, 
                         num_folds,
                         input_column,
                         output_column,
                         model_type,
                         hidden_neurons,
                         epochs):
    
    # multiprocessing queue for retrieving the fit_keras result
    queue = mp.Queue()
    
    # x is input, y is output
    x = df_dataset[input_column].to_numpy()
    y = df_dataset[output_column].to_numpy()

    # one hot encode the output
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(y)

    # manually create the folds and iterate through them
    cv_metrics = {
        'score': [],
        'time': [],
        'params': []
    }
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for k, (train, val) in enumerate(skf.split(x, y)):

        p = mp.Process(target=fit_torch, args=(x[train],
                                               label_binarizer.transform(y[train]),
                                               x[val],
                                               label_binarizer.transform(y[val]),
                                               np.unique(y),
                                               model_type,
                                               hidden_neurons,
                                               epochs,
                                               queue))
        p.start()
        p.join()
        
        # fit_torch(x[train],
        #           label_binarizer.transform(y[train]),
        #           x[val],
        #           label_binarizer.transform(y[val]),
        #           np.unique(y),
        #           model_type,
        #           hidden_neurons,
        #           epochs,
        #           queue)

        # retrieve the validation accuracy from the queue
        val_acc, train_time, model_params = queue.get()
        cv_metrics['score'].append(val_acc)
        cv_metrics['time'].append(train_time)
        cv_metrics['params'].append(model_params)
        print(f'col: {input_column}, model: {model_type}, neurons: {hidden_neurons}, epochs: {epochs}, fold: {k}, acc: {val_acc:.4f}, time: {int(train_time)}')

    return cv_metrics


In [8]:
# delete results folder if existent
# create new results folder
if os.path.exists('results_hf'):
    shutil.rmtree('results_hf')
os.makedirs('results_hf')

In [9]:
# parameter sets
param_sets = [
    # model_type, hidden_neurons, epochs
    (True, 'labse', 28, 50),
    (True, 'duse', 24, 50),
    (True, 'minilm', 20, 50),
    (True, 'mpnet', 28, 50)
]

In [10]:
# the loop implement the grid search to find the best parameters for the model
queue = mp.Queue()

for _, model_type, hidden_neurons, epochs in param_sets:

    t0 = time.time()

    df_dataset = df_clean_multi

    cv_metrics = cross_validate_torch(df_dataset=df_dataset,
                                      num_folds=5,
                                      input_column='title+desc',
                                      output_column='area',
                                      model_type=model_type,
                                      hidden_neurons=hidden_neurons,
                                      epochs=epochs)
    
    cv_total_time = time.time() - t0
    
    print(f'total time: {int(cv_total_time)}\n')
    
    mean_cv_score = np.mean(cv_metrics['score'])
    mean_cv_time = np.mean(cv_metrics['time'])
    cv_params = np.mean(cv_metrics['params'])
    
    file_path = os.path.join('results_hf', f'{model_type}_{hidden_neurons}_{epochs}')
    with open(file_path, 'w') as f:
        f.write(f'{mean_cv_score:.4f} {int(mean_cv_time)} {cv_params}')

col: title+desc, model: labse, neurons: 28, epochs: 50, fold: 0, acc: 0.8017, time: 430
col: title+desc, model: labse, neurons: 28, epochs: 50, fold: 1, acc: 0.7629, time: 421
col: title+desc, model: labse, neurons: 28, epochs: 50, fold: 2, acc: 0.7198, time: 401
col: title+desc, model: labse, neurons: 28, epochs: 50, fold: 3, acc: 0.7749, time: 394
col: title+desc, model: labse, neurons: 28, epochs: 50, fold: 4, acc: 0.7619, time: 388
total time: 2059

col: title+desc, model: duse, neurons: 24, epochs: 50, fold: 0, acc: 0.7931, time: 98
col: title+desc, model: duse, neurons: 24, epochs: 50, fold: 1, acc: 0.7672, time: 98
col: title+desc, model: duse, neurons: 24, epochs: 50, fold: 2, acc: 0.7284, time: 98
col: title+desc, model: duse, neurons: 24, epochs: 50, fold: 3, acc: 0.7922, time: 98
col: title+desc, model: duse, neurons: 24, epochs: 50, fold: 4, acc: 0.7532, time: 98
total time: 497

col: title+desc, model: minilm, neurons: 20, epochs: 50, fold: 0, acc: 0.7586, time: 77
col: ti

In [15]:
# read the files generated by the grid search and put results on a table
torch_columns = ['model_type', 'hidden_neurons', 'epochs', 'cv_score', 'cv_time', 'trainable_params']
df_results_torch = pd.DataFrame(columns=torch_columns)

for file_path in sorted(os.listdir('results_hf')):
    with open(os.path.join('results_hf', file_path), 'r') as f:
        parameters = file_path.split('_')
        results = f.read()
        cv_score, cv_time, trainable_params = results.split(' ')
        data = parameters + [float(cv_score), float(cv_time), float(trainable_params)]
        df_row = pd.DataFrame(data=[data], columns=torch_columns)
        df_results_torch = pd.concat([df_results_torch, df_row])

In [17]:
def format_params(row):
    row['trainable_params'] = f'{int(row["trainable_params"]/1e6)} M'
    return row

In [19]:
# group by each parameter
print(df_results_torch.groupby(['model_type', 'hidden_neurons'])[['cv_score', 'trainable_params']].mean().apply(format_params, axis=1).reset_index().to_markdown(index=False))

| model_type   |   hidden_neurons |   cv_score | trainable_params   |
|:-------------|-----------------:|-----------:|:-------------------|
| duse         |               24 |     0.7668 | 135 M              |
| labse        |               28 |     0.7643 | 471 M              |
| minilm       |               20 |     0.7418 | 117 M              |
| mpnet        |               28 |     0.7789 | 278 M              |


In [21]:
print(df_results_torch.groupby(['model_type'])[['cv_score', 'trainable_params']].mean().apply(format_params, axis=1).to_markdown())

| model_type   |   cv_score | trainable_params   |
|:-------------|-----------:|:-------------------|
| duse         |     0.7668 | 135 M              |
| labse        |     0.7643 | 471 M              |
| minilm       |     0.7418 | 117 M              |
| mpnet        |     0.7789 | 278 M              |
