In [None]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
if IN_COLAB:
    !pip install -q transformers
    from google.colab import drive
    drive.mount('/content/drive')
    import sys
    sys.path.append('/content/drive/MyDrive/gan_bert_for_cpv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from transformers import logging
logging.set_verbosity_error()
import pandas as pd
import torch
from create_dataset import CreateDataset
from CPVDataset import CPVDataset
from process_data import ProcessData
from utils import *
from torch.utils.data import DataLoader
from run_model import RunModel
from models import BertGenerator, BertDiscriminator,EmbeddingClassifier
from transformers import AutoTokenizer, AutoModel, AutoConfig

In [None]:
# Names of the label column and the text column used throughout the notebook.
label_column = "groep"
text_column = "Korte beschrijving aanbesteding"

# Both cleaned CSV exports are read from the same Drive folder.
data_dir = "/content/drive/MyDrive/gan_bert_for_cpv/data"
input_df = pd.read_csv(f"{data_dir}/cleaned_dutch_ted1.csv")
input_df2 = pd.read_csv(f"{data_dir}/cleaned_foreign_ted1.csv")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# row labels of the two CSVs overlap, and any later index-based selection or drop
# would match rows from both files.
all_input = pd.concat([input_df, input_df2], ignore_index=True)

In [None]:
# Shuffle the rows, then keep at most 2000 rows per label (groups keep their
# first-seen order because sort=False). The fixed random_state makes the subsample
# reproducible across kernel restarts.
# NOTE: this rebinds all_input, so the unsampled frame is no longer available after
# this cell; later cells read the name all_input.
all_input = (
    all_input
    .sample(frac=1, random_state=42)
    .groupby(label_column, sort=False)
    .head(2000)
)

In [None]:
# Display the column names of the combined, per-label subsampled frame.
all_input.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Korte beschrijving aanbesteding',
       'Naam Aanbestedende dienst', 'CPV-code', 'Omschrijving',
       'Omschrijving standaard', 'Publicatiedatum', 'afdeling', 'groep',
       'klasse', 'categorie', 'Unnamed: 0.2'],
      dtype='object')

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the notebook
# still runs (slowly) on a runtime without CUDA instead of failing on the first
# `.to(running_device)` call.
running_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Configure the project's ProcessData helper on the combined frame and run it.
# process() returns five objects, unpacked below: a test frame, two training frames
# (suf_df / insuf_df), and the fitted label and one-hot encoders.
prd = ProcessData(
    all_input,
    label_column,
    text_column,
    sample_num=1000,
    aug_limit=1000,
    test_per_label_num=30,
    drop_limit=50,
    balanced_test=False,
)
test_df, suf_df, insuf_df, label_encoder, onehot = prd.process()

# Number of rows per label in insuf_df (displayed in a later cell).
value_counts = insuf_df[label_column].value_counts()

# Training/evaluation driver bound to the selected device.
model_runner = RunModel(running_device)

After removing duplicate sentence: 184548
After drop cpv with low observations we have : 183571
The num of class is  247
The number of total data is:  134553
Total number of test data is:  26908
Total number of train data is:  100947


In [None]:
# Build a class-balanced evaluation subset: shuffle test_df, then keep at most 30
# rows per label. The fixed random_state keeps the chosen rows identical between
# runs, so evaluation numbers are comparable.
balanced_test = (
    test_df
    .sample(frac=1, random_state=42)
    .groupby(label_column, sort=False)
    .head(30)
)

In [None]:
# Display the per-label row counts of insuf_df computed earlier.
value_counts

Reparatie en onderhoud van medische en precisie-uitrusting                                                                           791
Diensten in verband met de bouw                                                                                                      791
Reparatie, onderhoud en aanverwante diensten in verband met PC's, kantooruitrusting, telecommunicatie- en audiovisuele uitrusting    790
Dienstverlening op het gebied van stedenbouw en landschapsarchitectuur                                                               788
Farmaceutische producten                                                                                                             785
                                                                                                                                    ... 
Militaire lucht- en ruimtevaartuigen en raketten                                                                                      42
Zout en zuiver natriumchloride           

In [None]:
from joblib import dump, load

# Persist the fitted one-hot encoder to Drive; dump() returns the list of written
# file names, which the notebook shows as the cell output.
dump(
    value=onehot,
    filename='/content/drive/MyDrive/gan_bert_for_cpv/output/group/onehot.joblib',
)

['/content/drive/MyDrive/gan_bert_for_cpv/output/group/onehot.joblib']

In [None]:
# Persist the fitted label encoder next to the one-hot encoder on Drive.
dump(
    value=label_encoder,
    filename='/content/drive/MyDrive/gan_bert_for_cpv/output/group/labelencoder.joblib',
)

['/content/drive/MyDrive/gan_bert_for_cpv/output/group/labelencoder.joblib']

In [None]:
# Load tokenizer, config and weights from one Hugging Face checkpoint.
bert_checkpoint = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
# output_hidden_states=True makes the model return every layer's hidden states,
# not only the last one.
config = AutoConfig.from_pretrained(bert_checkpoint, output_hidden_states=True)
bert = AutoModel.from_pretrained(bert_checkpoint, config=config)
bert = bert.to(running_device)

In [None]:
# Dataset factory from the project's create_dataset module.
# NOTE(review): the positional 0 appears to be the BERT layer index (the progress
# bar reads "hidden states from layer: 0") -- confirm against CreateDataset.
cd = CreateDataset(
    text_column,
    label_column,
    label_encoder,
    onehot,
    0,
    running_device,
    max_length=100,
)

# Evaluation dataset built from the class-balanced test subset.
test_dataset = cd.create(balanced_test)

Init new bert


Getting bert hidden states from layer: 0: 100%|██████████| 6894/6894 [01:29<00:00, 77.21it/s]


In [None]:
# Build the dataset for insuf_df with the same CreateDataset factory used for the
# test split. This is the slow step of the notebook (about 22 minutes in the
# recorded output).
insuf_dataset=cd.create(insuf_df)

Getting bert hidden states from layer: 0: 100%|██████████| 100947/100947 [21:48<00:00, 77.17it/s]


In [None]:
# The original cell read `suf_dataset` before any earlier cell defined it (it is only
# assigned further down, as a CPVDataset), so a fresh Restart & Run All raised
# NameError here. Build it from suf_df with the same factory as insuf_dataset so the
# two concatenated datasets are produced by the same pipeline.
# NOTE(review): confirm cd.create(suf_df) is the intended source for this stage.
suf_dataset = cd.create(suf_df)

test_dl = DataLoader(test_dataset, shuffle=True, batch_size=20)

# Train on both training frames together.
train_dataset = torch.utils.data.ConcatDataset([suf_dataset, insuf_dataset])
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=20)

In [None]:
# Instantiate the classifier with one output per class, then hand it to the runner.
# train_classifier returns the classifier object that is kept from here on.
classifier = EmbeddingClassifier(prd.num_class, bert)
classifier = model_runner.train_classifier(
    train_dl,
    test_dl,
    classifier,
    lr=2e-5,
    n_epochs=5,
)

In [None]:
# Build CPVDataset objects for the suf_df training frame and the full test frame.
# Both calls use identical settings; only the input frame differs.
# NOTE: test_dataset is rebound here -- it previously held cd.create(balanced_test).
suf_dataset = CPVDataset(
    suf_df, label_column, text_column, running_device, bert, tokenizer, 0,
    cls_rep=False, max_len=100, label_encoder=label_encoder, onehot_encoder=onehot,
)
test_dataset = CPVDataset(
    test_df, label_column, text_column, running_device, bert, tokenizer, 0,
    cls_rep=False, max_len=100, label_encoder=label_encoder, onehot_encoder=onehot,
)

In [None]:
# One CPVDataset per distinct label in insuf_df, keyed by the label value.
insuf_ds_dict = {}
for label in set(insuf_df[label_column].values):
    # Rows of insuf_df that carry this label.
    sub_df = insuf_df.loc[insuf_df[label_column] == label]
    sub_ds = CPVDataset(
        sub_df, label_column, text_column, running_device, bert, tokenizer, 0,
        cls_rep=False, max_len=100, label_encoder=label_encoder, onehot_encoder=onehot,
    )
    insuf_ds_dict[label] = sub_ds

In [None]:
# Loader over the CPVDataset test split.
test_dl = DataLoader(test_dataset, batch_size=40, shuffle=True)

# Concatenate every per-label dataset, followed by suf_dataset, into one
# training set and wrap it in a shuffling loader.
insuf_ds_list = list(insuf_ds_dict.values())
original_train_dataset = torch.utils.data.ConcatDataset([*insuf_ds_list, suf_dataset])
original_dl = DataLoader(original_train_dataset, batch_size=40, shuffle=True)