In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.8 MB/s[0m eta [36m0:00:0

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem; the data path and the
# results path used by this notebook both live under this mount point.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Aug 18, 2023

@author: Daniel Duque Lozano

This code is adapted from https://github.com/ofiscal/contract-transparency---copia
"""
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification,AutoTokenizer
from transformers.onnx import config
import pandas as pd

import numpy as np
from sklearn.metrics import r2_score


# Make TensorFlow run tf.function-decorated code eagerly (operation by
# operation) instead of as traced graphs.
# NOTE(review): presumably enabled to ease debugging; eager execution is
# usually slower than graph mode -- confirm it is still wanted for training.
tf.config.run_functions_eagerly(True)

# Location of the source CSV on the mounted Google Drive.
path_to_data = (
    "/content/drive/MyDrive/Observatorio Fiscal Conjunto/Gasto Público"
    "/SAPO/Otros/datos_paraguay/datos/records.csv"
)

# Offset and scale used to normalize the contract amounts. Despite their
# names, these are fixed constants, not the mean or standard deviation of any
# actual data, so the names can be misleading.
mean = 1e12
ssd = 1e17

# NOTE(review): not referenced anywhere else in the visible code; presumably
# a train-mode flag -- confirm before relying on it.
entrenar = "TR"

# Maximum number of CSV rows to read.
setsize = 20_000
# Work on a subset of the 2023 contracts dataset from Paraguay's open data
# portal (source: https://www.contrataciones.gov.py/datos/api/v3/doc/).
# Training needs complete records, so rows missing either the tender amount
# or the budget description are dropped before the amount is normalized.
amount_column = "compiledRelease/tender/value/amount"
description_column = "compiledRelease/planning/budget/description"

data = pd.read_csv(path_to_data, nrows=setsize).dropna(
    subset=[amount_column, description_column]
)

# Normalized target: (amount - mean) / ssd, computed on the whole column.
data["valor norm"] = (data[amount_column] - mean) / ssd



# Start from the pretrained multilingual (cased) BERT checkpoint: load its
# tokenizer, convert the budget descriptions into model inputs, and build the
# label array used for training.
checkpoint="bert-base-multilingual-cased"
tokenizer=AutoTokenizer.from_pretrained(checkpoint)
pre_token=data["compiledRelease/planning/budget/description"].tolist()
# padding=True pads every text to the longest one in the batch.
# truncation=True cuts texts that exceed the checkpoint's maximum input
# length; without it an over-long description produces a sequence the model
# cannot process and training fails.
tokens=tokenizer(pre_token,padding=True,truncation=True,return_tensors="np")
tokenized_data = dict(tokens)
labels = np.array(data["valor norm"])
# num_labels=1 requests a single output value per text, which is used here as
# the predicted (normalized) contract amount.
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,num_labels=1)

"""
The next part sets up how the model is trained. We use the Adam optimizer,
a gradient-descent method that applies adaptive moment estimates, which is
useful for reducing convergence to poor local minima.

"""
# Adam optimizer with a small learning rate; the beta and epsilon values are
# spelled out explicitly so they are visible and easy to tune.
adamizer=tf.keras.optimizers.Adam(
    learning_rate=0.00001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
)

# Freeze the first layer BEFORE compiling. Keras only guarantees that a change
# to `trainable` is honoured once compile() is called afterwards, so freezing
# after compile() may leave the layer trainable.
# NOTE(review): layers[0] is presumably the pretrained BERT encoder, leaving
# only the later head that produces the output trainable -- confirm with
# model.summary().
model.layers[0].trainable = False

# No loss is passed to compile().
# NOTE(review): this relies on the model supplying its own default loss --
# confirm it is the intended regression loss.
model.compile(optimizer=adamizer)

# Single epoch with small batches; the last 20% of the samples are held out
# for validation.
model.fit(x=tokenized_data,y=labels,batch_size=4, epochs=1, validation_split=0.2)
# Predict the normalized amount for every row that was tokenized.
results=model.predict(tokenized_data).logits

# Build the predictions frame on data's own index. `data` keeps its original
# row labels after dropna(), so it has gaps; a frame with a fresh 0..n-1 index
# would be aligned by label on assignment, attaching predictions to the wrong
# rows and leaving NaN wherever a label is missing.
resultados=pd.DataFrame(results, index=data.index)

# Undo the normalization (x * ssd + mean). Cast to float64 first so the
# arithmetic on these very large magnitudes is not done in float32.
data["Precio Predecido"]=resultados[0].astype("float64") * ssd + mean

# Save the input rows together with their predicted price to Google Drive.
data.to_excel("/content/drive/MyDrive/Observatorio Fiscal Conjunto/Gasto Público/SAPO/Otros/datos_paraguay/datos/resultshugging.xlsx")

"""
Closing remarks for this script:
This is just a trial showing that the model can be trained in a small,
resource-limited environment.

"""


ModuleNotFoundError: ignored