In [1]:
import pandas as pd
import csv
from src.utils import load_data, load_config

In [2]:
# Load the "prod" processing configuration from the shared config folder.
config = load_config(
    folder="../../config",
    env="prod",
    file_name="processing_config",
)

In [24]:
# Read the prod dataset CSV into a DataFrame. Despite the `tfidf_df` name, the
# file name indicates this is the unsupervised sentence-transformer output.
tfidf_df = pd.read_csv(
    filepath_or_buffer="../../output_data/prod/unsupervised_sentence_transformer_dataset.csv",
)

In [25]:
# Peek at one randomly drawn row (no seed is set, so the row varies per run).
tfidf_df.sample(n=1)

Unnamed: 0,codigo,id_pieza,cod_articulo,desc_problema_translated,descripcion_translated,problema_translated,CODART_A3,Fuzzy_Score,CODART,DESCART,...,cosine_similarity_4.41,cosine_similarity_4.42,cosine_similarity_4.43,cosine_similarity_4.49,cosine_similarity_4.61,cosine_similarity_4.62,cosine_similarity_4.63,cosine_similarity_4.69,highest_score,highest_score_error
2140,MGXPLGLRB7,35812,AZX6QADAPTMHI,"SEGUN SERVICIO TECNICO, LA PASARELA DA ERROR. ...",PASARELA MITSUBISHI PESADO,PASARELA NO FUNCIONA DA ERROR,AZX6QADAPTMHI,100.0,AZX6QADAPTMHI,Pasarela comunicaciones Airzone-Mitsubishi Heavy,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240239,3.49


In [26]:
# For every value of `highest_score_error`, keep the 50 rows with the largest
# `highest_score`, ordered by error code and then by descending score.
#
# This uses sort + `groupby().head()` instead of `groupby().apply(nlargest)`:
# applying a function that touches the grouping column is deprecated in recent
# pandas, and the lambda also runs once per group in Python.
top50_per_error = (
    tfidf_df[["codigo", "text_to_analyse", "highest_score", "highest_score_error"]]
    # groupby skips rows whose key is missing; drop them explicitly so the
    # result does not depend on how a given pandas version treats them in head().
    .dropna(subset=["highest_score_error"])
    # Rows with a missing score sort last within their error code.
    .sort_values(
        ["highest_score_error", "highest_score"],
        ascending=[True, False],
    )
    # The frame is already ordered, so no additional group sorting is needed.
    .groupby("highest_score_error", sort=False)
    .head(50)
    .reset_index(drop=True)
)

  top50_per_error = tfidf_df[['codigo','text_to_analyse', 'highest_score', 'highest_score_error']] \


In [27]:
# Display the per-error top-50 table (the rendered output is truncated by pandas).
top50_per_error

Unnamed: 0,codigo,text_to_analyse,highest_score,highest_score_error
0,MGRWM2XS39,EL TERMOSTATO BLOQUEADO YA NO FUNCIONA TERMOST...,0.952250,3.01
1,BGTGYGSU7C,TERMOSTATO QUE SE BLOQUEA SOLO Y NO FUNCIONA B...,0.909094,3.01
2,L2LOMW5OF7,termostacto funciona pero se bloquea,0.887725,3.01
3,MGXXMGDUA1,Termostacto funciona pero se queda bloqueado,0.857667,3.01
4,ZMTIZ2IZ1F,EL TECLADO DEL TERMOSTATO SE HA BLOQUEADO Y NO...,0.825569,3.01
...,...,...,...,...
907,LPIVLJVO79,STech274610\r\nBOLETO 91657\r\nCAJA DE SEGURID...,0.270321,4.69
908,MMTUM2HPE4,por favor envíame un AZX6POWER 100% de descuen...,0.268134,4.69
909,AM1OAWHQ5C,Sonda de temperatura AZASONDTEMP no funciona. ...,0.265700,4.69
910,Z2DMZ2IT1C,MEDIDOR DE CONSUMO EN MAL ESTADO MEDIDOR DE CO...,0.263547,4.69


In [None]:
# Write the top-50 table to CSV as UTF-8 without the index column.
# QUOTE_NONNUMERIC puts quotes around every non-numeric field.
# NOTE(review): this path starts at "../DATA" while the other cells read from
# "../../..." locations - confirm the intended output directory.
top50_per_error.to_csv(
    path_or_buf="../DATA/processed/2024-05-14/top50_per_error.csv",
    quoting=csv.QUOTE_NONNUMERIC,
    encoding="utf-8",
    index=False,
)

In [28]:
# Load the "prod" training configuration. This rebinds `config`, replacing the
# processing configuration loaded earlier in the notebook.
config = load_config(
    folder="../../config",
    env="prod",
    file_name="training_config",
)

In [29]:
# Model family to work with; used as the lookup key into both the `models`
# registry and `config.training` in the cells below.
model_type = "XGBoost"

In [30]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Registry mapping each supported model name to its estimator class.
models = dict(
    XGBoost=XGBClassifier,
    RandomForest=RandomForestClassifier,
    SVC=SVC,
)

In [31]:
# Fail fast when the selected model type has no entry in the `models` registry.
if model_type not in models:
    raise ValueError(f"Model type {model_type} not supported")

In [32]:
# Pick the training settings stored under the selected model type.
model_config = config.training[model_type]

In [33]:
# Names of the hyperparameters to read from the model's configuration.
model_params = model_config.params_list

In [34]:
# Map every listed hyperparameter name to the value(s) configured for it.
params = {name: model_config[name] for name in model_params}

In [35]:
from itertools import product

# Expand the configured candidate values into every combination
# (Cartesian product), one tuple per combination in `params` key order.
param_combinations = [*product(*params.values())]

In [36]:
# Print each hyperparameter combination as a name -> value mapping.
# `current_params` ends up holding the last combination (or {} if there are none).
current_params = {}
for combination in param_combinations:
    # Pair the hyperparameter names with this combination's values, in order.
    current_params = dict(zip(params, combination))
    print(current_params)

{'device': 'cuda', 'learning_rate': 0.05, 'max_depth': 50, 'n_estimators': 200, 'subsample': 0.7, 'sampling_method': 'gradient_based', 'colsample_bytree': 0.7, 'scale_pos_weight': 95, 'max_delta_step': 0, 'tree_method': 'hist', 'random_state': 42}
{'device': 'cuda', 'learning_rate': 0.05, 'max_depth': 50, 'n_estimators': 200, 'subsample': 0.7, 'sampling_method': 'gradient_based', 'colsample_bytree': 0.7, 'scale_pos_weight': 95, 'max_delta_step': 1, 'tree_method': 'hist', 'random_state': 42}
{'device': 'cuda', 'learning_rate': 0.05, 'max_depth': 50, 'n_estimators': 200, 'subsample': 0.7, 'sampling_method': 'gradient_based', 'colsample_bytree': 0.7, 'scale_pos_weight': 98, 'max_delta_step': 0, 'tree_method': 'hist', 'random_state': 42}
{'device': 'cuda', 'learning_rate': 0.05, 'max_depth': 50, 'n_estimators': 200, 'subsample': 0.7, 'sampling_method': 'gradient_based', 'colsample_bytree': 0.7, 'scale_pos_weight': 98, 'max_delta_step': 1, 'tree_method': 'hist', 'random_state': 42}
{'device

In [37]:
# Number of hyperparameter combinations generated above.
len(param_combinations)

192

In [38]:
# Load the training data from its parquet file.
df = pd.read_parquet(path="../../DATA/train_data.parquet")

In [39]:
# Inspect ten randomly drawn training rows (no seed is set, so rows vary per run).
df.sample(n=10)

Unnamed: 0,id,text,label,sentiment,embedding
15879,23964,Happy moms day,2,positive,"[-0.06689286231994629, 0.016767103224992752, -..."
31145,6699,Seems like a good idea but the widget is invis...,1,neutral,"[-0.015102826058864594, -0.027292301878333092,..."
4767,26763,flo can spell antidisestablishmentarianism of...,1,neutral,"[-0.005056086461991072, 0.0288715660572052, -0..."
17575,18717,I got the ticket dismissed,2,positive,"[-0.010813377797603607, -0.010639999993145466,..."
17694,9260,_coza Yep. Live and learn,1,neutral,"[0.013380270451307297, 0.04944470524787903, 0...."
14928,16652,I was thinking there should be a thumbs down ...,0,negative,"[-0.015006864443421364, 0.03168737143278122, 0..."
28896,14566,"if you could get down to easton, you could jo...",1,neutral,"[-0.05218508839607239, 0.08367354422807693, -0..."
29235,3713,haha Mon...Do you really want me too send it?,1,neutral,"[-0.005338672082871199, 0.05979635939002037, -..."
17163,19706,"So far (with the exception of the rain), my m...",2,positive,"[-0.018784314393997192, 0.05234287306666374, -..."
22846,20874,"trying to upload a custom background on here, ...",0,negative,"[0.055107083171606064, -0.024561911821365356, ..."


In [43]:
# Length of the embedding stored in the first row.
len(df["embedding"].iloc[0])

768