In [6]:
import argparse
import os

import numpy as np
import pandas as pd
import matplotlib

import matplotlib.pyplot as plt

from mapie.classification import MapieClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from src.conformal_prediction.utils import chunked_mapie_predict

# For loading your trained TransformerClassifier
from src.training.model import TransformerClassifier
from src.utils import load_config, get_logger

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [7]:
# Run selectors: `env` picks the environment whose config is loaded, and
# `model_type` is the key used to index `training_config.training` below.
env, model_type = "prod", "BERT"

# Load the training configuration for the selected environment.
training_config = load_config(
    folder="../../config",
    file_name="training_config",
    env=env,
)

In [8]:
# Significance level -- presumably the conformal-prediction miscoverage rate
# handed to MAPIE; TODO confirm against the cell that consumes it.
alpha = 0.2

# Input artefacts (relative paths, one folder per environment).
input_dataset = f"../../output_data/{env}/supervised_dataset.parquet"
input_outliers = f"../../output_data/{env}/supervised_dataset_phase2.parquet"
input_model = f"../../output_models/{env}/trained_model"
# Alternative checkpoint kept for reference:
# input_model = f"../../output_models/{env}/temp_model_16-01"

# Output location -- presumably where outlier-detection reports are written.
output_reports = f"../../output_reports/{env}/outlier_detection"

In [9]:
# Read the supervised dataset from the parquet file configured above.
df = pd.read_parquet(path=input_dataset)

In [10]:
# Pull the feature column(s) and the target column named in the config section
# for the selected model type, then count the distinct target classes.
_model_cfg = training_config.training[model_type]
X, y = df[_model_cfg.features], df[_model_cfg.target]
num_labels = y.nunique()

In [11]:
# Three chained stratified splits, all sharing the same random seed:
#   1. full data   -> train + test
#   2. train       -> train + val
#   3. test        -> test  + cp   (presumably the conformal-prediction
#                                   calibration set -- TODO confirm)
_split_cfg = training_config.training[model_type]
_split_seed = training_config.training.random_state

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=_split_cfg.test_size, random_state=_split_seed, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=_split_cfg.val_size,
    random_state=_split_seed,
    stratify=y_train,
)
X_test, X_cp, y_test, y_cp = train_test_split(
    X_test,
    y_test,
    test_size=_split_cfg.cp_size,
    random_state=_split_seed,
    stratify=y_test,
)

In [13]:
# First ten training samples (positional slice), used as a small smoke-test
# input for the paraphrasing model.
teste = X_train.iloc[:10]

In [2]:
# Seq2seq checkpoint used for paraphrasing. FLAN-T5 is a general-purpose model
# rather than a dedicated paraphraser, and the sample outputs in this notebook
# show heavily garbled results on the Spanish input.
model_name = "google/flan-t5-base"

# Model weights and the matching tokenizer both come from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Generated Paraphrases: ['The product is not working properly.', 'The product is not working properly and shows an error code.', "The user is in a hurry and won't get what they expect."]


In [16]:
# Smoke test: sample three paraphrases for each text and print them next to
# the original so the quality can be judged by eye.
for text in teste:
    prompt = f"paraphrase: {text} </s>"

    # Encode the prompt (truncated to 512 tokens) as PyTorch tensors.
    encoded = tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt")

    # Top-k sampling, so the candidates differ from run to run.
    generated = model.generate(
        **encoded,
        do_sample=True,
        top_k=50,
        num_return_sequences=3,
        max_length=512,
    )

    # Turn each generated token sequence back into plain text.
    paraphrases = [
        tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated
    ]

    print(f"Original text: {text}")
    for rank, paraphrase in enumerate(paraphrases, start=1):
        print(f"Paraphrase {rank}: {paraphrase}")

Original text: mando blueface nº serie foofzxw queda blanco pantalla mando blueface nº serie foofzy9 ve doble pantalla falla mando blueface nº serie foofzyc enciende estando off mando conecta maquina comprobado descaonectando mandos mando blueface nº serie foofzxs queda simbolo centro pantalla rejillas salon habitacion crujido cerrarse mando blueface queda simbolo centro pantalla
Paraphrase 1: blueface no serie foofzy09 ve doble pantalla falla mando blueface no serie foofzyc enciende estando off mando conectama comprobado descaonectando mandos mando blueface no serie foofzys queda simbolo centro pantalla rejillas salon habitacion crujido cerrarse mando blueface queda simbolo centro pantalla
Paraphrase 2: mando blueface no serie foofzy9 pantalla mando fogy7 ve do double pantalla falla mando fogyfoofzyc enciende estando on mando conecta medusa comprobado descaonectando mandos mando fogyface no serie foofzy
Paraphrase 3: olvy flct no serie foofzxxw es ve vble pantalla falla mando blueface

In [17]:
# Shrink the frame to at most 10 rows for a quick augmentation trial.
# Seeded with the project's random_state so the same rows are picked on every
# fresh run, and capped at len(df) because DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement).
# NOTE: this rebinds `df`; re-run the loading cell to get the full dataset back.
df = df.sample(
    n=min(10, len(df)),
    random_state=training_config.training.random_state,
)

In [19]:
def augment_dataset(
    dataset, column_name, num_paraphrases=3, tokenizer=None, model=None
):
    """Generate sampled paraphrases for every text in one column of ``dataset``.

    Each non-empty value of ``dataset[column_name]`` is wrapped in a
    ``"paraphrase: ..."`` prompt, encoded (truncated to 512 tokens) and passed
    to ``model.generate`` with top-k sampling (``top_k=50``). Missing values
    (``None`` / NaN / ``pd.NA``) and blank strings are skipped instead of being
    paraphrased as the literal text "None" or "nan".

    Args:
        dataset: Any mapping-like object indexable by ``column_name`` that
            yields an iterable of texts (e.g. a pandas DataFrame).
        column_name: Key/column holding the texts to paraphrase.
        num_paraphrases: Number of sequences to sample per text (>= 1).
        tokenizer: Callable tokenizer exposing ``decode``. Defaults to the
            notebook-level ``tokenizer`` when omitted.
        model: Object exposing ``generate``. Defaults to the notebook-level
            ``model`` when omitted.

    Returns:
        A flat list of decoded paraphrases, ``num_paraphrases`` consecutive
        entries per processed text, in input order. The originals are not
        included and skipped rows contribute nothing, so the list is not
        index-aligned with ``dataset``.

    Raises:
        ValueError: If ``num_paraphrases`` is smaller than 1.
        NameError: If no tokenizer/model is passed and none is defined at
            notebook level.
    """
    if num_paraphrases < 1:
        raise ValueError(f"num_paraphrases must be >= 1, got {num_paraphrases}")

    # Backward compatibility: earlier callers relied on the notebook globals.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if model is None:
        model = globals().get("model")
    if tokenizer is None or model is None:
        raise NameError(
            "augment_dataset needs a tokenizer and a model: pass them explicitly "
            "or define notebook-level `tokenizer` and `model` first"
        )

    augmented_data = []
    for text in dataset[column_name]:
        # Nothing meaningful to paraphrase for missing or whitespace-only cells.
        if pd.isna(text) or not str(text).strip():
            continue

        # NOTE(review): the tokenizer may already append the end-of-sequence
        # token itself, making the explicit "</s>" redundant -- confirm before
        # changing the prompt.
        input_text = f"paraphrase: {text} </s>"
        inputs = tokenizer(
            input_text, return_tensors="pt", max_length=512, truncation=True
        )
        # Sampling is stochastic: results vary between runs unless the
        # framework's random seed is fixed by the caller.
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_return_sequences=num_paraphrases,
            do_sample=True,
            top_k=50,
        )
        augmented_data.extend(
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs
        )
    return augmented_data


# Example: paraphrase the sampled rows' "text_to_analyse" column with the
# default of three candidates per text; the returned list is shown as the
# cell output.
augment_dataset(dataset=df, column_name="text_to_analyse")

['ESTE LUNCH A CALORE. Y ME GLASER IN EL TUNA. Y ME UTEN SALTO CABLEADO DE COPIBOL. Y CUTEN LES PROBOLLES DE DEL TUNA SUE HISTME DE LENA Y DE AL TINCO.',
 'REGULAR TERMOSTATO QUE DEBA A PUEDA. SOME JUSTIONS HAS BLOQUEAD CABLEAD Y TUTENCICIA Y LAS OPPORNIAS CARINAS POSSIBLES CON EL TÉNCICO PERO NADA.',
 'GROUP LE PRESENTE A CLIMADO BLOQUEADO Y NO RESPONDE. ECHO COMPRABO CESARAR Y TODOS LAS OPORTIONES POSIBLES CON EL TÉCNITO SE SURNATA SU VIDEO COMMOSSO. SE QUE DE POLITUDE BLOQUEADO Y NO RESPONDA. ECHO COMPREPOABIRA EN CASION DE CHELEN TONO DE LA PUBESACION. ECHO COMPRABO EN TÉCONICO A FOCIA DE LA TCNA CANDOUBRE Y NO ENTRAN CUBRERIDO DESDE AS.',
 'BLUEFACE GLEEN A CAR BAHAMIA DE COLOR TERMOSTATE DE CABLE INTERLIGENTE',
 "The Luftwaffe's TERMOSTATO COLOR CAMBIA A RAYES uses two types of wire (a collar, in the lower right hand corner, one with a nylon or an upper right hook and the other with a slack hook).",
 'ADRONCA_BALBAND AIRZONE BL 8 DEVIAR O ENTEMADO CABLE NUEVO',
 '2 termostatos Th