In [11]:
import pandas as pd
# Read the tab-separated, UTF-8 encoded input table that the cells below predict on.
df = pd.read_csv(
    '../data/filtered.tsv',
    sep='\t',
    encoding='utf-8',
)

In [14]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast
# import torch
#
# device = torch.device('cuda:0')
def load_model_and_generate_n_predictions(model_directory: str, dataset: pd.DataFrame, input_column: str, output_path: str, n: int,
                                          batch_size: int = 32,
                                          tokenizer_name: str = 'ceshine/t5-paraphrase-paws-msrp-opinosis',
                                          **generate_kwargs):
    """
    Load a T5 model and a tokenizer, generate predictions for the first n entries of the input data, and save them as a TSV file.

    The model weights are loaded from ``model_directory``. The tokenizer is loaded
    separately from ``tokenizer_name`` (it is NOT read from ``model_directory``).
    Inputs are tokenized and generated in mini-batches, so peak memory is bounded
    by ``batch_size`` instead of growing with ``n``.

    Args:
    model_directory (str): Path to the directory where the model is saved.
    dataset (pd.DataFrame): DataFrame containing the input data for prediction.
    input_column (str): Name of the column in dataset containing the input text.
    output_path (str): Path where the predictions should be saved (tab-separated,
        columns: ``input_column`` and ``prediction``).
    n (int): Number of entries from the dataset to generate predictions for
        (passed to ``DataFrame.head``).
    batch_size (int): Number of rows tokenized and generated per forward pass.
    tokenizer_name (str): Identifier passed to ``T5TokenizerFast.from_pretrained``.
    **generate_kwargs: Extra keyword arguments forwarded unchanged to
        ``model.generate`` (for example ``max_new_tokens`` or ``num_beams``).
        When none are given, the library's default generation settings apply,
        which includes a short default output-length limit.

    Raises:
    ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Load the model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_directory)
    tokenizer = T5TokenizerFast.from_pretrained(tokenizer_name)

    # Only the first n entries are predicted
    texts = dataset.head(n)[input_column].tolist()

    # Generate batch by batch; with no rows selected the loop body never runs,
    # so the tokenizer/model are never called on an empty batch.
    predictions = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
        output_sequences = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            **generate_kwargs,
        )
        predictions.extend(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

    # Save the predictions to a file (the index is not written, so plain lists suffice)
    predictions_df = pd.DataFrame({input_column: texts, 'prediction': predictions})
    predictions_df.to_csv(output_path, index=False, sep='\t')
    print(f"Predictions saved to {output_path}")

# Predict the first 100 rows of the 'reference' column and write them to a TSV file.
load_model_and_generate_n_predictions(
    model_directory='../models/t5-cechine-nmt-mined-detox1',
    dataset=df,
    input_column='reference',
    output_path='../data/results.tsv',
    n=100,
)


Predictions saved to ../data/results.tsv


In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

def generate_prediction_from_text(model_directory: str, input_text: str,
                                  tokenizer_name: str = 'ceshine/t5-paraphrase-paws-msrp-opinosis',
                                  **generate_kwargs) -> str:
    """
    Load a T5 model and a tokenizer, and generate a prediction for a single input text.

    The model weights are loaded from ``model_directory``. The tokenizer is loaded
    separately from ``tokenizer_name`` (it is NOT read from ``model_directory``).
    Both are loaded on every call.

    Args:
    model_directory (str): Path to the directory where the model is saved.
    input_text (str): Text to generate prediction from.
    tokenizer_name (str): Identifier passed to ``T5Tokenizer.from_pretrained``.
    **generate_kwargs: Extra keyword arguments forwarded unchanged to
        ``model.generate`` (for example ``max_new_tokens`` or ``num_beams``).
        When none are given, the library's default generation settings apply,
        which includes a short default output-length limit, so long inputs
        may yield visibly shortened outputs.

    Returns:
    str: The model's predicted output, decoded with special tokens removed.
    """
    # Load the model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_directory)
    tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

    # Tokenize the input text for the model
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate predictions; caller-supplied options override the library defaults
    output_sequences = model.generate(input_ids=input_ids, **generate_kwargs)

    # Only one input was given, so the first (and only) sequence is the answer
    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)

# Run the fine-tuned model on one deliberately abusive sample and show its rewrite.
prediction = generate_prediction_from_text(
    model_directory='../models/t5-cechine-nmt-mined-detox1',
    input_text="You motherfucker, come on you little ass… fuck with me, eh? You fucking little asshole, dickhead cocksucker…You fuckin' come on, come fuck with me! I'll get your ass, you jerk! Oh, you fuckhead motherfucker! Fuck all you and your family! Come on, you cocksucker, slime bucket, shitface turdball! Come on, you scum sucker, you fucking with me? Come on, you asshole",
)
print(prediction)




come on, you little punk, you punk, you punk, you punk
