In [32]:
import nltk
from nltk.corpus import wordnet
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import re
from transformers import AutoTokenizer
from datasets import load_dataset

In [13]:
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dasha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Dasha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dasha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dasha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
data_file = '../data/intermediate/filtered_data.tsv'

data = pd.read_csv(data_file, sep='\t')

data.head()

Unnamed: 0,reference,translation,ref_tox,trn_tox
0,if alkar floods her with her mental waste it w...,if alkar is flooding her with psychic waste th...,0.981983,0.014195
1,i have orders to kill her,ive got orders to put her down,0.999348,0.009402
2,im not gonna have a child with the same geneti...,im not going to breed kids with a genetic diso...,0.950956,0.035846
3,theyre all laughing at us so well kick your ass,theyre laughing at us well show you,0.999492,0.000131
4,youd probably want me to buy you some chocolat...,i suppose you want me to buy you flowers and c...,0.980341,7.8e-05


Creating an algorithm that identifies negative sentiment words and replaces them with synonyms involves several steps:

- Sentiment Analysis: Perform sentiment analysis on each sentence to determine the overall sentiment and identify the negative words.

- Word Tokenization: Break down the sentences into individual words or tokens.

- Part-of-Speech Tagging: Determine the part of speech for each word because synonyms need to match the original part of speech to make sense in context.

- Synonym Replacement: For each identified negative word, find synonyms and select an appropriate replacement that is neutral or positive.

- Reconstruction: Reconstruct the sentence with the new synonym(s).

In [19]:
# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to identify negative words based on sentiment analysis
def identify_negative_words(text):
    tokenized_text = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokenized_text)
    
    negative_words = []
    for word, tag in pos_tags:
        if sia.polarity_scores(word)['compound'] < 0:
            # Check if word is an adverb (often intensify negativity)
            if tag.startswith('RB') or tag.startswith('RBR') or tag.startswith('RBS'):
                negative_words.append((word, tag, 'omit'))  # Mark for omission
            else:
                negative_words.append((word, tag, 'replace'))  # Mark for replacement
    return negative_words

# Function to find synonyms
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word:
                synonyms.add(lemma.name().replace('_', ' '))
    return list(synonyms)

# Function to replace or omit negative words
def modify_negative_words(text, negative_words):
    tokenized_text = word_tokenize(text)
    modified_text = []
    
    for word in tokenized_text:
        negation = next((neg for neg in negative_words if neg[0] == word), None)
        if negation:
            word, tag, action = negation
            if action == 'replace':
                synonyms = get_synonyms(word)
                if synonyms:
                    word = synonyms[0]  # Just pick the first synonym for simplicity
            elif action == 'omit':
                word = ''  # Omit the word from the sentence
        
        # Add the word if it's not marked for omission
        if word:
            modified_text.append(word)
            
    return ' '.join(modified_text)

In [20]:
data = pd.DataFrame({
    'reference': [
        'I hate this shit so much.',
        'The food was fucking terrible.',
        'He is a horrible nigger.',
        'They are laughing because we will fuck up.'
    ]
})

for index, row in data.iterrows():
    sentence = row['reference']
    negative_words = identify_negative_words(sentence)
    new_sentence = modify_negative_words(sentence, negative_words)
    data.at[index, 'translation'] = new_sentence

data.head()

Unnamed: 0,reference,translation
0,I hate this shit so much.,I hatred this Irish bull so much .
1,The food was fucking terrible.,The food was fucking fearful .
2,He is a horrible nigger.,He is a horrifying spade .
3,They are laughing because we will fuck up.,They are laughing because we will make out up .


I've added feature of ommiting negative sentiments words if there are no synonyms for them. But it didn't work.

In [25]:
sia = SentimentIntensityAnalyzer()

def identify_negative_words(text):
    tokenized_text = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokenized_text)
    
    negative_words = []
    for word, tag in pos_tags:
        if sia.polarity_scores(word)['compound'] < 0:
            negative_words.append((word, tag))
    return negative_words

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word:
                synonyms.add(lemma.name().replace('_', ' '))
    return list(synonyms)

def modify_negative_words(text, negative_words):
    for word, tag in negative_words:
        synonyms = get_synonyms(word)
        non_negative_synonyms = [syn for syn in synonyms if sia.polarity_scores(syn)['compound'] >= 0]
        if non_negative_synonyms:
            text = text.replace(word, non_negative_synonyms[0], 1)
    return text

def reframe_sentence(text, negative_words):
    tokenized_text = word_tokenize(text)
    for word, tag in negative_words:
        # Check for negation patterns in the sentence
        if word.lower() in ["not", "no", "never", "n't"] and len(tokenized_text) > tokenized_text.index(word) + 1:
            # Attempt to negate the following word directly if it's an adjective or verb
            next_word = tokenized_text[tokenized_text.index(word) + 1]
            if tag.startswith('JJ') or tag.startswith('VB'):
                synonyms = get_synonyms(next_word)
                non_negative_synonyms = [syn for syn in synonyms if sia.polarity_scores(syn)['compound'] >= 0]
                if non_negative_synonyms:
                    # Replace the negation + word with a non-negative synonym
                    text = text.replace(word + ' ' + next_word, non_negative_synonyms[0], 1)
        else:
            synonyms = get_synonyms(word)
            non_negative_synonyms = [syn for syn in synonyms if sia.polarity_scores(syn)['compound'] >= 0]
            if non_negative_synonyms:
                text = text.replace(word, non_negative_synonyms[0], 1)
    return text

In [28]:
data = pd.DataFrame({
    'reference': [
        'I hate this shit so much.',
        'The food was fucking terrible.',
        'He is a horrible nigger.',
        'They are laughing because we will fuck up.'
    ]
})

for index, row in data.iterrows():
    sentence = row['reference']
    negative_words = identify_negative_words(sentence)
    
    # Step 1: Modify negative words
    modified_sentence = modify_negative_words(sentence, negative_words)
    
    # Step 2: Check the sentiment of the modified sentence
    reframed_sentence = reframe_sentence(modified_sentence, negative_words)
    data.at[index, 'translation'] = reframed_sentence

data.head()

Unnamed: 0,reference,translation
0,I hate this shit so much.,I detest this Irish bull so much.
1,The food was fucking terrible.,The food was fucking tremendous.
2,He is a horrible nigger.,He is a atrocious spade.
3,They are laughing because we will fuck up.,They are laughing because we will make out up.


I've added sentence reconstruction after pulling in synonyms. It didn't work well.

Let's try different approach.

I downloaded list of banned words from https://www.freewebheaders.com/full-list-of-bad-words-banned-by-google/ by Google. Let's try to work with this.

In [31]:
banned_words_file = '../data/external/bad-words.txt'

with open(banned_words_file, 'r') as file:
    banned_words_list = file.read().splitlines()

banned_df = pd.DataFrame(banned_words_list, columns=['banned_word'])

banned_df.head()

Unnamed: 0,banned_word
0,2 girls 1 cup
1,2g1c
2,4r5e
3,5h1t
4,5hit


Integrate the given get_synonyms function into the workflow. If a synonym can't be found for a banned word, the word will be omitted from the sentence.

In [42]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name().lower() != word:
                synonyms.add(lemma.name().lower().replace('_', ' '))
    return list(synonyms)

# Define the function to identify and replace negative phrases
def detoxify_sentence(sentence, banned_words_df):
    # Lowercase the sentence to ensure proper matching
    lowered_sentence = sentence.lower()
    for banned_phrase in banned_words_df['banned_word']:
        # Check if the banned phrase is in the sentence
        if re.search(r'\b' + re.escape(banned_phrase) + r'\b', lowered_sentence, flags=re.IGNORECASE):
            # Split the banned phrase to check for synonyms for single words only
            banned_words = banned_phrase.split()
            # Only find synonyms if the banned phrase is a single word
            if len(banned_words) == 1:
                synonyms = get_synonyms(banned_words[0])
                if synonyms:
                    replacement = synonyms[0]
                else:
                    replacement = ''
            else:
                # For multi-word phrases, it's more complex to find synonyms
                # So we'll opt to just remove the phrase
                replacement = ''
            # Replace the banned phrase with the replacement or remove it
            sentence = re.sub(r'\b' + re.escape(banned_phrase) + r'\b', replacement, sentence, flags=re.IGNORECASE)
    return sentence.strip()

In [47]:
data = pd.DataFrame({
    'reference': [
        'That was a stupid idea, you ass dlck!',
        'Motherfucka! a55! style doggy!',
        'He is such a jerk when he drinks.',
        'She is a slutty fox in her debates.',
        'They are laughing because we will fuck up.',
    ]
})

# Apply the detoxify_sentence function to the 'reference' column
data['translation'] = data['reference'].apply(lambda x: detoxify_sentence(x, banned_df))

data.head()

Unnamed: 0,reference,translation
0,"That was a stupid idea, you ass dlck!","That was a pudden-head idea, you nates !"
1,Motherfucka! a55! style doggy!,! ! !
2,He is such a jerk when he drinks.,He is such a twitch when he drinks.
3,She is a slutty fox in her debates.,She is a slutty fox in her debates.
4,They are laughing because we will fuck up.,They are laughing because we will make out up.


In [None]:
# Load your data into a Hugging Face Dataset
data = load_dataset('csv', data_files='../data/intermediate/filtered_data.tsv', sep='\t')['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

### Tokenization

This tokenizer will convert text into a sequence of integers, where each integer represents a specific token as understood by the transformer model.
- model_inputs would have input_ids and attention_mask, which are fed to the model.
- labels are the expected correct outputs during training, which the model will learn to predict.
- In a sequence-to-sequence model like T5, during training, the model uses attention_mask to know which tokens are padding and which are not.

In [None]:
tokenizer = AutoTokenizer.from_pretrained('t5-small')

def tokenize_function(examples):
    return tokenizer(examples['reference'], examples['translation'], 
                     max_length=128, truncation=True, padding='max_length')

def prepare_data(examples):
    # Tokenize the reference texts
    model_inputs = tokenizer(examples["reference"], max_length=128, truncation=True, padding="max_length")

    # Tokenize the translation texts with the same tokenizer but do not pad yet, as we need raw token ids for labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["translation"], max_length=128, truncation=True)["input_ids"]

    # Pad labels to max_length
    labels = [label + [tokenizer.pad_token_id] * (128 - len(label)) for label in labels]

    model_inputs["labels"] = labels

    return model_inputs

tokenized_data = data.map(tokenize_function, batched=True)
model_data = tokenized_data.map(prepare_data, batched=True)

print(model_data.column_names)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



Map:   0%|          | 0/336256 [00:00<?, ? examples/s]

Map:   0%|          | 0/336256 [00:00<?, ? examples/s]



['reference', 'translation', 'ref_tox', 'trn_tox', 'input_ids', 'attention_mask', 'labels']


In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import load_dataset, DatasetDict

# Assuming 'model_data' is your tokenized and prepared dataset
columns_to_remove = ['reference', 'translation', 'ref_tox', 'trn_tox']
for column in columns_to_remove:
    if column in model_data.features:
        model_data = model_data.remove_columns(column)

# Now model_data should only contain the columns necessary for training
print(model_data.column_names)

# Update the dataset split since we modified model_data
dataset = DatasetDict({
    'train': model_data.train_test_split(test_size=0.1)['train'],
    'validation': model_data.train_test_split(test_size=0.1)['test']
})

# Initialize the T5 model for sequence-to-sequence LM
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Data collator used for dynamically padding the inputs and labels
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True
)


['input_ids', 'attention_mask', 'labels']


In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2084,0.190702
2,0.2017,0.185323
3,0.1994,0.183889


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


TrainOutput(global_step=28374, training_loss=0.23742393342537954, metrics={'train_runtime': 2658.1415, 'train_samples_per_second': 341.551, 'train_steps_per_second': 10.674, 'total_flos': 3.071886703460352e+16, 'train_loss': 0.23742393342537954, 'epoch': 3.0})

In [None]:
# saving model
trainer.save_model('models/model 1')

In [None]:
# loading the model and run inference for it
model = AutoModelForSeq2SeqLM.from_pretrained('models/model 1')
model.eval()
model.config.use_cache = False

In [None]:
def translate(model, inference_request, tokenizer=tokenizer):
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True,temperature=0))

In [None]:
inference_request = 'Eat my juicy fat cock'
translate(model, inference_request,tokenizer)

eat my juicy fat cock


