In [13]:
%pip install pandas -q
%pip install torch -q
%pip install transformers -q
%pip install ipywidgets -q
%pip install --user -U nltk -q

# Imports

In [15]:
import zipfile
import pandas as pd
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import re

# Initial data exploration

In [23]:
zip_file_path = '../data/raw/filtered_paranmt.zip'
extracted_dir = '../data/raw/'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir)

In [24]:
tsv_file_name = 'filtered.tsv' 

# Construct the full path to the TSV file
tsv_file_path = os.path.join(extracted_dir, tsv_file_name)
print(f"Path to tsv file: {tsv_file_path}")

# Read the TSV file into a DataFrame
df = pd.read_csv(tsv_file_path, delimiter='\t')

df.head()


Path to tsv file: ../data/raw/filtered.tsv


Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


## Preprocesing

### Preprocessing - swap translation and reference

In [25]:
def swap_ref_trn(row):
    if row['ref_tox'] < row['trn_tox']:
        row['reference'], row['translation'] = row['translation'], row['reference']
        row['ref_tox'], row['trn_tox'] = row['trn_tox'], row['ref_tox']
    return row

In [26]:
def add_stat_columns(df):
    df['tox_diff'] = round(df['ref_tox'] - df['trn_tox'], 3)
    return df

In [27]:
def round_columns(df):
    df['similarity'] = round(df['similarity'], 2)
    df['lenght_diff'] = round(df['lenght_diff'], 2)
    return df

### 

### Main preprocessing function

In [28]:
def preprocess(df):
    df = df.apply(swap_ref_trn, axis=1)
    df = add_stat_columns(df)
    df = round_columns(df)
    return df

In [82]:
df_processed = preprocess(df)

In [34]:
df_processed

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,tox_diff,length_diff
0,0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.79,0.010309,0.981983,0.014195,0.968,0.01
1,1,you're becoming disgusting.,Now you're getting nasty.,0.75,0.071429,0.999039,0.065473,0.934,0.07
2,2,"well, we can spare your life.","Well, we could spare your life, for one.",0.92,0.268293,0.985068,0.213313,0.772,0.27
3,3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.66,0.309524,0.994215,0.053362,0.941,0.31
4,4,I have orders to kill her.,I've got orders to put her down.,0.73,0.181818,0.999348,0.009402,0.990,0.18
...,...,...,...,...,...,...,...,...,...
577772,577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...,0.87,0.030769,0.949143,0.000121,0.949,0.03
577773,577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.72,0.058824,0.996124,0.215794,0.780,0.06
577774,577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.62,0.212121,0.984538,0.000049,0.984,0.21
577775,577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.68,0.358209,0.991945,0.000124,0.992,0.36


### Maybe I should assign more weight to the high toxic and good paraphrased data points when training

In [21]:
df_processed = df_processed.sort_values(by=['tox_diff', 'similarity', 'length_diff'], ascending=[False, False, True]).iloc[:10].to_numpy()

NameError: name 'df_processed' is not defined

# Data preprocessing - basic ideas

### Common preprocessing steps:
1. Lowercasting
1. Removing punctuation
1. Removing numbers
1. Stemming and lemmatization
1. Tokenization

In [79]:
def lower_text(text: str):
    return text.lower()

def remove_numbers(text: str):
    without_numbers = re.sub(r'\d+', ' ', text)
    return without_numbers

def remove_punctuation(text: str):
    without_punctuation = re.sub(r'[^a-z|\s]+', ' ', text)
    return without_punctuation

def remove_multiple_spaces(text: str):
    without_doublespace = re.sub('\s+', ' ', text).strip()
    return without_doublespace

nltk.download('stopwords')
nltk.download('punkt')
stopwords_set = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def tokenize_text(text: str):
    return word_tokenize(text)

def remove_stop_words(tokenized_text: list[str]):
    return [
        w for w in tokenized_text
        if w not in stopwords_set
    ]

def stem_words(tokenized_text: list[str]):
    return [
        ps.stem(w)
        for w in tokenized_text
    ]

def clean_data(sentence):
    _lowered = lower_text(sentence)
    _without_numbers = remove_numbers(_lowered)
    _without_punct = remove_punctuation(_without_numbers)
    _single_spaced = remove_multiple_spaces(_without_punct)
    _tokenized = tokenize_text(_single_spaced)
    _without_sw = remove_stop_words(_tokenized)
    _stemmed = stem_words(_without_sw)
    return _stemmed
    
def clean_dataframe(df, filename='cleaned.csv', dir='../data/interim/', override=False):
    file = os.path.join(dir, filename)

    # Check cache
    if (os.path.exists(file) and not override):
        return pd.read_csv(file)
    
    df['reference'] = df['reference'].apply(lambda s: clean_data(s))
    df['translation'] = df['translation'].apply(lambda s: clean_data(s))

    # Cache version
    df.to_csv(file, index=False)

    return df


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [83]:
df_cleaned = clean_dataframe(df_processed, override=True)
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,tox_diff,length_diff
0,0,"[alkar, flood, mental, wast, would, explain, h...","[alkar, flood, psychic, wast, explain, high, l...",0.79,0.010309,0.981983,0.014195,0.968,0.01
1,1,"[becom, disgust]","[get, nasti]",0.75,0.071429,0.999039,0.065473,0.934,0.07
2,2,"[well, spare, life]","[well, could, spare, life, one]",0.92,0.268293,0.985068,0.213313,0.772,0.27
3,3,"[monkey, wake]","[ah, monkey, got, snap]",0.66,0.309524,0.994215,0.053362,0.941,0.31
4,4,"[order, kill]","[got, order, put]",0.73,0.181818,0.999348,0.009402,0.99,0.18


### Check what words were removed after paraphrasing

In [None]:
from collections import Counter
import ast
counter = Counter()

for i in df_cleaned.iterrows():
    ref = i[1]['reference']
    trn = i[1]['translation']

    if not isinstance(ref, list):    
        ref = ast.literal_eval(ref)

    if not isinstance(ref, list):    
        trn = ast.literal_eval(trn)
    
    
    for r in ref:
        if (r not in trn): 
            counter[r] += 1

    

In [1]:
'fuck' in counter, 'idiot' in counter

NameError: name 'counter' is not defined

In [146]:
import numpy as np
import torch
import torch.nn.functional as nn
# counter.keys(), 
inp = torch.tensor(list(counter.values()), dtype=torch.float)
probs = inp / inp.max()

percent = 0.15

z = zip(counter.keys(), probs)
for i, j in enumerate(z):
    
    if ( j[1] > percent): print(j)

('kill', tensor(0.2149))
('gon', tensor(0.2484))
('na', tensor(0.2793))
('ass', tensor(0.2120))
('go', tensor(0.2452))
('shit', tensor(0.5674))
('like', tensor(0.2144))
('fuck', tensor(1.))
('got', tensor(0.1766))
('damn', tensor(0.4085))
('get', tensor(0.2468))
('hell', tensor(0.2663))
('stupid', tensor(0.1612))


# Evalution
## Methods
1. BLUE score
1. Perplexity
1. BERT assesment
1. Other nlp assesment


# Test

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"  # You can use other model names like "gpt2-medium" or "gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
2023-09-30 12:37:02.770132: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-09-30 12:37:02.770185: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-09-30 12:37:02.775373: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-09-30 12:37:03.817451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [179]:
train_texts = [
    {"toxic": "You're such an idiot!", "rephrased": "I disagree with you."},
    {"toxic": "This is terrible!", "rephrased": "This is not ideal."},
    # Add more pairs as needed
]

tokenizer.pad_token = tokenizer.eos_token
def tok(sent):
    return tokenizer(sent, padding='max_length', truncation=True, max_length=10, return_tensors="pt")

tok(train_texts[0]['toxic']), tok(train_texts[0]['rephrased'])

({'input_ids': tensor([[ 1639,   821,   884,   281, 22324, 21551,     0, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])},
 {'input_ids': tensor([[   40, 12546,   351,   345,    13, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])})

In [11]:
prompt = "What is the synonim to the word fuck? Tell only one word as our output"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True, tempreture=0)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the synonim to the word fuck? Tell only one word as our output.

The synonym for fuck is a word that is used to describe a sexual act. It is often used in the context of a relationship, but


# Methods
1. Select toxic words or parts and then paraphrase them
1. Translate into French as the most similar to English and then back to English


identify toxic words and replace them with non-toxic synonyms.
since Such markers (i) carry most of stylistic information of a sentence(i.e.their presence is a strong indicator of this class),(ii) have synonyms which are free from this stylistic information.
need to somehow identify what words are toxic in the current dataset