In [1]:
import re

import pandas as pd
from nltk.tokenize import word_tokenize

# 1. Interleave space


In [6]:
def interleave(word, char_to_interleave=' '):
    """
    Obfuscate a word by inserting a separator between each pair of adjacent characters.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :param char_to_interleave: separator placed between the characters;
        defaults to a single space (pass "_" to interleave underscores)

    :return: the interleaved word; an empty or one-character word comes back unchanged
    """
    # str.join handles every length, including the empty string
    # (indexing word[0] would raise IndexError there).
    return char_to_interleave.join(word)

# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with its space-interleaved form.

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            obfuscated = interleave(l, " ")
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_1interleave.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_1interleave.csv', index=False)

# 2. Swap Two Characters

In [7]:
def swap(word):
    """
    Obfuscate a word by exchanging its first two characters.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :return: the word with its first and second characters swapped; a word
        shorter than two characters is returned unchanged
    """
    if len(word) >= 2:
        first, second, rest = word[0], word[1], word[2:]
        return second + first + rest
    return word
    
# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with its swapped form (see swap).

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            obfuscated = swap(l)
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_2swapchar.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_2swapchar.csv', index=False)

# 3. Replace o with 0

In [9]:
def obfuscate_char(word, char_to_replace, new_char):
    """
    Obfuscate a word by substituting the first character that belongs to a given set.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :param char_to_replace: characters eligible for replacement (e.g. "o" or "aiueo");
        only the first occurrence of any of them in the word is replaced
    :param new_char: text put in place of the replaced character (e.g. "0" or "*")

    :return: the obfuscated word, or the word unchanged when no eligible character occurs
    """
    for position, char in enumerate(word):
        if char in char_to_replace:
            # Splice the replacement in at the first hit and stop scanning.
            return word[:position] + new_char + word[position + 1:]
    return word

# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with a copy whose first "o" is a "0".

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            obfuscated = obfuscate_char(l,"o",'0')
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_3replace_o.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_3replace_o.csv', index=False)

# 4. Omit Char

In [10]:
def ommit_char(word, char_to_replace, new_char):
    """
    Obfuscate a word by dropping every character that belongs to a given set.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :param char_to_replace: characters to remove (e.g. "aiueo" to strip the vowels)
    :param new_char: accepted but not used; matching characters are always removed

    :return: the word without the removed characters
    """
    kept = [char for char in word if char not in char_to_replace]
    return "".join(kept)

# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with a copy stripped of the characters "aiueo".

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            obfuscated = ommit_char(l,"aiueo",'')
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_4ommit_char.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_4ommit_char.csv', index=False)

# 5. Extra Char

In [11]:
def extra_char(word):
    """
    Obfuscate a word by appending one random lowercase ASCII letter.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :return: the word followed by a random letter between "a" and "z"
    """
    import random
    # Code points 97..122 are "a".."z"; randint includes both ends.
    suffix_code = random.randint(ord("a"), ord("z"))
    return "".join((word, chr(suffix_code)))

# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with the result of extra_char.

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            # One call per lexicon entry: every occurrence of the entry in this
            # comment receives the same obfuscated form.
            obfuscated = extra_char(l)
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_5extra_char.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_5extra_char.csv', index=False)

# 6. Replace First Vowel with an Asterisk (*)

In [12]:
def obfuscate_char(word, char_to_replace, new_char):
    """
    Obfuscate a word by substituting the first character that belongs to a given set.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :param char_to_replace: characters eligible for replacement (e.g. "aiueo" or "o");
        only the first occurrence of any of them in the word is replaced
    :param new_char: text put in place of the replaced character (e.g. "*" or "0")

    :return: the obfuscated word, or the word unchanged when no eligible character occurs
    """
    # Index of the first eligible character, or None when the word has none.
    hit = next((i for i, c in enumerate(word) if c in char_to_replace), None)
    if hit is None:
        return word
    return "".join([word[:hit], new_char, word[hit + 1:]])

# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with a copy whose first vowel is a "*".

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            obfuscated = obfuscate_char(l,"aiueo",'*')
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_6to_asterisks.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_6to_asterisks.csv', index=False)

# 7. Duplicate first vowel

In [13]:
def duplicate_char(word, char_to_duplicate, times=5):
    """
    Obfuscate a word by repeating the first character that belongs to a given set.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :param char_to_duplicate: characters eligible for repetition (e.g. "aiueo");
        only the first occurrence of any of them in the word is repeated
    :param times: total number of copies of that character in the result
        (default 5)

    :return: the obfuscated word, or the word unchanged when no eligible character occurs
    """
    for position, char in enumerate(word):
        if char in char_to_duplicate:
            # Expand the first hit in place and stop scanning.
            return word[:position] + char * times + word[position + 1:]
    return word

# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with a copy whose first vowel is repeated.

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            obfuscated = duplicate_char(l, "aiueo")
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_7duplicate_char.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_7duplicate_char.csv', index=False)

# 8. Random Obfuscation

In [14]:
def random_obfuscation(word):
    """
    Obfuscate a word with one of eight obfuscation variants, picked at random.

    :param word: the word to obfuscate (e.g. an abusive keyword)
    :return: the word transformed by the selected variant
    """
    import random

    # Position i holds the transformation applied when the draw equals i.
    variants = (
        lambda w: obfuscate_char(w, "aeiou", '*'),
        lambda w: obfuscate_char(w, "o", "0"),
        lambda w: ommit_char(w, "aiueo", ""),
        lambda w: interleave(w, " "),
        lambda w: interleave(w, "_"),
        lambda w: swap(w),
        lambda w: duplicate_char(w, "aeiou"),
        lambda w: extra_char(w),
    )
    method = random.randint(0, 7)  # both bounds are inclusive
    return variants[method](word)
    
# Load the comments to obfuscate and the keyword table that supplies the lexicon.
comment = pd.read_csv("../data/ALYT_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Lexicon: the distinct values of the keyword 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(comment, lexicon):
    """Return True when at least one lexicon entry occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return any(entry in tokens for entry in lexicon)

def return_sw(comment, lexicon):
    """Return, in lexicon order, every lexicon entry that occurs among the comment's tokens."""
    tokens = set(word_tokenize(comment))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(comment, lexicon):
    """
    Replace each swearword found in the comment with the result of random_obfuscation.

    A lexicon entry is rewritten only when it occurs among the tokens produced by
    word_tokenize, and only its whole-word occurrences are replaced, so the same
    letters inside a longer word are left untouched.

    :param comment: comment text to process
    :param lexicon: iterable of swearwords to look for
    :return: the comment with the matching swearwords obfuscated
    """
    clean_comment = set(word_tokenize(comment))
    for l in lexicon:
        if l in clean_comment:
            # One call per lexicon entry: every occurrence of the entry in this
            # comment receives the same obfuscated form.
            obfuscated = random_obfuscation(l)
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character still match as whole words.
            pattern = r'(?<!\w)' + re.escape(l) + r'(?!\w)'
            # A callable replacement is inserted verbatim (no backslash or
            # group-reference expansion).
            comment = re.sub(pattern, lambda _match: obfuscated, comment)
    return comment

# Add three derived columns: whether any lexicon token was found, which entries
# were found, and the obfuscated version of the comment text.
comment['found'] = comment['comment'].apply(lambda x: contains_swearwords(x, swearwords))
comment['sw_found'] =comment['comment'].apply(lambda x: return_sw(x, swearwords))
comment['obfuscated_comment'] = comment['comment'].apply(lambda x: obfuscate_swearwords(x, swearwords))
# Keep only the columns that are written out, then save with the original labels.
comment = comment[['comment', 'obfuscated_comment', 'found', 'sw_found', 'label']]
comment.to_csv('../data/alyt_obf/ALYT_OBF_8random_obf.csv', index=False)

# Second copy with the labels recoded NOT -> 0 and ABU -> 1, saved to the BERT folder.
comment['label'] = comment['label'].replace({'NOT': 0, 'ABU': 1})
comment.to_csv('../data/alyt_obf_bert/ALYT_OBF_bert_8random_obf.csv', index=False)