In [1]:
import random
import re
import string

import pandas as pd
from nltk.tokenize import word_tokenize

# 1. Interleave Space

In [2]:
def interleave(word, char_to_interleave=' '):
    """
    Obfuscate a word by inserting a separator between every pair of adjacent characters.

    :type: word: abusive word on the df
    :char_to_interleave: separator placed between the characters of word; a space by
        default, or a preferred symbol such as "_"

    return new_word: obfuscated word; empty and one-character words are returned unchanged
    """
    # str.join also covers the empty string, which previously raised IndexError on word[0].
    return char_to_interleave.join(word)

# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet by interleaving spaces between their characters.

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = interleave(entry, " ")
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet

# Flag each tweet, list the lexicon words it contains and build its obfuscated version.
tweets['found'] = tweets['tweet'].apply(contains_swearwords, args=(swearwords,))
tweets['sw_found'] = tweets['tweet'].apply(return_sw, args=(swearwords,))
tweets['obfuscated_tweet'] = tweets['tweet'].apply(obfuscate_swearwords, args=(swearwords,))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_1interleave.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_1interleave.csv', index=False)

# 2. Swap Char

In [3]:
def swap(word):
    """
    Obfuscate a word by exchanging its first two characters.

    :type: word: abusive word on the df

    return: the word with characters 0 and 1 swapped; words shorter than two
        characters are returned unchanged
    """
    if len(word) >= 2:
        first, second, rest = word[0], word[1], word[2:]
        return second + first + rest
    return word

# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet by swapping their first two characters.

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = swap(entry)
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet


# Flag each tweet, list the lexicon words it contains and build its obfuscated version.
tweets['found'] = tweets['tweet'].apply(contains_swearwords, args=(swearwords,))
tweets['sw_found'] = tweets['tweet'].apply(return_sw, args=(swearwords,))
tweets['obfuscated_tweet'] = tweets['tweet'].apply(obfuscate_swearwords, args=(swearwords,))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_2swapchar.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_2swapchar.csv', index=False)

# 3. Replace o with 0

In [4]:
## 3. Create obfuscated test dataset using obfuscate_char: o -> 0 ##

def obfuscate_char(word, char_to_replace, new_char):
    """
    Obfuscate a word by replacing the first character that belongs to char_to_replace.

    :type: word: abusive word on the df
    :char_to_replace: characters eligible for replacement (e.g. "o" or "aiueo");
        only the first one met while scanning the word is replaced
    :new_char: text written in place of that character (e.g. "0" or "*")

    return new_word: obfuscated word, or the word unchanged when nothing matches
    """
    for position, char in enumerate(word):
        if char in char_to_replace:
            # Only the first hit is rewritten; the rest of the word is kept as is.
            return word[:position] + new_char + word[position + 1:]
    return word

# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet by turning their first "o" into "0".

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = obfuscate_char(entry, "o", '0')
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet

# Flag each tweet, list the lexicon words it contains and build its obfuscated version.
tweets['found'] = tweets['tweet'].apply(contains_swearwords, args=(swearwords,))
tweets['sw_found'] = tweets['tweet'].apply(return_sw, args=(swearwords,))
tweets['obfuscated_tweet'] = tweets['tweet'].apply(obfuscate_swearwords, args=(swearwords,))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_3replace_o.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_3replace_o.csv', index=False)

# 4. Omit Char

In [5]:
def ommit_char(word, char_to_replace, new_char):
    """
    Obfuscate a word by substituting every character that belongs to char_to_replace.

    :type: word: abusive word on the df
    :char_to_replace: characters to substitute wherever they occur (e.g. "aiueo")
    :new_char: text written in place of each of those characters; pass "" to omit
        them entirely. This argument used to be ignored and is now honoured.

    return new_word: obfuscated word
    """
    return "".join(new_char if char in char_to_replace else char for char in word)

# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet by dropping their vowels.

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = ommit_char(entry, "aiueo", '')
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet

# Flag each tweet, list the lexicon words it contains and build its obfuscated version.
tweets['found'] = tweets['tweet'].apply(contains_swearwords, args=(swearwords,))
tweets['sw_found'] = tweets['tweet'].apply(return_sw, args=(swearwords,))
tweets['obfuscated_tweet'] = tweets['tweet'].apply(obfuscate_swearwords, args=(swearwords,))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_4ommit_char.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_4ommit_char.csv', index=False)

# 5. Extra Char

In [6]:
def extra_char(word):
    """
    Obfuscate a word by appending one random lowercase ASCII letter ("a" to "z").

    The letter comes from the module-level `random` generator, so the result
    changes from call to call unless the caller seeds that generator.

    :type: word: abusive word on df

    return: the word followed by the random letter
    """
    # string.ascii_lowercase replaces the magic code points 97..122.
    return word + random.choice(string.ascii_lowercase)

# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet by appending a random letter to them.

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.
    The random letter is drawn once per matched lexicon word, so every occurrence
    of that word in the tweet receives the same letter.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = extra_char(entry)
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet

# Flag each tweet and list the lexicon words it contains.
tweets['found'] = tweets['tweet'].apply(lambda x: contains_swearwords(x, swearwords))
tweets['sw_found'] = tweets['tweet'].apply(lambda x: return_sw(x, swearwords))

# This technique draws random letters: seed the generator right before use so that
# re-running the cell regenerates the same dataset. The value 42 is arbitrary.
random.seed(42)
tweets['obfuscated_tweet'] = tweets['tweet'].apply(lambda x: obfuscate_swearwords(x, swearwords))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_5extra_char.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_5extra_char.csv', index=False)

# 6. First vowel to asterisks

In [7]:
def obfuscate_char(word, char_to_replace, new_char):
    """
    Obfuscate a word by replacing the first character that belongs to char_to_replace.

    :type: word: abusive word on the df
    :char_to_replace: characters eligible for replacement (e.g. "o" or "aiueo");
        only the first one met while scanning the word is replaced
    :new_char: text written in place of that character (e.g. "0" or "*")

    return new_word: obfuscated word, or the word unchanged when nothing matches
    """
    for position, char in enumerate(word):
        if char in char_to_replace:
            # Only the first hit is rewritten; the rest of the word is kept as is.
            return word[:position] + new_char + word[position + 1:]
    return word

# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet by turning their first vowel into "*".

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = obfuscate_char(entry, "aiueo", '*')
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet

# Flag each tweet, list the lexicon words it contains and build its obfuscated version.
tweets['found'] = tweets['tweet'].apply(contains_swearwords, args=(swearwords,))
tweets['sw_found'] = tweets['tweet'].apply(return_sw, args=(swearwords,))
tweets['obfuscated_tweet'] = tweets['tweet'].apply(obfuscate_swearwords, args=(swearwords,))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_6to_asterisks.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_6to_asterisks.csv', index=False)

In [8]:
def duplicate_char(word, char_to_duplicate, times=5):
    """
    Obfuscate a word by repeating the first character that belongs to char_to_duplicate.

    :type: word: abusive word on df
    :type: char_to_duplicate: characters eligible for duplication (e.g. 'aiueo');
        only the first one met while scanning the word is repeated
    :type: times: how many copies of that character the result contains; defaults
        to 5, the value that used to be hard-coded

    return new_word: obfuscated word, or the word unchanged when nothing matches
    """
    for position, char in enumerate(word):
        if char in char_to_duplicate:
            # Only the first hit is stretched; the rest of the word is kept as is.
            return word[:position] + char * times + word[position + 1:]
    return word

# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet by repeating their first vowel.

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = duplicate_char(entry, "aiueo")
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet

# Flag each tweet, list the lexicon words it contains and build its obfuscated version.
tweets['found'] = tweets['tweet'].apply(contains_swearwords, args=(swearwords,))
tweets['sw_found'] = tweets['tweet'].apply(return_sw, args=(swearwords,))
tweets['obfuscated_tweet'] = tweets['tweet'].apply(obfuscate_swearwords, args=(swearwords,))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_7duplicate_char.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_7duplicate_char.csv', index=False)

# 8. Random Obfuscation

In [9]:
def random_obfuscation(word):
    """
    Obfuscate a word with one technique picked at random among the eight listed below.

    The techniques are the helpers defined in the earlier cells (obfuscate_char,
    ommit_char, interleave, swap, duplicate_char, extra_char); they must already
    be defined when this function is called.

    :type: word: abusive word on df

    return: the word as rewritten by the selected technique
    """
    import random

    # Lambdas defer the call, so only the technique that is drawn actually runs.
    techniques = (
        lambda: obfuscate_char(word, "aeiou", '*'),
        lambda: obfuscate_char(word, "o", "0"),
        lambda: ommit_char(word, "aiueo", ""),
        lambda: interleave(word, " "),
        lambda: interleave(word, "_"),
        lambda: swap(word),
        lambda: duplicate_char(word, "aeiou"),
        lambda: extra_char(word),
    )
    method = random.randint(0, 7)
    return techniques[method]()
    
# Start this technique from the raw tweets and the keyword lexicon.
tweets = pd.read_csv("../data/OLID_test.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Unique entries of the 'word' column, in sorted order.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether at least one lexicon word occurs in the tweet as a whole token.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return: True if any lexicon word equals one of the tokens, otherwise False
    """
    tokens = set(word_tokenize(tweet))
    return any(entry in tokens for entry in lexicon)

def return_sw(tweet, lexicon):
    """
    Collect the lexicon words that occur in the tweet as whole tokens.

    :type: tweet: text to inspect; it is split with word_tokenize
    :type: lexicon: iterable of words to look for

    return sw: list of the matched words, in lexicon order
    """
    tokens = set(word_tokenize(tweet))
    return [entry for entry in lexicon if entry in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Obfuscate the lexicon words found in a tweet with a randomly chosen technique.

    A word is processed only when it is one of the tweet's word_tokenize tokens, and
    only standalone occurrences (not preceded or followed by a word character) are
    rewritten. A plain str.replace would also rewrite the word inside longer words.
    The technique is drawn once per matched lexicon word, so every occurrence of
    that word in the tweet is rewritten the same way.

    :type: tweet: text to obfuscate
    :type: lexicon: iterable of words to obfuscate

    return tweet: the tweet with standalone lexicon words obfuscated
    """
    tokens = set(word_tokenize(tweet))
    for entry in lexicon:
        if entry in tokens:
            obfuscated = random_obfuscation(entry)
            # Look-arounds stop the match from starting or ending inside a longer word.
            standalone = r"(?<!\w)" + re.escape(entry) + r"(?!\w)"
            # A function replacement inserts the text literally (no backslash processing).
            tweet = re.sub(standalone, lambda _match: obfuscated, tweet)
    return tweet

# Flag each tweet and list the lexicon words it contains.
tweets['found'] = tweets['tweet'].apply(lambda x: contains_swearwords(x, swearwords))
tweets['sw_found'] = tweets['tweet'].apply(lambda x: return_sw(x, swearwords))

# This technique is stochastic: seed the generator right before use so that
# re-running the cell regenerates the same dataset. The value 42 is arbitrary.
random.seed(42)
tweets['obfuscated_tweet'] = tweets['tweet'].apply(lambda x: obfuscate_swearwords(x, swearwords))

# Keep only the exported columns, in this order.
tweets = tweets[['tweet', 'obfuscated_tweet', 'found', 'sw_found', 'label']]
tweets.to_csv('../data/olid_obf/OLID_OBF_8random_obf.csv', index=False)

# BERT variant: same rows, with the labels NOT/OFF written as 0/1.
tweets['label'] = tweets['label'].replace({'NOT': 0, 'OFF': 1})
tweets.to_csv('../data/olid_obf_bert/OLID_OBF_bert_8random_obf.csv', index=False)