In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize

In [6]:
# Parse the expanded lexicon. Each line is read as "<word>_<suffix>\t<score>":
# the word is the text before the first underscore, the score is the second
# tab-separated field.
with open('../keyword/expandedLexicon.txt', 'r') as f:
    lines = list(f)

words = [entry.partition('_')[0] for entry in lines]
scores = [float(entry.split('\t')[1].strip('\n')) for entry in lines]

# Keep only the entries with a strictly positive score.
df = pd.DataFrame({'word': words, 'score': scores})
df = df[df['score'].gt(0)]

# Export is disabled here; a later cell reads '../keyword/abusive_lexicon.csv'.
# df.to_csv('../keyword/abusive_lexicon.csv', index=False)
df.head()

Unnamed: 0,word,score
0,horrible,3.679601
1,disgusting,3.493682
2,moron,3.469677
3,bastard,3.399238
4,stupid,3.323882


In [3]:
def obfuscate_char(word, char_to_replace, new_char):
    """
    Mask a word by substituting the first character that belongs to a given set.

    :param word: the word to obfuscate
    :param char_to_replace: string of candidate characters (e.g. "aeiou" or "o");
        only the first character of ``word`` found in it is substituted
    :param new_char: text written in place of that character (e.g. "*" or "0")

    :return: the obfuscated word; equal to ``word`` when nothing matches
    """
    for position, letter in enumerate(word):
        if letter in char_to_replace:
            # Only the first hit is replaced; the remainder is copied verbatim.
            return word[:position] + new_char + word[position + 1:]
    return word

def ommit_char(word, char_to_replace, new_char):
    """
    Obfuscate a word by dropping every character that belongs to a given set.

    :param word: the word to obfuscate
    :param char_to_replace: string of characters to remove (e.g. "aiueo");
        every occurrence is removed, not just the first
    :param new_char: unused; the matching characters are always deleted

    :return: ``word`` without any of the characters in ``char_to_replace``
    """
    kept = [letter for letter in word if letter not in char_to_replace]
    return "".join(kept)

def interleave(word, char_to_interleave=' '):
    """
    Obfuscate a word by inserting a separator between every pair of characters.

    :param word: the word to obfuscate
    :param char_to_interleave: separator placed between consecutive characters;
        defaults to a single space (pass "_" for underscores)

    :return: the interleaved word; empty and one-character words are returned
        unchanged because there is no gap to fill
    """
    # str.join also covers the empty string, which previously raised IndexError.
    return char_to_interleave.join(word)

def swap(word):
    """
    Obfuscate a word by exchanging its first two characters.

    :param word: the word to obfuscate

    :return: the word with characters 0 and 1 swapped; words shorter than two
        characters are returned unchanged
    """
    if len(word) >= 2:
        first, second, rest = word[0], word[1], word[2:]
        return second + first + rest
    return word

def duplicate_char(word, char_to_duplicate):
    """
    This function will modify the word by adding extra character of first found vowel
    
    :type: word: abusive word on df
    :type: char_to_duplicate ('aiueo')
    
    return new_word: obfuscated word
    """
    first_char_found = False
    new_word = ""

    for char in word:
        if not first_char_found and char in char_to_duplicate:
            new_word += char * 5
            first_char_found = True
        else:
            new_word += char

    return new_word

def extra_char(word):
    """
    Obfuscate a word by appending one random lowercase ASCII letter.

    The letter is drawn from the global ``random`` generator, so the result is
    only reproducible when that generator has been seeded.

    :param word: the word to obfuscate

    :return: ``word`` followed by a random character in "a".."z"
    """
    import random
    # Code points 97..122 inclusive, i.e. 'a' through 'z'.
    suffix_code = random.randrange(ord("a"), ord("z") + 1)
    return word + chr(suffix_code)

def random_obfuscation(word):
    """
    Obfuscate a word with one of the eight strategies above, picked uniformly
    at random from the global ``random`` generator.

    :param word: the word to obfuscate

    :return: the word transformed by the selected strategy
    """
    import random

    # Index in this tuple == strategy number. Each entry is wrapped in a lambda
    # so that only the selected strategy is evaluated.
    strategies = (
        lambda w: obfuscate_char(w, "aeiou", '*'),   # first vowel -> *
        lambda w: obfuscate_char(w, "o", "0"),       # first o -> 0
        lambda w: ommit_char(w, "aiueo", ""),        # drop all vowels
        lambda w: interleave(w, " "),                # space between characters
        lambda w: interleave(w, "_"),                # underscore between characters
        lambda w: swap(w),                           # swap first two characters
        lambda w: duplicate_char(w, "aeiou"),        # stretch first vowel
        lambda w: extra_char(w),                     # append a random letter
    )
    chosen = random.randint(0, len(strategies) - 1)
    return strategies[chosen](word)

In [4]:
# Build the obfuscation table: one column per strategy for every lexicon word,
# with the score column dropped before export.
# NOTE(review): this writes "keyword_obfuscated1.csv" while the later matching
# cell reads "keyword_obfuscated.csv" -- confirm which file is intended.
df = (
    pd.read_csv('../keyword/abusive_lexicon.csv')
    .pipe(lambda lexicon: lexicon.assign(
        to_asterisks=lexicon['word'].apply(lambda w: obfuscate_char(w, "aiueo", "*")),
        to_number=lexicon['word'].apply(lambda w: obfuscate_char(w, "o", "0")),
        to_ommit=lexicon['word'].apply(lambda w: ommit_char(w, "aiueo", "")),
        interleave_space=lexicon['word'].apply(lambda w: interleave(w, " ")),
        interleave_underscore=lexicon['word'].apply(lambda w: interleave(w, "_")),
        swap=lexicon['word'].apply(swap),
        duplicate_char=lexicon['word'].apply(lambda w: duplicate_char(w, "aeiou")),
        extra_char=lexicon['word'].apply(extra_char),
        random=lexicon['word'].apply(random_obfuscation),
    ))
    .drop(columns=["score"])
)

df.to_csv("../keyword/keyword_obfuscated1.csv", index=False)

In [23]:
# Load the tweets to scan and the keyword table produced from the lexicon.
tweets = pd.read_csv("../data/OLID_train.csv")
keyword = pd.read_csv("../keyword/keyword_obfuscated.csv")

# Deduplicate the lexicon words and sort them so matching order is stable.
swearwords = list(set(keyword['word'].tolist()))
swearwords.sort()

def contains_swearwords(tweet, lexicon):
    """
    Tell whether a tweet contains at least one lexicon word as a whole token.

    :param tweet: raw tweet text, tokenized with ``word_tokenize``
    :param lexicon: iterable of words to look for (exact, case-sensitive match)

    :return: True if any lexicon word is among the tweet's tokens, else False
    """
    tokens = set(word_tokenize(tweet))
    return any(term in tokens for term in lexicon)

def return_sw(tweet, lexicon):
    """
    List the lexicon words that appear in a tweet as whole tokens.

    :param tweet: raw tweet text, tokenized with ``word_tokenize``
    :param lexicon: iterable of words to look for (exact, case-sensitive match)

    :return: list of matching words, in the order they occur in ``lexicon``
    """
    tokens = set(word_tokenize(tweet))
    return [term for term in lexicon if term in tokens]

def obfuscate_swearwords(tweet, lexicon):
    """
    Mask the first vowel of every lexicon word that occurs in a tweet.

    A lexicon word counts as present when it is one of the tweet's tokens
    (``word_tokenize``). Only standalone occurrences are rewritten: the previous
    ``str.replace`` approach also rewrote the word inside longer words
    (e.g. a matched "ass" turned "class" into "cl*ss"), and replacing one word
    at a time let an earlier, shorter word corrupt a later, longer one.

    :param tweet: raw tweet text
    :param lexicon: iterable of words to mask (exact, case-sensitive match)

    :return: the tweet with each standalone matching word passed through
        ``obfuscate_char(word, "aiueo", '*')``; unchanged if nothing matches
    """
    import re  # function-scope import, as the other helpers in this notebook do

    clean_tweet = set(word_tokenize(tweet))
    found = [l for l in lexicon if l in clean_tweet]
    if not found:
        return tweet

    # Longest alternatives first, so a longer word wins over a shorter one
    # starting at the same position.
    alternatives = '|'.join(
        re.escape(term) for term in sorted(found, key=len, reverse=True)
    )
    # Lookarounds require the match not to be glued to other word characters.
    whole_word = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')

    # Single pass: already-masked text is never scanned again. The callable
    # replacement also avoids backslash processing of the masked text.
    return whole_word.sub(
        lambda match: obfuscate_char(match.group(0), "aiueo", '*'), tweet
    )

In [24]:
# Annotate every tweet with: whether it contains a lexicon word, which words
# were found, and the tweet text with those words masked. Then export.
tweets['found'] = tweets['tweet'].apply(contains_swearwords, args=(swearwords,))
tweets['sw_found'] = tweets['tweet'].apply(return_sw, args=(swearwords,))
tweets['obf'] = tweets['tweet'].apply(obfuscate_swearwords, args=(swearwords,))
tweets.to_csv('./test.csv', index=False)