## Cyberbullying Detection: A Machine Learning Approach

In [1]:
#imports

import pandas as pd 
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np

### Step 1: Load the Dataset 

In [None]:
# Read the raw tweet dataset (expected in the working directory) into a DataFrame.
dataset_path = "cyberbullying_tweets.csv"
df = pd.read_csv(dataset_path)

### Step 2 : Data Preprocessing 

In [None]:
#to convert uppercase to lowercase characters
def lower_word(t):
    """Return ``t`` with every uppercase character converted to lowercase."""
    return t.lower()

In [None]:
#remove usernames, url and non utf8/ascii characters 
def rem_url(t):
    """Strip @mentions, http(s) URLs and non-ASCII characters from ``t``.

    A mention or URL is removed up to the next whitespace character; the
    surrounding whitespace itself is left in place.
    """
    without_handles_or_links = re.sub(r'(?:\@|https?\://)\S+', '', t)
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_handles_or_links)
    return ascii_only

In [None]:
#to remove punctuation 
def rem_punc(t):
    """Delete every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', t)

In [None]:
#to break the sentence into tokens
from nltk.tokenize import word_tokenize

def create_token(t):
    """Tokenize ``t`` with ``word_tokenize`` and return the tokens joined by single spaces."""
    tokens = word_tokenize(t)
    return " ".join(tokens)

In [None]:
#split strings into list and join as string 
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list, drop 'not' from it (presumably so
# that negations survive cleaning), and add a few corpus-specific tokens.
stop_words = stopwords.words('english')
stop_words.remove('not')
# NOTE(review): 'httpâ' and 'tvwâ' contain a non-ASCII character. cleanData()
# runs rem_url() (which strips non-ASCII characters) before rem_stopword(), so
# these two entries can never match on that path - confirm whether the ASCII
# forms 'http' and 'tvw' were intended.
stop_words.extend(['rt', 'mkr', 'httpâ', 'tvwâ', 'etc'])

# Frozen snapshot used for membership tests: a set lookup is constant-time,
# whereas scanning the list for every token of every tweet is linear.
# The list itself is kept so existing references to `stop_words` still work.
_STOP_WORD_SET = frozenset(stop_words)

def rem_stopword(t):
    """Return ``t`` with every stop word removed.

    ``t`` is split on whitespace; tokens present in the stop-word collection
    built above are dropped and the remaining tokens are joined with single
    spaces.
    """
    return " ".join(word for word in t.split() if word not in _STOP_WORD_SET)

In [None]:
# WORDNET LEMMATIZER (with appropriate pos tags)
from nltk.stem import wordnet 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def lemma_postag(t):
    """Lemmatize each token of ``t`` according to its part-of-speech tag.

    The text is tokenized and tagged with NLTK. A tag starting with J, V, N
    or R selects the WordNet adjective, verb, noun or adverb category for the
    lemmatizer; a token with any other tag is kept unchanged. The resulting
    tokens are returned joined by single spaces.
    """
    # First letter of the NLTK tag -> name of the matching constant on `wordnet`.
    wordnet_attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, nltk_tag in nltk.pos_tag(nltk.word_tokenize(t)):
        attr_name = wordnet_attr_by_prefix.get(nltk_tag[:1])
        if attr_name is None:
            # No usable category for this tag: keep the token as it is.
            lemmas.append(token)
        else:
            lemmas.append(lemmatizer.lemmatize(token, getattr(wordnet, attr_name)))

    return " ".join(lemmas)


### Step 3: Sentiment Analysis 

#### To label the data into their respective categories

In [None]:
#comparing TextBlob with VADER
from textblob import TextBlob

#TextBlob
def getPolarity_TB(t):
    """Return the TextBlob sentiment polarity of the text ``t``."""
    return TextBlob(t).sentiment.polarity

In [None]:
def getLabel(polarity):
    """Map a polarity score to 'Negative', 'Neutral' or 'Positive'.

    Scores below zero are 'Negative', a score of exactly zero is 'Neutral',
    and anything else is 'Positive'.
    """
    if polarity == 0:
        return 'Neutral'
    return 'Negative' if polarity < 0 else 'Positive'

#### Test: find abusive words in a DataFrame

In [None]:
#clean the data
def cleanData(x):
    """Run the full cleaning pipeline on one piece of text and return the result.

    Steps, in order: lowercase, remove mentions/URLs/non-ASCII characters,
    remove punctuation, tokenize, remove stop words, lemmatize.
    """
    pipeline = (lower_word, rem_url, rem_punc, create_token, rem_stopword, lemma_postag)
    cleaned = x
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

#### Test on actual data 

In [None]:
# Work on a frame that holds only the raw tweet text, and store the cleaned
# version of every tweet next to it.
testData2 = pd.DataFrame(df['tweet_text'])
testData2['clean'] = testData2['tweet_text'].apply(cleanData)
testData2.head()

In [None]:
# Split each cleaned tweet into a list of tokens.
testData2['tokenized'] = testData2['clean'].apply(word_tokenize)
testData2.head()

In [None]:
# Save the token lists to disk so they can be inspected outside the notebook.
tokenized_column = testData2['tokenized']
tokenized_column.to_csv("tokenized_words.csv")

In [None]:
# Score every token of every tweet with TextBlob and bucket it by the sign of
# its polarity.
abusiveWords2 = []   # polarity < 0
positiveWords2 = []  # polarity > 0
neutralWords2 = []   # polarity == 0

for tokens in testData2['tokenized']:
    for word in tokens:
        polarity = getPolarity_TB(word)
        # The negative test must be strict: with `<= 0.0` the `== 0.0` branch
        # below is unreachable and every neutral word is filed as abusive.
        # Strict comparison also matches the thresholds used by getLabel().
        if polarity < 0.0:
            abusiveWords2.append(word)
        elif polarity == 0.0:
            neutralWords2.append(word)
        else:
            positiveWords2.append(word)


In [None]:
# Drop repeated abusive words; dict keys keep the first-seen order.
final_abusiveWords2 = [*dict.fromkeys(abusiveWords2)]
print(final_abusiveWords2)

In [None]:
# Report the abusive-word count before and after de-duplication.
raw_count = len(abusiveWords2)
unique_count = len(final_abusiveWords2)
print("The length of the orginal list of abusive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique abusive words to a text file, one word per line.
with open(r'abusiveWordsTEXTBLOB.txt', 'w') as filePath:
    filePath.writelines("%s\n" % a for a in final_abusiveWords2)
    print('Done')

In [None]:
# Drop repeated positive words; dict keys keep the first-seen order.
final_positiveWords2 = [*dict.fromkeys(positiveWords2)]
print(final_positiveWords2)

In [None]:
# Report the positive-word count before and after de-duplication.
raw_count = len(positiveWords2)
unique_count = len(final_positiveWords2)
print("The length of the orginal list of positive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique positive words to a text file, one word per line.
with open(r'positiveWordsTEXTBLOB.txt', 'w') as filePath:
    filePath.writelines("%s\n" % a for a in final_positiveWords2)
    print('Done')

In [None]:
# Drop repeated neutral words; dict keys keep the first-seen order.
final_neutralWords2 = [*dict.fromkeys(neutralWords2)]
print(final_neutralWords2)

In [None]:
# Report the neutral-word count before and after de-duplication.
raw_count = len(neutralWords2)
unique_count = len(final_neutralWords2)
print("The length of the orginal list of neutral words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique neutral words to a text file, one word per line.
with open(r'neutralWordsTEXTBLOB.txt', 'w') as filePath:
    filePath.writelines("%s\n" % a for a in final_neutralWords2)
    print('Done')

#### Try using VADER

Since processing every row in one go takes a very long time, the rows are split into five ranges: (1 - 10000), (10001 - 20000), (20001 - 30000), (30001 - 40000), and (40001 - 47692). Within each range, every word of each tweet is assigned to one of three categories: abusive words, positive words, or neutral words.

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared analyzer, created on first use. getCompound() is called once per word
# over the whole corpus, so building a new analyzer on every call is avoidable
# repeated work.
_vader_analyzer = None

def getCompound(sentence):
    """Return the VADER ``compound`` score for ``sentence``.

    The ``SentimentIntensityAnalyzer`` is created on the first call and reused
    afterwards (assumes ``polarity_scores`` keeps no state between calls -
    confirm against the vaderSentiment documentation).
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer.polarity_scores(sentence)['compound']

In [None]:
# Bucket every token of tweets 1 - 10000 (1-based, as in the section text) by
# its VADER compound score.
abusiveWords4 = []   # compound <= -0.05
positiveWords4 = []  # compound >= 0.05
neutralWords4 = []   # everything in between

# Positions 0..9999 are exactly the first 10,000 rows. The counter-based loop
# used before took its length from the slice [1:10000] (9,999 rows) while
# starting at row 0, so the row at position 9999 was never scored.
for tokens in testData2['tokenized'].iloc[0:10000]:
    for word in tokens:
        polarity = getCompound(word)
        if polarity >= 0.05:
            positiveWords4.append(word)
        elif polarity <= -0.05:
            abusiveWords4.append(word)
        else:
            neutralWords4.append(word)


In [None]:
# Drop repeated abusive words; dict keys keep the first-seen order.
final_abusiveWords4 = [*dict.fromkeys(abusiveWords4)]
print(final_abusiveWords4)

In [None]:
# Report the abusive-word count before and after de-duplication.
raw_count = len(abusiveWords4)
unique_count = len(final_abusiveWords4)
print("The length of the orginal list of abusive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique abusive words of this range to a text file, one word per line.
with open(r'abusiveWordsVADER1.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in final_abusiveWords4)
    print('Done')

In [None]:
# Drop repeated positive words; dict keys keep the first-seen order.
final_positiveWords4 = [*dict.fromkeys(positiveWords4)]
print(final_positiveWords4)

In [None]:
# Report the positive-word count before and after de-duplication.
raw_count = len(positiveWords4)
unique_count = len(final_positiveWords4)
print("The length of the orginal list of positive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique positive words of this range to a text file, one word per line.
with open(r'positiveWordsVADER1.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in final_positiveWords4)
    print('Done')

In [None]:
# Drop repeated neutral words; dict keys keep the first-seen order.
final_neutralWords4 = [*dict.fromkeys(neutralWords4)]
print(final_neutralWords4)

In [None]:
# Report the neutral-word count before and after de-duplication.
raw_count = len(neutralWords4)
unique_count = len(final_neutralWords4)
print("The length of the orginal list of neutral words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique neutral words of this range to a text file, one word per line.
with open(r'neutralWordsVADER1.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in final_neutralWords4)
    print('Done')

#### 10001 - 20000

In [None]:
# Bucket every token of tweets 10001 - 20000 (1-based, as in the section
# heading) by its VADER compound score.
abusiveWords5 = []   # compound <= -0.05
positiveWords5 = []  # compound >= 0.05
neutralWords5 = []   # everything in between

# 1-based rows 10001..20000 are positions 10000..19999. The counter-based loop
# used before started at position 10001, so the row at position 10000 was
# never scored.
for tokens in testData2['tokenized'].iloc[10000:20000]:
    for word in tokens:
        polarity = getCompound(word)
        if polarity >= 0.05:
            positiveWords5.append(word)
        elif polarity <= -0.05:
            abusiveWords5.append(word)
        else:
            neutralWords5.append(word)


In [None]:
# Show the token lists selected by the slice 10001:20000.
chunk_preview = testData2['tokenized'][10001:20000]
print(chunk_preview)

In [None]:
# Drop repeated abusive words; dict keys keep the first-seen order.
final_abusiveWords5 = [*dict.fromkeys(abusiveWords5)]
print(final_abusiveWords5)

In [None]:
# Report the abusive-word count before and after de-duplication.
raw_count = len(abusiveWords5)
unique_count = len(final_abusiveWords5)
print("The length of the orginal list of abusive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique abusive words of this range to a text file, one word per line.
with open(r'abusiveWordsVADER2.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in final_abusiveWords5)
    print('Done')

#### 20001 - 30000

In [None]:
# Bucket every token of tweets 20001 - 30000 (1-based, as in the section
# heading) by its VADER compound score.
abusiveWords6 = []   # compound <= -0.05
positiveWords6 = []  # compound >= 0.05
neutralWords6 = []   # everything in between

# 1-based rows 20001..30000 are positions 20000..29999. The counter-based loop
# used before started at position 20001, so the row at position 20000 was
# never scored.
for tokens in testData2['tokenized'].iloc[20000:30000]:
    for word in tokens:
        polarity = getCompound(word)
        if polarity >= 0.05:
            positiveWords6.append(word)
        elif polarity <= -0.05:
            abusiveWords6.append(word)
        else:
            neutralWords6.append(word)


In [None]:
# Drop repeated abusive words; dict keys keep the first-seen order.
final_abusiveWords6 = [*dict.fromkeys(abusiveWords6)]
print(final_abusiveWords6)

In [None]:
# Report the abusive-word count before and after de-duplication.
raw_count = len(abusiveWords6)
unique_count = len(final_abusiveWords6)
print("The length of the orginal list of abusive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique abusive words of this range to a text file, one word per line.
with open(r'abusiveWordsVADER3.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in final_abusiveWords6)
    print('Done')

#### 30001 - 40000

In [None]:
# Bucket every token of tweets 30001 - 40000 (1-based, as in the section
# heading) by its VADER compound score.
abusiveWords7 = []   # compound <= -0.05
positiveWords7 = []  # compound >= 0.05
neutralWords7 = []   # everything in between

# 1-based rows 30001..40000 are positions 30000..39999. The counter-based loop
# used before started at position 30001, so the row at position 30000 was
# never scored.
for tokens in testData2['tokenized'].iloc[30000:40000]:
    for word in tokens:
        polarity = getCompound(word)
        if polarity >= 0.05:
            positiveWords7.append(word)
        elif polarity <= -0.05:
            abusiveWords7.append(word)
        else:
            neutralWords7.append(word)


In [None]:
# Drop repeated abusive words; dict keys keep the first-seen order.
final_abusiveWords7 = [*dict.fromkeys(abusiveWords7)]
print(final_abusiveWords7)

In [None]:
# Report the abusive-word count before and after de-duplication.
raw_count = len(abusiveWords7)
unique_count = len(final_abusiveWords7)
print("The length of the orginal list of abusive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique abusive words of this range to a text file, one word per line.
with open(r'abusiveWordsVADER4.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in final_abusiveWords7)
    print('Done')

#### 40001 - 47692

In [None]:
# Bucket every token of tweets 40001 onwards (1-based, as in the section
# heading) by its VADER compound score.
abusiveWords8 = []   # compound <= -0.05
positiveWords8 = []  # compound >= 0.05
neutralWords8 = []   # everything in between

# 1-based row 40001 is position 40000. The counter-based loop used before
# started at position 40001, so the row at position 40000 was never scored.
for tokens in testData2['tokenized'].iloc[40000:]:
    for word in tokens:
        polarity = getCompound(word)
        if polarity >= 0.05:
            positiveWords8.append(word)
        elif polarity <= -0.05:
            abusiveWords8.append(word)
        else:
            neutralWords8.append(word)


In [None]:
# Drop repeated abusive words; dict keys keep the first-seen order.
final_abusiveWords8 = [*dict.fromkeys(abusiveWords8)]
print(final_abusiveWords8)

In [None]:
# Report the abusive-word count before and after de-duplication.
raw_count = len(abusiveWords8)
unique_count = len(final_abusiveWords8)
print("The length of the orginal list of abusive words:", raw_count)
print("After removing the duplicated words, this is the length of the new list:", unique_count)

In [None]:
# Save the unique abusive words of this range to a text file, one word per line.
with open(r'abusiveWordsVADER5.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in final_abusiveWords8)
    print('Done')

#### Combine all 5 abusive text file into one 

All the words were copied into a text file called "abusiveWordsVADER-full", and Excel was used to remove the duplicated values. After removing the duplicates, each remaining entry is checked to confirm that it is an abusive word, and the list is then converted back to a text file.

In [2]:
#to load the corpus

import nltk.data
# Load the curated abusive-word file in 'raw' format; as the last expression
# of the cell, the loaded content is displayed.
abusive_corpus_path = 'nltk_data/corpora/dataset/AbusiveWords-ver1.txt'
nltk.data.load(abusive_corpus_path, format='raw')



In [4]:
import re
import string
from nltk.corpus.reader import WordListCorpusReader


def _write_lines(path, items):
    """Write every element of ``items`` to ``path`` on its own line, then print 'Done'."""
    with open(path, 'w') as fp:
        fp.writelines("%s\n" % item for item in items)
        print('Done')


# Read the curated abusive-word list relative to the working directory.
# The file id uses forward slashes (as the nltk.data.load call in the previous
# cell does): the '\\' separators used before only resolve on Windows.
w = WordListCorpusReader('.', ['nltk_data/corpora/dataset/AbusiveWords-ver1.txt'])
wordList = w.words()

# All entries joined into one space-separated string so that the case
# conversions below run once over the whole list.
# NOTE(review): because the result is split on " " again, an entry that itself
# contains a space ends up as several lines in the output files - confirm
# this is intended.
wordString = " ".join(wordList)

# Title-case variant of every word (first letter capitalised).
caps = wordString.title()
caps_list = list(caps.split(" "))
_write_lines(r'capsList.txt', caps_list)

# Upper-case variant of every word.
upper = wordString.upper()
upper_list = list(upper.split(" "))
_write_lines(r'upperList.txt', upper_list)
    

Done
Done


All the files (capsList.txt, upperList.txt, AbusiveWords-ver1) are combined to produce the final list of abusive words, called AbusiveWords (final).txt.