## Cyberbullying Detection: A Machine Learning Approach

In [None]:
#imports

import pandas as pd 
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np
import sklearn

### Step 1: Load the Dataset 

In [None]:
# Load the raw tweets; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("cyberbullying_tweets.csv")

### Visualize the data

In [None]:
# Show up to 150 characters per cell so long tweets are not cut off early
pd.set_option('display.max_colwidth',150)

# Preview the first rows of the dataset
df.head()

In [None]:
# Preview the last rows of the dataset
df.tail()

In [None]:
# Remove the cyberbullying_type column from df in place, then preview the result
df.drop(columns=["cyberbullying_type"], inplace=True)
df.head()

In [None]:
# Count the missing values in each column

df.isna().sum()

### Step 2 : Data Preprocessing 

In [None]:
def lower_word(t):
    """Return the string *t* with all uppercase characters lowercased."""
    # str.lower() already returns a complete string, so there is nothing to
    # re-join character by character.
    return t.lower()

# Store a lowercased version of every tweet in a new column
df['lowercased'] = df['tweet_text'].apply(lower_word)

In [None]:
def rem_url(t):
    """Strip @mentions, http(s) URLs and non-ASCII characters from *t*.

    Everything from an ``@`` or an ``http://`` / ``https://`` prefix up to the
    next whitespace is removed. Because the ``@`` may appear anywhere, the
    part of an e-mail address from the ``@`` onwards is removed as well.
    Surrounding whitespace is kept, so removed items can leave double spaces.
    """
    # re.sub already returns a string; no join is needed around it.
    without_mentions_and_urls = re.sub(r'(?:\@|https?\://)\S+', '', t)
    # Drop every character outside the 7-bit ASCII range (emoji, accents, ...).
    return re.sub(r'[^\x00-\x7f]', r'', without_mentions_and_urls)

# Clean mentions, URLs and non-ASCII characters out of the lowercased text
df['no_url_and_username'] = df['lowercased'].apply(rem_url)

In [None]:
def rem_punc(t):
    """Remove every character of *t* that is neither a word character nor whitespace.

    Underscores count as word characters for the regex and are therefore kept.
    """
    # re.sub already returns a string; no join is needed around it.
    return re.sub(r'[^\w\s]', '', t)

# Strip punctuation from the text that is already free of URLs and mentions
df['no_punctuation'] = df['no_url_and_username'].apply(rem_punc)

In [None]:
#to break the sentence into tokens
from nltk.tokenize import word_tokenize

def create_token(t):
    """Tokenize *t* with NLTK's word_tokenize and return the tokens joined by single spaces."""
    tokens = word_tokenize(t)
    return " ".join(tokens)
    
# Tokenize the punctuation-free text (stored back as a space-separated string)
df['tokenized'] = df['no_punctuation'].apply(create_token)

In [None]:
#split strings into list and join as string 
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list
stop_words = stopwords.words('english')
# Take 'not' out of the list so that it is NOT filtered from the tweets
stop_words.remove('not')
# Extra tokens to filter out in addition to the standard list.
# NOTE(review): 'httpâ' and 'tvwâ' contain a non-ASCII character, but rem_url
# strips non-ASCII characters before this step runs, so these two entries
# presumably never match - confirm whether 'http' / 'tvw' were intended.
stop_words.extend(['rt', 'mkr', 'httpâ', 'tvwâ', 'etc'])

def rem_stopword(t):
    """Return *t* without the whitespace-separated words found in the module-level ``stop_words`` list."""
    kept_words = (word for word in t.split() if word not in stop_words)
    return " ".join(kept_words)

# Filter the stop words out of the tokenized text
df['no_stopwords'] = df['tokenized'].apply(rem_stopword)

In [None]:
# (rows, columns) of df after the preprocessing steps above
df.shape

In [None]:
# Count rows whose cleaned text repeats an earlier row. The check is keyed on
# 'no_stopwords' - the column the de-duplication step uses - because a
# whole-row comparison also compares the raw tweet and therefore misses tweets
# that only become identical after cleaning.
df.duplicated("no_stopwords").sum()

In [None]:
# Remove rows whose cleaned text ('no_stopwords') duplicates another row; df is modified in place
df.drop_duplicates(subset="no_stopwords", inplace=True)

In [None]:
# Verify the de-duplication on the column it was keyed on ('no_stopwords');
# the expected count is 0.
df.duplicated("no_stopwords").sum()

In [None]:
# WORDNET LEMMATIZER (with appropriate pos tags)
from nltk.stem import wordnet 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def lemma_postag(t):
    """Lemmatize every token of *t* according to its part-of-speech tag.

    The text is tokenized and tagged with NLTK. Tokens tagged as adjective,
    verb, noun or adverb are lemmatized with the matching WordNet POS; every
    other token is kept as it is.

    Returns the resulting tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()

    def pos_tagger(nltk_tag):
        """Translate an NLTK tag into a WordNet POS constant, or None if unmapped."""
        # Only the leading letter of the tag decides the WordNet category.
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        if nltk_tag.startswith('V'):
            return wordnet.VERB
        if nltk_tag.startswith('N'):
            return wordnet.NOUN
        if nltk_tag.startswith('R'):
            return wordnet.ADV
        return None

    def lemmatize_token(token, nltk_tag):
        """Lemmatize one token, or return it untouched when its tag has no WordNet POS."""
        wordnet_pos = pos_tagger(nltk_tag)
        if wordnet_pos is None:
            return token
        return lemmatizer.lemmatize(token, wordnet_pos)

    # Tokenize the sentence and attach a POS tag to every token.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(t))

    return " ".join(lemmatize_token(token, nltk_tag) for token, nltk_tag in tagged_tokens)


In [None]:
# POS-tag-aware lemmatization is the variant that was chosen after analysis
df['lemmatization_postag'] = df['no_stopwords'].apply(lemma_postag)

In [None]:
# Preview df with all intermediate preprocessing columns
df.head()

In [None]:
# Cleaning is finished: drop the intermediate preprocessing columns and give
# the raw and the lemmatized text columns clearer names.
clean_data = (
    df.drop(columns=['lowercased', 'no_url_and_username', 'no_punctuation', 'no_stopwords', 'tokenized'])
      .rename(columns={"lemmatization_postag": "cleaned_tweet", "tweet_text": "original_tweet"})
)

clean_data.head()

### Step 3: Sentiment Analysis 

#### To label the data into their respective categories

In [None]:

from textblob import TextBlob

def getPolarity_TB(t):
    """Return the TextBlob sentiment polarity of the text *t*."""
    return TextBlob(t).sentiment.polarity

In [None]:
# Work on an independent (deep) copy so that clean_data stays untouched
text_blob = clean_data.copy()

# Sentiment polarity of the raw, uncleaned tweet
text_blob['TextBlob_polarity_originalTweet'] = text_blob['original_tweet'].apply(getPolarity_TB)

In [None]:
# Sentiment polarity of the cleaned (lemmatized) tweet
text_blob['TextBlob_polarity_cleanedTweet'] = text_blob['cleaned_tweet'].apply(getPolarity_TB)

In [None]:
def getLabel(polarity):
    """Map a polarity score to 'Negative' (< 0), 'Neutral' (== 0) or 'Positive' (anything else)."""
    if polarity < 0:
        return 'Negative'
    if polarity == 0:
        return 'Neutral'
    return 'Positive'

In [None]:
# Sentiment label derived from the polarity of the original tweet
text_blob['TextBlob_label_originalTweet'] = text_blob['TextBlob_polarity_originalTweet'].apply(getLabel)

In [None]:
# Sentiment label derived from the polarity of the cleaned tweet, then preview
text_blob['TextBlob_label_cleanedTweet'] = text_blob['TextBlob_polarity_cleanedTweet'].apply(getLabel)
text_blob.head()

In [None]:
# Bar chart of how many original tweets fall under each sentiment label
text_blob.TextBlob_label_originalTweet.value_counts().plot(kind='bar',title="Sentiment Analysis for Original Tweet")

In [None]:
# Bar chart of how many cleaned tweets fall under each sentiment label
text_blob.TextBlob_label_cleanedTweet.value_counts().plot(kind='bar',title="Sentiment Analysis for Cleaned Tweet")

#### Remove tweets that are labelled as 'Neutral'

In [None]:
# Delete, in place, every row whose cleaned-tweet label is Neutral, then show the new size
text_blob.drop(text_blob.index[text_blob['TextBlob_label_cleanedTweet'] == "Neutral"], inplace=True)
text_blob.shape

In [None]:
# remove it since it will be done in the next line
# text_blob.TextBlob_label_cleanedTweet.value_counts().plot(kind='bar',title="Sentiment Analysis for Cleaned Tweet")

In [None]:
# print(text_blob['TextBlob_label_cleanedTweet'] == "Negative")

In [None]:
# Preparation for the binary target (Negative -> 1, Positive -> 0), which is
# added as a new column in a later step: first drop the raw tweet and the
# polarity/label columns that were derived from it.
cleaned_table = text_blob.drop(columns=['original_tweet','TextBlob_polarity_originalTweet', 'TextBlob_label_originalTweet'])

In [None]:
# Shorten the TextBlob column names to 'polarity' and 'label' (in place)
cleaned_table.rename(columns = {'TextBlob_polarity_cleanedTweet':'polarity',
                              'TextBlob_label_cleanedTweet':'label'}, inplace = True)

In [None]:
# Binary target: 1 = Negative label (cyberbullying),
#                0 = any other label (non-cyberbullying)
cleaned_table['target'] = cleaned_table['label'].apply(lambda label: int(label == "Negative"))

In [None]:
import seaborn as sns 

# Number of rows per target class; x is indexed by the class value (0 / 1)
x=cleaned_table['target'].value_counts()
# Pass x/y as keywords: seaborn 0.12+ no longer accepts them positionally
sns.barplot(x=x.index, y=x).set(title="Updated Table with Negative and Positive Tweets")

# Named 'total' rather than 'sum' so the built-in sum() is not shadowed for
# the rest of the notebook
total = x[0] + x[1]
percentage_positive = (x[0]/total)*100
percentage_negative = (x[1]/total)*100
diff = percentage_negative - percentage_positive

print("Percentage of positive class (0) : {:.2f}%".format(percentage_positive))
print("Percentage of negative class (1) : {:.2f}%".format(percentage_negative))
print("Difference between two classes : {:.2f}%".format(diff))


# The difference between the two classes is small, so the dataset is reasonably balanced.

In [None]:
# Export the labelled table to CSV. Rows were dropped along the way, so the
# index has gaps; write the file with a fresh 0..n-1 index instead of
# renumbering it by hand in a spreadsheet. cleaned_table itself is unchanged.
cleaned_table.reset_index(drop=True).to_csv('cleaned_table.csv')