Hate Speech Filtering

In [None]:
# Libraries and instances

# Web scraper
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


# NLTK
import re 
import csv
import nltk
from collections import Counter
from nltk.corpus import wordnet
tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# ML libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
logReg = LogisticRegression()



In [None]:
# Headless Chrome session shared by the scraping helpers and the crawl loop
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # run Chrome without opening a window
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)

# Crawl frontier: the start URL typed by the user is the only page queued at first
baseUrl = input("Enter url of website: ")
unvisited = {baseUrl}
visited = set()

def getLink(pageDriver, baseUrl):
    """Collect the links on the driver's current page that contain `baseUrl`.

    Args:
        pageDriver: Selenium driver whose currently loaded page is inspected.
        baseUrl: Substring an href must contain to be kept.

    Returns:
        List of href strings in the order the anchors were returned;
        duplicates are kept.
    """
    # Query the driver that was passed in (not a module-level one) so the
    # function works with whichever driver instance the caller supplies.
    anchors = pageDriver.find_elements_by_tag_name('a')

    hrefs = (anchor.get_attribute('href') for anchor in anchors)

    # Anchors without an href yield None/'' and are dropped
    return [href for href in hrefs if href and baseUrl in href]

def getText(pageDriver):
    """Return the `.text` of the element found at XPath ``/html/body``.

    Args:
        pageDriver: Selenium driver whose currently loaded page is read.

    Returns:
        The element's text, or "" when the lookup returns a falsy value.
    """
    body = pageDriver.find_element_by_xpath("/html/body")
    return body.text if body else ""

# Running id for the scraped rows written to test.csv.
# NOTE(review): 31963 presumably continues after the ids used in train.csv -- confirm.
ct = 31963
try:
    # Open the output once for the whole crawl so rows from every page land in
    # the same file; newline='' is what the csv module expects for its files.
    with open('test.csv', 'w', encoding='utf-8', newline='') as csvfile:
        fieldnames = ['id', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        while unvisited:
            # Move the next url from the frontier to the visited set
            page = unvisited.pop()
            visited.add(page)

            # Load the page
            driver.get(page)

            # Queue the in-site links that have not been crawled yet.
            # The '#fragment' part is dropped so anchors into an already
            # crawled page do not cause it to be scraped again.
            links = getLink(driver, baseUrl)
            for link in links:
                target = link.split('#', 1)[0]
                if target and target not in visited:
                    unvisited.add(target)

            # One CSV row per non-blank line of page text
            pageText = getText(driver)
            for line in pageText.splitlines():
                if not line.strip():
                    # a blank row would be read back as a missing value
                    continue
                writer.writerow({'id': ct, 'text': line})
                ct += 1
finally:
    # Always shut the browser down, even if a page fails to load
    driver.quit()

print("WEB SCRAPING FINISHED")

In [None]:
# Load the labelled training rows and the scraped rows into pandas DataFrames
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')


In [None]:
# Stack the training rows and the test rows into one frame with a fresh
# 0..n-1 index (training rows first). pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
allTexts = pd.concat([train, test], ignore_index=True, sort=False)

allTexts[31980:31990]


In [None]:
def clean_texts(text, pattern):
    """Remove every match of the regular expression `pattern` from `text`.

    Args:
        text: String to clean. Any non-string value (for example the NaN
            pandas produces for an empty CSV cell) is treated as empty text.
        pattern: Regular expression whose matches are deleted.

    Returns:
        `text` with all matches removed, or '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    # Substitute with the pattern itself. Re-using each matched substring as
    # a new regex would misread metacharacters and would leave residue when
    # one match is a prefix of another (e.g. '@user' inside '@username').
    return re.sub(pattern, '', text)

In [None]:
# Build the 'tidy_text' column from the raw 'text' column.
# Strip '@' handles; the raw string keeps \w a regex escape instead of an
# (invalid) Python string escape.
allTexts['tidy_text'] = np.vectorize(clean_texts)(allTexts['text'], r"@[\w]*")

# Replace every non-letter character with a space. regex=True is passed
# explicitly because the default of Series.str.replace became literal
# matching in pandas 2.0.
allTexts['tidy_text'] = allTexts['tidy_text'].str.replace("[^a-zA-Z]", " ", regex=True)

allTexts[31990:32000]


In [None]:
# Pick the part-of-speech tag used to lemmatize a word
def get_pos(word):
    """Return the most frequent POS tag among the word's WordNet synsets.

    Only the tags "n" (noun), "v" (verb), "a" (adjective) and "r" (adverb)
    are considered; synsets with any other tag are ignored. When several
    tags are equally frequent (including when the word has no synsets at
    all) the first one in the order n, v, a, r is returned.
    """
    tally = Counter(synset.pos() for synset in wordnet.synsets(word))
    # max() keeps the first of several equally large candidates, which
    # gives the n, v, a, r precedence on ties
    return max(("n", "v", "a", "r"), key=lambda tag: tally[tag])

# Tokenize a text and lemmatize each token with the tag chosen by get_pos
def lemmatize_text(text):
    """Return the list of lemmas for the tokens of `text`, in token order."""
    lemmas = []
    for token in tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, get_pos(token)))
    return lemmas

In [None]:
# Lemmatize every cleaned text; each entry of nltkText is a list of lemmas
nltkText = allTexts['tidy_text'].apply(lemmatize_text)
# Re-assemble each lemma list into a single space-separated string
allTexts['tidy_text'] = [' '.join(str(lemma) for lemma in lemmas) for lemmas in nltkText]

In [None]:
# Bag-of-words vectorizer: English stop words removed, document-frequency
# bounds max_df=0.5 / min_df=2, vocabulary capped at 5000 features
bow_vectorizer = CountVectorizer(max_df=0.5, min_df=2, max_features=5000, stop_words='english')

In [None]:
# Feature extraction - bag-of-words (sklearn CountVectorizer).
# The vectorizer is fitted on the combined train + test texts, so the
# resulting matrix has one row per row of allTexts.
# NOTE(review): because the test texts take part in the fit, the vocabulary
# depends on the scraped data -- confirm this is intended.
bow = bow_vectorizer.fit_transform(allTexts['tidy_text'])

In [None]:
# Split the feature matrix back into its training and testing rows.
# allTexts holds the training rows first, so the boundary is len(train);
# deriving it keeps the slices right if train.csv changes size.
trainingBow = bow[:len(train), :]
testingBow = bow[len(train):, :]

# Hold out 10% of the training rows for validation.
#   xtrainBow / ytrain       features and labels used for fitting
#   xtrainValidBow / yvalid  features and labels used for validation
# random_state pins the shuffle so the split, and the scores computed from
# it, are the same on every run.
xtrainBow, xtrainValidBow, ytrain, yvalid = train_test_split(
    trainingBow, train['label'], test_size=0.1, random_state=42
)


In [None]:
# Bag-of-words features + logistic regression
logReg.fit(xtrainBow, ytrain)

# Predict class probabilities on the validation set
predValid = logReg.predict_proba(xtrainValidBow)
# Predict 1 when the probability in column 1 is at least 0.25, otherwise 0
predValidNum = predValid[:, 1] >= 0.25
# Built-in int: the np.int alias was removed in NumPy 1.24
predValidNum = predValidNum.astype(int)
# F1 score on the validation set
f1_score(yvalid, predValidNum)

In [None]:
# Predict on the testing rows with the BoW logistic regression
predTest = logReg.predict_proba(testingBow)
# Same 0.25 probability threshold as used on the validation set
predTestNum = predTest[:, 1] >= 0.25
# Built-in int: the np.int alias was removed in NumPy 1.24
predTestNum = predTestNum.astype(int)
test['predicted_label'] = predTestNum
sub = test[['id', 'predicted_label']]
# Write id + predicted label to a CSV file
sub.to_csv('logReg_bow.csv', index=False)

In [None]:
# Bag-of-words features + RBF-kernel SVM
svm = SVC(kernel='rbf', gamma=0.1, C=11)
svm.fit(xtrainBow, ytrain)
# Predict labels on the validation set
predValid = svm.predict(xtrainValidBow)
# Built-in int: the np.int alias was removed in NumPy 1.24
predValidNum = predValid.astype(int)

# F1 score on the validation set
f1_score(yvalid, predValidNum)

In [None]:
# Predict on the testing rows with the BoW SVM
predTest = svm.predict(testingBow)
# Built-in int: the np.int alias was removed in NumPy 1.24
predTestNum = predTest.astype(int)
test['predicted_label'] = predTestNum
sub = test[['id', 'predicted_label']]
# Write id + predicted label to a CSV file
sub.to_csv('svm_bow.csv', index=False)

In [None]:
# TF-IDF vectorizer configured like the bag-of-words one: English stop words
# removed, document-frequency bounds max_df=0.5 / min_df=2, at most 5000 features
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=5000, stop_words='english')

In [None]:

# TF-IDF features - weight each term by how often it occurs in a document
# and how rare it is across documents.
# TF = (number of times term t appears in a document) / (number of terms in the document)
# IDF = log(N/n), where N is the number of documents and n is the number of documents containing term t.
# TF-IDF = TF * IDF
# NOTE(review): the above is the textbook definition; TfidfVectorizer's
# default settings may weight terms differently -- check the scikit-learn docs.
# The vectorizer is fitted on the combined train + test texts.
tfidf = tfidf_vectorizer.fit_transform(allTexts['tidy_text'])

In [None]:
# Split the TF-IDF matrix back into its training and testing rows.
# allTexts holds the training rows first, so the boundary is len(train).
trainTFIDF = tfidf[:len(train), :]
testTFIDF = tfidf[len(train):, :]
# Re-use the BoW train/validation split: pick the TF-IDF rows whose index
# matches the rows of the label vectors ytrain and yvalid
xtrainTFIDF = trainTFIDF[ytrain.index]
xvalidTFIDF = trainTFIDF[yvalid.index]

In [None]:
# TF-IDF features + logistic regression (re-fits the shared logReg instance)
logReg.fit(xtrainTFIDF, ytrain)
# Predict class probabilities on the validation set
predValid = logReg.predict_proba(xvalidTFIDF)
# Predict 1 when the probability in column 1 is at least 0.25, otherwise 0
predValidNum = predValid[:, 1] >= 0.25
# Built-in int: the np.int alias was removed in NumPy 1.24
predValidNum = predValidNum.astype(int)

# F1 score on the validation set
f1_score(yvalid, predValidNum)

In [None]:
# Predict on the testing rows with the TF-IDF logistic regression
predTest = logReg.predict_proba(testTFIDF)
# NOTE(review): the threshold here is 0.20 while the validation score for
# this model is computed at 0.25 -- confirm which value is intended.
predTestNum = predTest[:, 1] >= 0.20
# Built-in int: the np.int alias was removed in NumPy 1.24
predTestNum = predTestNum.astype(int)
# NOTE(review): the column is 'predicted-label' here but 'predicted_label'
# in the BoW cells, so `test` ends up with both columns.
test['predicted-label'] = predTestNum
sub = test[['id', 'predicted-label']]
# Write id + predicted label to a CSV file
sub.to_csv('logReg_td-idf.csv', index=False)

In [None]:
# TF-IDF features + RBF-kernel SVM
svm = SVC(kernel='rbf', gamma=0.1, C=11)
svm.fit(xtrainTFIDF, ytrain)
# Predict labels on the validation set
predValid = svm.predict(xvalidTFIDF)
# Built-in int: the np.int alias was removed in NumPy 1.24
predValidNum = predValid.astype(int)

# F1 score on the validation set
f1_score(yvalid, predValidNum)

In [None]:
# Predict on the testing rows with the TF-IDF SVM
predTest = svm.predict(testTFIDF)
# Built-in int: the np.int alias was removed in NumPy 1.24
predTestNum = predTest.astype(int)
test['predicted-label'] = predTestNum
sub = test[['id', 'predicted-label']]
# Write id + predicted label to a CSV file
sub.to_csv('svm_td-idf.csv', index=False)