In [1]:
!pip install --upgrade pyarrow
!pip install --upgrade pandas
from google.colab import drive
from os.path import join
from bs4 import BeautifulSoup as bs
import pandas as pd
import pyarrow.feather as feather
import os
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import string
import re
from nltk.stem.porter import *
import sqlite3

In [None]:
# Build the project path on the mounted Google Drive, mount the drive, and make
# the project folder the working directory for the rest of the notebook.
ROOT = '/content/drive/'            # This is how you get to the "root" folder of your google drive. 
# NOTE(review): the folder name below ends with a space before the slash -- confirm it matches the Drive folder.
BASE = 'My Drive/Umar - Omdena Newsroom /' # This is where you specify the subfolder that is the working folder for this notebook. 
PROJECT_PATH = join(ROOT,BASE)

# Mount Google Drive at ROOT so PROJECT_PATH becomes reachable from the Colab runtime.
drive.mount(ROOT)
# Change the working directory to the project folder.
%cd '{PROJECT_PATH}'

In [None]:
# Open a connection to the project's SQLite database; `conn` is reused by later
# cells for every to_sql / read_sql call.
conn = sqlite3.connect(PROJECT_PATH + 'data/omdena_master.db')
cursor = conn.cursor()
# Sanity check: list the names of the tables currently stored in the database.
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

# Unlabelled Dataset

## Load the feather files into one dataframe.

First we will load the unlabelled dataset provided by Omdena. These are the 13 or so feather files. Each will be loaded up and added to a dataframe. The end result (df_unlab) is a dataset with all the articles in one frame.

In [11]:
# Load the unlabeled data from the Feather files into a single dataframe.

unlab_data_path = PROJECT_PATH + 'data/raw_unlabeled/'

# DataFrame.append was removed in pandas 2.0 (and this notebook upgrades pandas),
# so read every file first and concatenate once instead of growing the frame in
# a loop. The directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the row order (and therefore the article
# index that later tables are joined on) differ between runs.
unlab_frames = [
    pd.read_feather(unlab_data_path + file)
    for file in sorted(os.listdir(unlab_data_path))
]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no files. ignore_index=True yields a fresh 0..n-1 index.
df_unlab = pd.concat(unlab_frames, ignore_index=True) if unlab_frames else pd.DataFrame()
df_unlab.to_csv(PROJECT_PATH + 'data/raw_combined_articles.csv')
#df_unlab.to_sql('raw_dataset',conn)

## Extract text from HTML

In [26]:
# Clean URLS to extract source

def strip_html (html_string):
    """Return the plain text contained in an HTML fragment.

    The first and last characters of ``html_string`` are dropped before
    parsing (presumably wrapper characters left over from the raw export --
    TODO confirm against the Feather files).

    The parser is named explicitly: without it BeautifulSoup picks whichever
    parser happens to be installed, which emits a warning and can make the
    extracted text differ between environments. ``html.parser`` ships with
    Python, so it is always available.
    """
    text = html_string[1:-1]
    return bs(text, 'html.parser').get_text()

def strip_url(string_in,suffix):
    """Reduce an article URL to a short source name (e.g. ``'nytimes'``).

    Parameters
    ----------
    string_in : str
        The article URL.
    suffix : iterable of str
        Domain suffixes to cut off, e.g. ``['.com', '.org', '.co.uk', '.net']``.
        They are applied in the order given.

    Returns
    -------
    str
        The host name without scheme, leading ``www.`` and listed suffix.

    Only the host part of the URL is inspected, and a suffix only counts when
    it ends a host label (it is followed by ``.``, ``:`` or the end of the
    host). Plain substring matching over the whole URL would also cut at
    matches inside the path or inside a label (``www.commondreams.org``
    contains ``.com``).
    """
    # Drop the scheme, then keep only what precedes the path/query/fragment.
    url_string = re.sub(r'^https?://', '', string_in)
    host = re.split(r'[/?#]', url_string, maxsplit=1)[0]

    # Remove "www." only as a leading label, not wherever it occurs.
    if host.startswith('www.'):
        host = host[len('www.'):]

    for suf in suffix:
        match = re.search(re.escape(suf) + r'(?=$|[.:])', host)
        if match:
            host = host[:match.start()]

    return host

# Strip the HTML markup from the raw article fields.
# Series.apply has no axis argument: the extra positional `1` used previously was
# bound to the deprecated `convert_dtype` parameter (removed in newer pandas), so
# it is no longer passed.
df_unlab['title'] = df_unlab['article_title'].apply(strip_html)
df_unlab['subtitle'] = df_unlab['article_subtitle'].apply(strip_html)
df_unlab['text'] = df_unlab['article_text'].apply(strip_html)
df_unlab['author'] = df_unlab['author_name'].apply(strip_html)

# Derive a short source name from each article link.
suffs = ['.com','.org','.co.uk','.net']
df_unlab['source'] = df_unlab['link'].apply(strip_url, args=(suffs,))

# The raw HTML columns are superseded by the cleaned ones above.
df_unlab = df_unlab.drop(columns=['article_title','article_subtitle','article_text','author_name'])

# (A bare `df_unlab.reset_index()` used to sit here; its result was discarded, so
# it had no effect and has been removed.)
df_unlab.to_csv(PROJECT_PATH + 'data/cleaned_unlab.csv')
# The dataframe index is written as the 'index' column, which later cells use to
# read the table back and to join it with other tables.
df_unlab.to_sql('cleaned',conn,if_exists='replace')
conn.commit()

df_unlab.head()


nytimes                    7148
cnn                        7148
breitbart                  7148
bbc                        7148
nypost                     7147
timesofindia.indiatimes    7147
foxnews                    7147
chicagotribune             7147
reuters                    7147
aljazeera                  7147
cbsnews                    7147
businessinsider            7147
nbcnews                    7147
latimes                    7147
nationalreview             7147
newsweek                   7147
theguardian                7146
thesun                     7146
thehill                    7146
telegraph                  7146
independent                7146
vox                        7146
vice                       7146
politico                   7146
washingtontimes            7146
dailymail                  7146
boingboing                 7143
usatoday                   7141
apnews                     7096
dailycaller                6604
wired                      6146
zerohedg

# Hate speech dataset


We will now load the n-grams that were found to be most indicative of hate speech in the paper "Automated Hate Speech Detection and the Problem of Offensive Language" (2017) by Tom Davidson, Dana Warmsley, Michael Macy, and Ingmar Weber, which accompanies the "Hate Speech and Offensive Language" dataset.

Dataset [here](https://data.world/thomasrdavidson/hate-speech-and-offensive-language).

GitHub [here](https://github.com/t-davidson/hate-speech-and-offensive-language).

In [None]:
# Load the hateful n-gram lexicon from CSV and store a copy in the project database.
# NOTE(review): tally_counts_doc reads a global `dict_hateweights` that no visible
# cell defines -- presumably it is built from this frame; confirm and add the cell.
df_hate_words = pd.read_csv(PROJECT_PATH + 'data/raw_other/hateful_ngrams.csv')
# NOTE(review): the table name is spelled 'hate_nrgams' (sic); anything that reads
# the table back has to use the same spelling.
df_hate_words.to_sql('hate_nrgams',conn,if_exists='replace')
conn.commit()

## Vectorize dataset for hatewords


Now we are going to turn our article text data into vectors to get counts of the occurrences of the hateful n-grams in each article. 

A few notes:

Our aim is to bring our dataset into the format that most closely resembles the way in which the authors formatted their own dataset. To that end, we will use the tokenizer they developed for their dataset to tokenize ours in the same way. We will also generate n-grams in the range of 1-4, since that is the n-gram range used by the authors. 

Furthermore, we will pass the "hateful" n-grams listed by the authors into our vectorizer so that it only counts the occurrences of those n-grams instead of attempting to build a full vocabulary of the dataset. The latter is simply too computationally intensive (even for Colab). At the moment, we are only interested in seeing the occurrence of hateful words, so this should be fine. 

After running the CountVectorizer, we will do the following:

1. Gather up all the hits for hateful terms found in each document into one dictionary so we can easily look at what ngrams were found for a given article (instead of having to examine all the 178 or so columns)

2. Calculate a "hate score". This score is the sum, over the hateful n-grams found in a document, of the number of times each n-gram occurs multiplied by the probability that the presence of that n-gram indicates hate speech. These probabilities were generated as the end result of the modeling done by Davidson et al. and are found in the df_hate_words frame. 

3. We will save a whittled down version of the dataframe with the hateful n-grams dataframe to our database, to minimize redundancy and keep the size of the database small. We can later simply merge tables on the index column as needed. We will leave the "link" column in there for sanity checks later on. 

In [None]:
# Davidson's custom tokenizer
# Module-level Porter stemmer instance; tokenize() below calls stemmer.stem on every token.
stemmer = PorterStemmer()
def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens.

    The text is lowercased and every maximal run of ASCII letters becomes one
    token; digits, punctuation and whitespace act as separators.

    The original pattern "[^a-zA-Z]*" can match the empty string. Since
    Python 3.7 ``re.split`` also splits on empty matches, so that pattern
    broke the text into single characters and no n-gram could ever match.
    Matching one-or-more letters restores the intended word-level tokens.
    """
    words = re.findall("[a-zA-Z]+", tweet.lower())
    tokens = [stemmer.stem(t) for t in words]
    return tokens

# Collect the hateful n-grams present in one document row and compute that row's "hatescore"
def tally_counts_doc(row):
    """Add 'hate_score' and 'hate_hits' entries to a row of n-gram counts.

    ``row`` is a Series whose labels are n-grams and whose values are counts.
    Only entries with a count above zero contribute:

    * 'hate_hits'  -- dict mapping each such n-gram to its count
    * 'hate_score' -- sum of ``dict_hateweights[ngram] * count`` over them

    ``dict_hateweights`` is read from the global namespace (it is not defined
    in this cell). The row is modified in place and also returned.
    """
    present = row[row > 0]
    found = dict(present.items())
    total = sum((dict_hateweights[ngram] * count for ngram, count in present.items()), 0)

    row['hate_score'] = total
    row['hate_hits'] = found
    return row

In [None]:
# Instantiate the vectorizer with the hateful n-grams as a fixed vocabulary, so
# only those n-grams are counted instead of building a vocabulary from the corpus.
unlab_cvect = CountVectorizer(
    ngram_range=(1,4),
    tokenizer = tokenize,
    vocabulary = list(df_hate_words['ngram'])
)


unlab_vectors = unlab_cvect.fit(df_unlab['text'])
unlab_matrix = unlab_vectors.transform(df_unlab['text'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is
# its replacement. toarray() yields a plain ndarray rather than the legacy
# np.matrix returned by todense().
df_vectors = pd.DataFrame(unlab_matrix.toarray(),columns=unlab_vectors.get_feature_names_out())

# Gather hate counts and calculate hate scores (one row per article).
df_vectors = df_vectors.apply(tally_counts_doc,axis=1,result_type='expand')
# Column-wise concat aligns on the index, so df_unlab must carry the same 0..n-1
# index as df_vectors (it does when it comes straight from the loading cell).
df_hatecounts = pd.concat([df_unlab,df_vectors],axis=1)
# The hits are dicts; store them as strings so they can be written to CSV / SQLite.
df_hatecounts['hate_hits'] = df_hatecounts['hate_hits'].astype('string')
df_hatecounts.to_csv(PROJECT_PATH + 'data/unlab_hatecounts.csv')

# Whittle down the vector dataframe: drop the article columns already stored in the
# 'cleaned' table so the two tables can be joined on the index without redundancy.
df_hatecounts_lite = df_hatecounts.drop([
                                         'author',
                                         'text',
                                         'title',
                                         'subtitle',
                                         'timestamp',
                                         'source',
                                         'link'
                                         ],axis=1)
df_hatecounts_lite.to_sql('hatecounts',conn,if_exists='replace')
conn.commit()

# Utterance Generation

Now, we will try to break apart the articles into sentences to have our own set of "utterances" that we can explore and analyse programmatically. 

In [None]:
# Reload the cleaned articles and the per-article hate counts from the database.
df_unlab = pd.read_sql('SELECT * FROM cleaned',conn,index_col='index')

df_hatevecs = pd.read_sql('SELECT * FROM hatecounts',conn,index_col='index')
# Older versions of the hatecounts table still carry these article columns; drop
# them so they do not collide with the same columns in df_unlab. errors='ignore'
# keeps the cell working once the table has been regenerated without them.
df_hatevecs = df_hatevecs.drop(['timestamp','source','link'],axis=1,errors='ignore') ## DELETE THIS AFTER RE-RUNNING THE PROCESSING NOTEBOOK 

df_hatecounts = df_unlab.merge(df_hatevecs,on='index')
# Select the working columns first, then keep only articles with at least one
# hateful n-gram hit. (Previously the filter ran before df_utter_test existed and
# its result was then overwritten by the column selection.)
df_utter_test = df_hatecounts[['source','link','text','hate_score']]
df_utter_test = df_utter_test[df_utter_test['hate_score'] > 0]

In [None]:
# Split every article into sentences. One small frame is collected per article
# and everything is concatenated once at the end: calling pd.concat inside the
# loop re-copies all previous rows on every pass.
utter_chunks = []

for index, row in df_utter_test.iterrows():
    tokens = sent_tokenize(row['text'])

    chunk = pd.DataFrame.from_dict({'sentence':tokens})
    chunk['source'] = row['source']
    chunk['link'] = row['link']
    chunk['article_index'] = index   # index of the article the sentence came from

    utter_chunks.append(chunk)

# ignore_index=True gives every sentence a unique 0..n-1 label. Without it each
# chunk keeps its own 0..k-1 labels, and the duplicated index breaks the later
# column-wise concat with the sentence vectors.
# pd.concat raises on an empty list, hence the fallback to an empty frame.
df_utters = pd.concat(utter_chunks, ignore_index=True) if utter_chunks else pd.DataFrame()

In [None]:
#%%time
# Use the n-gram strings as the vocabulary, consistent with the article-level
# vectorizer. The frame's index is only used when the n-grams have been moved
# there (i.e. there is no 'ngram' column); as loaded from CSV the index is just
# row numbers, which would never match any token.
hate_vocab = list(
    df_hate_words['ngram'] if 'ngram' in df_hate_words.columns else df_hate_words.index
)
sent_cvect = CountVectorizer(
    ngram_range=(1,4),
    tokenizer = tokenize,
    vocabulary = hate_vocab
)


sent_vectors = sent_cvect.fit(df_utters['sentence'])
sent_matrix = sent_vectors.transform(df_utters['sentence'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is
# its replacement. toarray() yields a plain ndarray instead of np.matrix.
df_svectors = pd.DataFrame(sent_matrix.toarray(),columns=sent_vectors.get_feature_names_out())

# Gather hate counts and calculate hate scores (one row per sentence).
df_svectors = df_svectors.apply(tally_counts_doc,axis=1,result_type='expand')
# Column-wise concat aligns on index labels; df_svectors is labelled 0..n-1 in
# sentence order, so give df_utters the same positional labels before joining.
df_shatecounts = pd.concat([df_utters.reset_index(drop=True),df_svectors],axis=1)