<a href="https://colab.research.google.com/github/traubad/Toxic_Spans_Detection/blob/main/Final_Toxic_Spans_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> Toxic Spans Detection </h1>
<h2> SemEval-2021 Task 5 </h2>
<h3> Adam Traub </h3>

In [None]:
# Imports

# Preprocessing stuff
import re
import spacy
import string
import itertools

import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from ast import literal_eval

# Structural Stuff
from gensim.models import Word2Vec
import pandas as pd

# ML Stuff
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Misc.
import sys
import numpy as np

#Only Necessary for google colab
from google.colab import drive

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<h2> Setting up Libraries post import </h2>

In [None]:
# Setup Libraries
# Load spaCy's small English pipeline; the parser below calls `sp(...)` to tokenise and lemmatise text.
sp = spacy.load('en_core_web_sm')

# nltk
# English stop words, stored as a set for fast membership tests while parsing.
stopWords = set(stopwords.words('english'))

# colab
# Mount Google Drive so that the dataset path below can be read (Colab only).
drive.mount('/content/drive/')

# Data
# NOTE(review): hard-coded Drive location of the trial CSV; adjust when running elsewhere.
data_source = '/content/drive/My Drive/ToxicSpans/tsd_trial.csv'

# General System information
# sys.version can span several lines; flatten it so it prints on a single line.
py_version = sys.version.replace('\n',' ')
print("Initial Setup: ")
print(f"\tPython Version: {py_version}")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Initial Setup: 
	Python Version: 3.6.9 (default, Oct  8 2020, 12:12:24)  [GCC 8.4.0]


<h2> Utility functions for printing spans, particularly for printing in color</h2>

In [None]:
# Helper functions mostly for display
class text_color:
    """Namespace of ANSI escape sequences used to style terminal/notebook text."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set by the codes above
    END = '\033[0m'


def color_string(in_string, color=text_color.RED):
    """Return ``in_string`` prefixed with the ``color`` escape code and followed by the reset code."""
    return "{}{}{}".format(color, in_string, text_color.END)


def color_span(span, original_text, color=text_color.RED):
    """Return ``original_text`` with the characters at the ``span`` offsets coloured.

    Only the characters whose offsets appear in ``span`` are highlighted. The
    text is rebuilt slice by slice from those offsets rather than by searching
    for the highlighted substrings, so other occurrences of the same substring
    stay uncoloured and escape codes that were already inserted are never
    matched and re-coloured.

    Args:
        span: iterable of integer character offsets into ``original_text``;
            may be unsorted or contain duplicates. Offsets outside the text
            are ignored.
        original_text: the text to highlight.
        color: escape sequence placed before each highlighted run.

    Returns:
        The text with every contiguous run of offsets wrapped in ``color`` and
        the reset code.
    """
    # Sort and de-duplicate so the offsets form clean, non-overlapping runs.
    offsets = sorted({i for i in span if 0 <= i < len(original_text)})

    pieces = []
    cursor = 0  # first character of original_text not yet emitted
    for start, end in contiguous_ranges(offsets):
        pieces.append(original_text[cursor:start])
        pieces.append(color_string(original_text[start:end + 1], color))
        cursor = end + 1
    pieces.append(original_text[cursor:])
    return "".join(pieces)


def contiguous_ranges(span_list):
    """Collapse a sequence of offsets into inclusive (first, last) runs.

    Example: [1, 2, 3, 5, 6, 7] -> [(1, 3), (5, 7)].
    Adapted from: https://github.com/ipavlopoulos/toxic_spans/blob/master/evaluation/fix_spans.py
    """
    runs = []
    run_key = None
    for position, offset in enumerate(span_list):
        # Consecutive offsets keep a constant (offset - position) difference,
        # so a change in that difference marks the start of a new run.
        key = offset - position
        if runs and key == run_key:
            runs[-1] = (runs[-1][0], offset)
        else:
            runs.append((offset, offset))
            run_key = key
    return runs

    
def print_row_and_span(i, text, span, color=text_color.RED):
    """Print a one-based row label, the span's contiguous ranges and the highlighted text.

    Line breaks inside the text are indented by four spaces so continuation
    lines line up under the first line of the comment.
    """
    highlighted = color_span(span, text, color)
    color_text = highlighted.replace('\n','\n    ')
    ranges = contiguous_ranges(span)
    print(f"{i+1:02}: {ranges}\n    {color_text}\n")

<h2> General Options</h2>

In [None]:
# Candidate word-level classifiers, keyed by the short name used to pick one below.
# Note that every estimator listed here is instantiated, not just the selected one.
CLASSIFIERS = {
    'random_forest': RandomForestClassifier(n_jobs=2, random_state=0), #Warning: Prohibitively Slow!
    'sgd': SGDClassifier(loss='log'), # linear model fitted with stochastic gradient descent (logistic loss)
    'knn': KNeighborsClassifier(n_neighbors=9), #k nearest neighbors - works better than sgd
}

# Switches read by the parser, the data-loading cell and the training cell.
PARSING_OPTIONS = {
    'use_lemmas': True, # tokenise with spaCy and work on lemmas instead of whitespace-split words
    'remove_extra_whitespace': True, # collapse runs of whitespace into single spaces
    'remove_punctuation': True, # strip every character that is neither a word character nor whitespace
    'remove_pronouns': True, #Requires use_lemmas
    'remove_stopwords': True, #Requires use_lemmas
    'make_lowercase': True,
    'max_span_length': 60, # rows whose span list has more entries than this are dropped when loading
    'min_span_length': 3, # parsed words shorter than this many characters are discarded
    'classifier': CLASSIFIERS['knn'], # the estimator that the training cell fits
}

# Keyword arguments forwarded to gensim's Word2Vec when the embeddings are trained.
W2V_PARAMS = {
    'W2V_window': 10,
    'W2V_size': 100, # embedding dimensionality; with SGD you can go pretty high with this
    'W2V_min_count': 1, #probably keep this at 1
    'W2V_workers': 4, #probably keep this at 4
}

<h2> Parser </h2>

In [None]:
def string_to_parsed_list(in_string, options=None):
    """Normalise a piece of text and split it into a list of words.

    Args:
        in_string: the raw text to parse.
        options: dict of parsing switches; defaults to the module-level
            ``PARSING_OPTIONS``. Recognised keys: ``make_lowercase``,
            ``remove_extra_whitespace``, ``remove_punctuation``,
            ``use_lemmas``, ``remove_pronouns``, ``remove_stopwords``,
            ``preserve_original`` and ``min_span_length``. A missing key is
            treated as switched off.

    Returns:
        Always a list. With ``use_lemmas`` it holds lemma strings, or the
        spaCy tokens themselves when ``preserve_original`` is set; without
        ``use_lemmas`` it holds the space-separated words.
    """
    if options is None:
        options = PARSING_OPTIONS

    if options.get('make_lowercase'):
        in_string = in_string.lower()

    if options.get('remove_extra_whitespace'):
        in_string = " ".join(in_string.split())

    if options.get('remove_punctuation'):
        in_string = re.sub(r'[^\w\s]', '', in_string)

    if options.get('use_lemmas'):
        out = list(sp(in_string))

        if options.get('remove_pronouns'):
            # NOTE(review): "-PRON-" is presumably the pronoun lemma of the installed
            # spaCy version; newer releases may lemmatise pronouns differently - confirm.
            out = [word for word in out if str(word.lemma_) != "-PRON-"]

        if options.get('remove_stopwords'):
            out = [word for word in out if str(word.lemma_) not in stopWords]

        if not options.get('preserve_original', False):
            out = [word.lemma_ for word in out]

    else:
        out = in_string.split(' ')

    # Use the caller's options (not the global defaults) for the length filter.
    min_length = options.get('min_span_length')
    if min_length:
        out = [word for word in out if len(str(word)) >= min_length]

    return out

<h2> Read the data in and provide some basic information</h2>

In [None]:
# Read data
data = pd.read_csv(data_source)

# Make spans into list
# literal_eval turns each cell's text into the Python object it spells out.
data["spans"] = data.spans.apply(literal_eval)
original_length = len(data)

# Filter any rows with spans bigger than max_span_length
# (the length compared is the number of entries in each row's span list)
data = data[data['spans'].map(len) <= PARSING_OPTIONS['max_span_length']]

print(f"Data Loaded, there were originally {original_length} entries")
print(f"After removing all spans with a length greater than {PARSING_OPTIONS['max_span_length']} characters, there are now {len(data)} entries.\n")

# Preview the first five remaining rows with their spans highlighted;
# `i` is the row's index label, which print_row_and_span shows one-based.
for i, row in data.head(5).iterrows():
    print_row_and_span(i, row['text'], row['spans'], text_color.RED)

Data Loaded, there were originally 690 entries
After removing all spans with a length greater than 60 characters, there are now 670 entries.

01: [(15, 19), (27, 31)]
    Because he's a [91mmoron[0m and a [91mbigot[0m. It's not any more complicated than that.

02: [(29, 34)]
    How about we stop protecting [91midiots[0m and let nature add some bleach to the gene pool. We can always submit their names for the Darwin awards.

03: [(166, 171)]
    If people  were  smart, they would  Boycott this  inept  airline,  but   they  are  not  smart,  so   rogue  businesses  like  this  one,   still thrive   taking the [91midiots[0m  for  a ride...

04: [(87, 92)]
    Trump Claimed that Russia will never invade the Ukraine, when Russia already has - how [91mstupid[0m can people be?

05: []
    As long as your willing to pay a lot more for products you buy, then fine.
    But you better not be going to Costco and Walmart to buy stuff because it's cheaper.
    If so, we get to call you a 

<h2>Split the data for testing and training</h2>

In [None]:
# Train test split (70-30)
# Rows are shuffled before splitting; the fixed random_state keeps the split reproducible.
train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)

<h2>Functions for extracting and labelling the toxic and non-toxic words</h2>
<h3>Also where most of the parsing takes place</h3>

In [None]:
# Creating word train data set
# Getting toxic words
def get_toxic_words(data):
    """Return the parsed words made of the characters listed in a row's spans.

    ``data`` is a single row providing ``'text'`` and ``'spans'``. Characters
    at the span offsets are concatenated, with a space inserted wherever two
    successive offsets are not adjacent, and the result is handed to
    ``string_to_parsed_list``.
    """
    text = data['text']
    pieces = []
    last_offset = -99  # sentinel: the first offset always starts a new chunk

    for offset in data['spans']:
        if offset - last_offset != 1:
            pieces.append(' ')
        pieces.append(text[offset])
        last_offset = offset

    return string_to_parsed_list(''.join(pieces))

# Getting non toxic words
def get_non_toxic_words(data):
    """Return the parsed words made of the characters NOT listed in a row's spans.

    ``data`` is a single row providing ``'text'`` and ``'spans'``. Characters
    whose offsets are absent from the spans are concatenated, with a space
    inserted wherever a span character was skipped, and the result is handed
    to ``string_to_parsed_list``.
    """
    # A set makes each membership test constant-time instead of a list scan
    # for every character of the comment.
    toxic_offsets = set(data['spans'])
    pieces = []
    prev_idx = -99  # sentinel: the first kept character always starts a new chunk

    for idx, char in enumerate(data['text']):
        if idx in toxic_offsets:
            continue
        if idx != prev_idx + 1:
            pieces.append(' ')
        pieces.append(char)
        prev_idx = idx

    return string_to_parsed_list(''.join(pieces))

# create the word_data for training and testing
def create_word_data(data):
    """Build a labelled word table from a frame of comments.

    Args:
        data: DataFrame with ``'text'`` and ``'spans'`` columns (for example
            the training or the test split).

    Returns:
        DataFrame with a ``'Words'`` column and a ``'Labels'`` column
        (0.0 = word seen outside the spans, 1.0 = word seen inside a span).
        Non-toxic words come first, then toxic words; each group is unique and
        sorted so the row order is the same on every run. A word that occurs
        both inside and outside spans appears once in each group.
    """
    # Use the frame that was passed in, not the global training split.
    toxic_lists = data.apply(get_toxic_words, axis=1).values
    toxic_words = sorted({word for words in toxic_lists for word in words})

    non_toxic_lists = data.apply(get_non_toxic_words, axis=1).values
    non_toxic_words = sorted({word for words in non_toxic_lists for word in words})

    words = non_toxic_words + toxic_words
    labels = np.append(np.zeros(len(non_toxic_words)), np.ones(len(toxic_words)))

    word_data = pd.DataFrame(list(zip(words, labels)), columns=['Words', 'Labels'])

    return word_data

In [None]:
# create word sets
# One labelled word table for the training split and one for the test split.
word_train_data = create_word_data(train)
word_test_data = create_word_data(test)

<h2>W2V embeddings for training data</h2>

In [None]:
# Put word embedding in word data
def get_word_embedding(word):
    """Return the Word2Vec vector for ``word``, or a zero vector if it is unknown.

    Only the missing-vocabulary lookup (``KeyError``) is turned into a zero
    vector; any other error is allowed to propagate instead of being hidden.
    """
    try:
        return model.wv[word]
    except KeyError:
        # Out-of-vocabulary word: represent it with all zeros.
        return np.zeros(W2V_PARAMS['W2V_size'])

# Create W2V embeddings
# One parsed word list per training comment. Iterating the Series directly
# yields its values and avoids the deprecated Series.iteritems().
training_comment_words = [string_to_parsed_list(text) for text in train['text']]

# Creates numerical representations of text data
# NOTE(review): the `size` keyword is presumably tied to the installed gensim
# version; newer releases name it `vector_size` - confirm before upgrading.
model = Word2Vec(sentences=training_comment_words,
                 size=W2V_PARAMS['W2V_size'],
                 window=W2V_PARAMS['W2V_window'],
                 min_count=W2V_PARAMS['W2V_min_count'],
                 workers=W2V_PARAMS['W2V_workers'])

# Attach an embedding to every word of both word tables (zeros when unknown).
word_train_data['Vectors'] = word_train_data['Words'].apply(get_word_embedding)
word_test_data['Vectors'] = word_test_data['Words'].apply(get_word_embedding)


In [None]:
# Inspect the last 20 rows of the test word table (word, label and embedding).
word_test_data.tail(20)

Unnamed: 0,Words,Labels,Vectors
3166,islamic,1.0,"[-0.0028621643, 0.0043536727, -0.0007164391, 0.0015838945, 0.0047234236, 0.0028461756, -0.002154229, -0.003112106, -0.0019454921, 0.0013121659, 0.003866197, 0.0046383725, -0.00047905263, -0.0034406784, 0.00022200674, 0.0013823613, 0.0006274088, -0.0048103263, -0.0022406883, -9.797256e-05, -0.0023656022, 0.0034154053, 0.0023334194, -0.0007789052, -0.0029026803, -0.0022613995, 0.0025104408, -0.002418102, 0.0017173865, 0.005014782, -0.002665927, 0.00044265378, -0.0005802945, -0.0014338188, 0.0048714103, 0.001570337, -0.0012572918, 0.0034740267, -0.0034886478, -0.0041079717, 0.0010151163, -0.00184437, 0.0021448105, -0.0035009084, 0.0022382652, -0.004266118, -0.0009133386, 0.0020898038, -0.0036836239, -0.0027017111, -0.003571011, -0.0011280326, -0.0039671427, 0.00053747284, 0.004077051, 0.0014892444, 0.0034610506, 0.0013702804, 0.0025870393, -0.003546858, -0.00091649574, 0.003981613, -0.0022639947, 0.0011767818, -0.002057557, -0.00039286818, -0.0045079715, 0.0033587983, -0.003996367, 0.0028331708, 0.0031462598, 0.0032210744, -0.004713818, 0.0002955223, -0.0036015003, -0.0008857186, 0.0012627362, -0.0004130935, -0.0027922562, -0.0032261908, -0.0032241496, 0.0028531633, 0.0046008, -0.0023259197, 0.0058195856, -0.0015388211, -0.00077762356, 0.00014075126, -0.0046495204, -0.002408899, 0.0024491148, 0.00039252313, -0.003522276, -0.0047478913, -0.0006251089, -0.0033334927, -0.0029991653, 0.0048216414, 0.0018350246, -0.0009982756]"
3167,rape,1.0,"[-0.0023681738, 0.004270104, 0.00046689616, -0.0010216752, 0.001001719, -0.0008725737, 0.0046186675, 0.004579107, 0.0010339294, -0.004176913, -0.0043805875, 0.004821485, -0.0043637482, 0.0035740419, 0.0018190277, 0.0012261763, 0.0021079946, 0.0012234831, 0.0008357547, -0.0041445787, 0.003186188, -0.0025788532, 0.0037856766, -0.004658322, -0.0006342563, -0.0025218183, 0.001663197, -0.0019419944, -0.0009568514, 0.003554747, -0.0037047067, -0.0021241934, -0.0043449667, 0.0010266593, -0.004848526, -0.0014186555, 0.0023727168, -0.003387948, -0.0042369864, -0.0036686943, -0.003299002, 0.0019611756, 0.0008307423, 0.0047138915, 0.004338172, -0.0045322357, -0.0028346742, -0.0038743296, 0.0014337479, -0.003279065, 0.003259138, -0.004606722, -0.005026784, 0.0010956711, -0.004744649, -0.001138697, -0.0037304529, -0.00087971095, 0.0036580048, -0.00038486562, -0.003204813, -0.0033170204, -0.0021093646, -0.004105667, 0.0006727908, 0.0035869954, -0.0010005482, -0.003105616, -0.004052837, 0.004370245, -0.004529118, 0.00071398844, -0.0022298752, -0.0044920747, -0.0038450058, -0.003924193, -0.0021931124, 0.0029101684, -0.0023469331, -0.0039206557, -0.0007101661, 0.00043048494, -0.001643447, 0.003765482, -0.002968095, -0.0027054274, -0.001546567, 0.0028314616, 0.0031002872, -0.0013520247, -0.004076317, 0.001185844, 0.00350169, -0.0016041921, 0.0029690466, -0.0038322539, 0.0005121846, 0.0018182007, 0.0004415427, 0.0048511783]"
3168,life,1.0,"[0.0029740266, -0.0050461097, 0.0049759923, -0.0030307763, -0.0010227744, -0.0021292048, -0.0054388735, -0.0043151365, 7.012336e-05, 0.003172196, -0.00051954336, -0.00014361329, -0.0033014917, -0.00072858913, -0.0043873014, -0.003558395, -0.0012333348, 0.0021637848, 0.003155723, 0.00011257008, 0.0022502567, 0.003063488, -0.00327663, -0.00037263634, -0.005237315, 0.0022799163, 0.0051655425, -0.0011916348, -0.0006823736, 0.0035622038, 0.0014717311, -0.0009845953, -0.0023678702, 0.0027083193, -0.003556968, -0.0033825939, -0.0022389835, 2.4573957e-05, -0.0047494993, 0.0046992493, -0.00039438932, 0.0034490358, 0.0013855947, -0.0025891066, -0.00057229935, -0.000894425, -0.0027861148, -0.0038590706, -0.000113365466, -0.0034490847, -0.004171869, 0.0018805754, 5.4559445e-05, 0.0015444162, 0.000903341, -0.0022167033, 0.0034084425, 0.0027352998, 0.00029633468, -0.0011006548, -0.0020788605, -0.003946295, -0.0038432092, -0.00014417284, 0.0019946794, 0.00605191, 0.0044729183, 0.0027464796, 0.003056397, 0.0010854093, 0.0010101815, -0.0034189452, 0.0016933446, -0.0069549093, 0.001548447, -0.0005874611, 0.0038149895, -0.0016922383, 0.0057345564, 0.0038924369, -0.0040979297, 0.0020510252, -0.00052605546, 0.0022864896, 0.00647195, 0.00047592697, 0.0047727856, 0.004933053, 0.0024341962, 0.0012743352, -0.0015562606, -0.004779814, 0.005453027, -0.001113747, 0.0019608596, 0.004408472, 0.003944105, 0.002138879, -0.00026501575, -0.00049644744]"
3169,insane,1.0,"[-0.0044012642, -0.0033602018, 0.0035635438, 0.0004076728, -0.004152443, -0.0016937134, -0.0009944472, 0.0030915986, 0.0016203744, -5.042108e-05, 0.0038834168, 0.0019953488, 0.00462942, 0.0047558555, 0.002163681, 0.0035465662, 0.0010308962, -0.0022365886, -0.002142163, -0.0038488738, 0.0032998722, 0.0025424273, -0.0012962888, 0.001704151, -0.0011974466, 6.138586e-05, -0.004977808, 0.002668765, 0.002479825, 0.0034335193, 0.0002527408, 0.0046131453, -0.0045648557, -0.0035658027, -0.00029677423, -0.0041108276, -0.0022264041, 0.00147899, 0.00086579967, -0.0029472394, 0.0048859976, 0.004722974, -0.0044804914, 0.001033903, -0.0010357946, 0.0011589379, -0.0019575863, -0.0010918702, -0.0044498537, -0.0024383268, 0.0006121154, -0.0020163888, -0.004911011, 0.0010142783, -0.002110889, -0.00204405, 0.0011061202, 0.003395659, 0.0034492107, 1.1963706e-05, 0.004483018, 0.0020052819, 0.004445696, -0.004410852, 0.0001258426, 0.0007505061, 0.0015159216, -0.0040639285, -0.0021518285, -0.0042015277, 0.0048307464, 0.004071015, -7.933073e-05, 0.00014054573, 0.0016988066, -0.0015544622, 0.0046322304, 0.0012064356, 0.0014964333, 0.002422024, 0.0038238054, -0.00147695, 0.0020959063, 0.004045554, -0.002394329, 0.00080578716, 0.003034537, -0.004037864, 0.0018221785, 0.0017419, -0.002676754, 0.0040367777, 0.004704439, 0.002052649, 0.00033485465, 0.0039532245, -0.0024426845, 0.0031706432, 0.002113079, -0.0045122285]"
3170,lottery,1.0,"[0.00041313926, -0.0041576675, -0.0004414657, 0.0010554189, 0.0036100545, -0.0040722336, -0.0047708848, -0.00046535878, 0.0017934928, 0.004465578, -0.0038383214, 0.001294045, 0.0016036134, -0.0016960304, -0.0002202479, -0.0049443785, -0.0016454596, -0.0035851921, -0.0019853783, 0.00090069, 0.0011334446, 0.0033987178, 0.003824428, 0.0046915216, -0.0009408558, 0.0014959176, 0.0020953093, 0.0002828599, 0.0014086971, -0.0046314164, 0.0021098228, -0.0026210228, -0.0033308223, -0.001245969, -0.0010454544, 0.004683655, 0.0029243967, -0.0023293816, 0.004054181, -0.0013108205, 0.0035699778, 0.0009617653, 0.004366967, 0.004059054, -0.00085632084, 0.00027150323, -0.004469228, -0.003514133, 0.003922262, 0.0038702285, -0.0004768722, -0.0024968123, -0.0022430085, -0.0010960519, 0.00013541449, 0.0017384891, 0.0005121637, -0.0025026968, -0.0032680219, -0.00076599984, 0.0048763524, 0.0031897658, 0.000926116, -0.002125933, -0.000170657, -0.0002682971, -0.0027603982, -0.0013690154, -0.0036966098, -0.0045394897, 0.0034406248, -0.0043951133, 0.0038193061, -0.0005835354, 0.0028607708, 0.00053054345, -0.0037606924, -0.00073535665, 0.0033162571, -0.002057096, -0.00299256, -0.0025797125, 0.0008401297, -0.0023792498, -0.003300436, -0.0044426904, 0.004272048, 0.002868692, -0.0024132195, 0.0017789466, -0.0036831722, 0.0014798486, -0.0043097343, 0.0019775392, 0.00011084482, 0.00089546986, 0.0014561258, 0.003990974, 0.003548329, 0.002526719]"
3171,motivated,1.0,"[-0.0015959673, 0.002825524, 0.0011258351, 3.3674733e-05, 0.0005431843, 0.0018166584, -0.0010526708, 0.0026466418, 0.0006010056, -0.0005700053, -0.0008631669, -0.004179617, 0.0025663387, 0.0019506868, 0.004787658, 0.0003527503, -0.0024169902, 0.004690127, -0.0030231324, -0.0026999337, 0.004251783, -0.0026488483, 0.0015340488, -0.0022380415, 0.0037232395, -0.0012922016, -0.0019418773, 0.0010130233, -0.0016444238, 0.0046961596, 0.00075732486, 0.0009830906, -0.0014260692, 0.0012786668, 0.0034515571, -0.0014463733, -0.0006283126, 0.003159949, -0.0029596859, 0.0038269216, -0.00211653, -0.0030614804, 0.003546953, -0.0029099663, 0.0039033464, 0.00027415267, 0.003484161, 0.0034135303, 0.0033484467, 0.0017499938, -0.0019476765, 0.0020458873, -0.0019249029, 0.0026932042, -0.0022005797, -0.0049785613, 0.0046000117, -0.0037397733, -0.00032329484, 0.00089587097, 0.0017732889, -0.0012323689, 0.00384815, -0.00011647246, -0.005004403, 0.0016246734, -0.002607581, 0.0017352513, 0.003609581, 0.0040608724, -9.843945e-05, 0.002972877, -0.000705116, -0.004861457, 0.0007014414, -0.0010399872, 0.0015045619, -0.0033236493, 0.00033897962, 0.0009255257, -0.0037819126, -0.0006226241, -0.004870781, 0.00031474547, 0.0019341719, 0.004505032, 0.002307422, 8.6219785e-05, -0.003126864, 0.003223737, -0.0046350686, 0.0024211812, -0.0020932907, 0.0019972129, 0.0036897927, 0.0022590212, -0.0018059209, -0.00019118843, 0.0021722824, -0.0048092743]"
3172,rapist,1.0,"[0.003488925, 0.0036103332, 0.0016737877, 0.003644281, -0.0002579896, 0.004378434, -0.0044630724, -0.0036914714, -0.0036471656, 0.0033206209, -0.0025662254, -0.0005449976, -0.0041688895, 0.0046326937, 0.0016331298, 0.00016928316, 0.0015099787, -0.003999317, 0.0031816799, -0.0016224548, -0.00013586166, -0.0019242417, -0.0009942974, -0.00454932, 0.0039459667, 0.001480809, 0.00253115, 0.004736394, 0.00033866832, -0.0018296856, -0.0043105315, -0.0035783183, -0.0046912995, -0.0034964855, -0.004965669, 0.0043705422, 0.004581204, -0.0035238508, -0.004166949, -0.0040183812, 0.0014765316, 0.0002410213, -0.0030618126, 0.001105914, -0.0038750167, -0.004236109, 0.0009191605, 0.00035757903, -0.0029628791, -0.0001604309, -0.0045964634, -0.0041065225, 0.0021452585, -0.004281757, 0.0038688583, 0.0026105414, 0.004732936, -0.0009390201, -0.00033095555, 0.0033376182, -0.00013530052, 0.0022528127, 0.0028348432, -0.00260192, -0.0016749598, 0.0024809903, 0.004273335, 0.0030800109, 0.004683346, 0.0010292928, -7.220442e-05, -0.0032843323, 0.004956945, 0.0045055, -0.0007804619, -0.0019915306, 0.001701191, -0.0029048363, -0.0037554617, -0.0014885807, -0.0041529564, -0.0025713323, 0.0049482207, 0.0022452045, 0.00064968475, 0.0005398046, -0.0023407915, -0.003024487, 0.0012113998, 0.0013809418, -0.002185477, -0.0024536513, 0.0010090269, 0.00450074, -0.003827326, -0.002255738, 0.00446195, 0.002965359, -0.00013657099, -0.0010552299]"
3173,fcking,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3174,obamas,1.0,"[-4.375803e-05, -0.0012844215, -0.0014048184, -0.00042203537, 0.00060565816, 0.003917285, -0.005017278, 0.002090899, -0.0037039386, 0.0008082206, -0.00031446753, 0.0018563408, 0.00021461045, -0.004586454, -0.0036464925, -0.0020598446, -0.004122477, 0.004082756, 9.565303e-05, 0.00044926972, 0.0040217447, -0.0035987669, 0.004637142, -0.000489022, -0.004903928, 0.0018013291, 0.001580861, -0.0022575255, -0.0029677127, 0.0011456361, 0.004804055, -0.0030920506, -0.0022588503, -0.004239357, -0.0035178715, -0.0027266094, -0.0008626547, 0.0012937419, -0.0021502862, 0.0049769715, -0.0012509507, 0.0013105955, 0.0013948062, -0.00049204973, -0.0023156565, 0.0028228746, 0.0032718673, -0.0046288795, -0.0012586237, 0.0007250538, -0.00073369656, -0.0010277062, 0.00425094, 0.00065555004, -0.0025047609, -0.0042049545, -0.0045745326, 0.001358853, -0.0028147174, -0.002538374, 0.0017224741, -0.001550756, -0.004469108, -0.00010572922, 0.000106316525, 0.00055815245, 0.0036560467, -0.0013737753, -0.0047666216, -0.00062147493, 0.003667654, 0.0005760347, -0.0025944542, -0.0048347358, -0.0015049017, 0.00029677947, -0.0018197502, -0.001785184, -0.0008348397, 0.004967081, -0.0017941348, 0.003241748, 0.00091655867, 0.00062330044, 0.0045947977, 0.00086346606, 0.0022181366, 0.0050637443, -0.0006043621, 3.74757e-05, -0.0036751546, -0.0024283167, 0.0037538381, -0.003967302, -0.003577775, 0.0033401204, 0.0044385055, -0.0003516606, 0.000660392, 0.0022485207]"
3175,ignorance,1.0,"[-0.0007376733, 0.002737437, 0.00029969955, -0.003026184, 0.0021755495, -0.0031317247, 0.003455045, 0.0038875537, 0.004663703, 0.0042423876, 0.0008401908, -0.00075805385, 0.0007821644, -0.00477783, -0.0031823153, 0.002176165, -0.0044264533, 0.00054304965, 0.0022384846, -0.0049980464, 0.0024058707, 0.0037940412, 0.00091644644, 0.0032739101, 0.0033475468, -0.0005075915, -0.003446661, -0.0037025523, -0.0039636777, -0.0008812311, 0.0015364863, 0.0033801636, -0.0044402867, 0.0008072917, 0.0048026266, -0.004680123, 0.004074633, -0.0016873111, 0.0031614539, -0.0030107552, 0.0040736585, 0.003555375, -0.0017870378, 0.0018560034, 0.0045949044, 0.003358691, -0.0011777625, -0.0014734877, -0.004377258, 0.002438186, -0.0017283305, 0.0031748512, 0.0020331878, 0.003426497, -0.0039890534, -0.0035438698, -0.0010483592, -0.00024613607, -0.0019013322, 0.0006340083, -0.0027096877, 0.003591376, -0.0027325538, -0.0004421699, 0.003526094, -0.0045399736, -0.00096970674, -0.0038078043, -0.0034073535, -0.0034557434, -0.0012586341, 0.00079782854, 0.0039083385, -0.0030250535, 0.0047483556, 0.0032042102, 0.00092373585, 0.004259555, -0.0030039211, -0.004402609, 0.0038270468, -0.0007554159, -0.0022171666, -0.00011033221, 0.002913784, 0.0015629492, 0.0016030751, 0.003496401, 0.0040278863, 0.0032663967, 0.00111163, -0.0035617212, -0.00481866, -0.0010685812, 0.0023712271, 0.0034399806, -0.0012799265, 0.0047482555, 0.00054799777, 0.0026551585]"


<h2>Train the model</h2>

In [None]:
# Train ML model
# Give word embedding and if the word is toxic or not (label)
# If word not in vocabulary, give 0 vector
classifier = PARSING_OPTIONS['classifier']

# Target: the 'Labels' column of the training word table
labels = word_train_data['Labels']

# NOTE: despite their names, `x` and `y` both hold FEATURES (not a features/target pair).
# `x` is the array of per-word embedding vectors; `y` turns it into a list of
# lists, where each inner list is one word embedding.
x = word_train_data['Vectors'].values
y = [list(z) for z in x] 

# use hstack to add additional features

# Fit the selected classifier on the embeddings and their labels
classifier.fit(y, labels)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

<h2>Make Predictions</h2>

In [None]:
# Predict, word by word, whether each test comment contains toxic words.
# For every word predicted toxic, record the character offsets of all of its
# occurrences in the original comment.
result = []      # one sorted list of character offsets per test comment
found = 0        # tokens that have a Word2Vec embedding
not_found = 0    # out-of-vocabulary tokens (represented by a zero vector)

test_comment_word_sets = []
all_bad_words = []

# When lemmatising, keep the spaCy tokens instead of bare lemmas so the
# original surface form can be located in the raw comment text.
prediction_options = {**PARSING_OPTIONS,
                      'preserve_original': PARSING_OPTIONS['use_lemmas']}

# Iterating the Series directly avoids the deprecated Series.iteritems().
test_texts = list(test['text'])

for text in test_texts:
    test_comment_word_sets.append(
        string_to_parsed_list(text, options=prediction_options))

for text, test_comment_word_set in zip(test_texts, test_comment_word_sets):
    # Hoisted out of the inner loops: the comment and its lowercase form are
    # the same for every word and every candidate position.
    lowered_text = text.lower()
    span = []
    bad_word = []

    for word in test_comment_word_set:
        original_word = str(word)
        if PARSING_OPTIONS['use_lemmas']:
            word = str(word.lemma_)

        # Get word embedding
        try:
            embedding = model.wv[word]
            found += 1
        except KeyError:
            # Word unseen during Word2Vec training: represent it with zeros.
            embedding = np.zeros(W2V_PARAMS['W2V_size'])
            not_found += 1

        # Make prediction
        predictor = embedding.reshape(1, -1)
        predictor.setflags(write=True)
        prediction = classifier.predict(predictor)[0]

        if prediction != 1:
            continue
        if word in bad_word:
            continue  # this word's occurrences were already recorded
        bad_word.append(word)

        # Every position where the surface form occurs (case-insensitive).
        needle = original_word.lower()
        span_begins = [i for i in range(len(text))
                       if lowered_text.startswith(needle, i)]

        for span_begin in span_begins:
            span.extend(range(span_begin, span_begin + len(original_word)))

    # Overlapping matches from different words can repeat offsets; store each
    # offset once, in ascending order.
    result.append(sorted(set(span)))
    all_bad_words.extend(b.lower() for b in bad_word)

all_bad_words = set(all_bad_words)

In [None]:
# Vocabulary coverage of the test tokens and the set of words predicted toxic.
print(f"Words Found: {found}")
print(f"Words Not Found: {not_found}")
print(f"Bad Words: {all_bad_words}")

Words Found: 2590
Words Not Found: 834
Bad Words: {'airline', 'headline', 'man', 'httpwwwfactcheckorg201601obamasnumbersjanuary2016update', 'monster', 'timely', 'peek', 'dumpy', 'confused', 'heartily', 'rear', 'againanother', 'sentence', 'confuse', 'crp', 'brink', 'venom', 'bloody', 'present', 'nail', 'virtually', 'review', 'monthly', 'idiotic', 'trap', 'favour', 'photo', 'normal', 'mulcair', 'surprisingly', 'ready', 'forth', 'washing', 'wit', 'carlaw', 'suck', 'nkorea', 'dime', 'feature', 'thatya', '1st', 'ethnicity', 'pansy', 'protein', 'commission', 'mayor', 'mileage', 'existential', 'bloodline', 'bye', 'bozell', 'willfully', 'rioter', 'pretend', 'psychotic', 'naturally', 'derange', 'kampf', 'jiberish', 'deficiency', 'cram', 'audience', 'conflate', 'reckless', 'enuf', 'repeatedly', 'eager', '2500', 'schoolwork', 'degraded', 'inauguration', 'speed', 'staff', 'odor', 'opt', 'blow', 'purposely', '147', 'earth', 'surprise', 'bow', 'claw', 'asian', 'sociopath', 'bicyclist', 'socialism', 

<h2>Display raw results</h2>

In [None]:
# Do not truncate long cell contents when pandas displays text columns.
# None is the supported way to remove the width limit; -1 is deprecated.
pd.set_option('display.max_colwidth', None)

# Show the first ten test comments with their predicted spans highlighted.
# `result` is positional (one entry per test row, in order), so it is indexed
# with the enumerate counter, while the printed label uses the row's index.
for i, (row_i, row) in enumerate(test.head(10).iterrows()):
    print_row_and_span(row_i, row['text'], result[i])

374: [(30, 39), (69, 76), (86, 90), (123, 128), (130, 134)]
    She should cut and run. It is [91mridiculous[0m for her to be carrying that [91mmortgage[0m at this [91mstage[0m - especially when she wants to [91mretire[0m [91mearly[0m. She doesn't need the house.

163: [(11, 14)]
    You are an [91mugly[0m American.

496: [(0, 3), (46, 50)]
    [91mHeck[0m, I remember when you couldn't leave your [91mhorse[0m tied up outside without some idiot stealing it.

659: [(8, 17), (20, 24)]
    Are you [91mILLITERATE[0m, [91mChuck[0m?

285: [(3, 13), (25, 35), (129, 133)]
    My [91mgrandfather[0m must have [91mparaphrased[0m this, he simply taught me, "Never argue with an idiot, you'll never win." Sound words I use [91moften[0m. The main reason I never argue with myself.

375: []
    You gotta hit the Americans where it hurts.

321: [(20, 28), (63, 69), (97, 104), (121, 125), (131, 140), (170, 182), (256, 265), (314, 325), (352, 358), (400, 411), (431, 435), (512, 522

  """Entry point for launching an IPython kernel.


<h2>Evaluation</h2>

In [None]:
# F1 = 2 * (Recall * Precision) / (Recall + Precision)
def f1(predictions, gold):
    """Character-offset F1 between a predicted span and the gold span.

    Args:
        predictions: iterable of predicted character offsets.
        gold: iterable of gold character offsets.

    Returns:
        1.0 when both are empty (correctly predicting "nothing toxic"),
        0.0 when exactly one of them is empty or they share no offset,
        otherwise the harmonic mean of precision and recall, computed as
        2 * |overlap| / (|predictions| + |gold|).
    """
    predicted = set(predictions)
    reference = set(gold)

    if not reference:
        # Nothing to find: an empty prediction is exactly right.
        return 1.0 if not predicted else 0.0
    if not predicted:
        return 0.0

    overlap = len(predicted & reference)
    return 2 * overlap / (len(predicted) + len(reference))


def precision(predictions, gold): # TP / (TP + FP)
    """Fraction of the predicted offsets that are also gold offsets (0 if nothing was predicted)."""
    predicted = set(predictions)
    reference = set(gold)
    if not predicted:
        return 0
    # TP + FP is simply the number of distinct predicted offsets.
    return len(predicted & reference) / len(predicted)


def recall(predictions, gold): # TP / (TP + FN)
    """Fraction of the gold offsets that were predicted (0 if there are no gold offsets)."""
    predicted = set(predictions)
    reference = set(gold)
    if not reference:
        return 0
    # TP + FN is simply the number of distinct gold offsets.
    return len(predicted & reference) / len(reference)


def calculate_metric(result, method, gold_spans=None):
    """Average a per-comment metric over all comments.

    The metric functions above score a single comment; this runs ``method``
    on every (prediction, gold) pair and returns the mean.

    Args:
        result: list of predicted offset lists, one per comment.
        method: callable ``method(predictions, gold)`` returning a number.
        gold_spans: iterable of gold offset lists aligned with ``result``.
            Defaults to the ``'spans'`` column of the global ``test`` frame.

    Returns:
        The mean score as a float; 0.0 when there are no comments.

    Raises:
        ValueError: if ``result`` and ``gold_spans`` differ in length, since
            the mean would otherwise be taken over mismatched pairs.
    """
    if gold_spans is None:
        gold_spans = test['spans']
    gold_spans = list(gold_spans)

    if len(result) != len(gold_spans):
        raise ValueError(
            f"result has {len(result)} entries but there are {len(gold_spans)} gold spans")
    if not result:
        return 0.0

    # Score every comment, so the divisor matches the number of summed terms.
    tally = 0.0
    for predicted, gold in zip(result, gold_spans):
        tally += method(predicted, gold)
    return tally / len(result)
    
# Report each metric as computed by calculate_metric over `result`.
print(f"f1-score: {calculate_metric(result, f1)}")
print(f"Precision: {calculate_metric(result, precision)}")
print(f"Recall: {calculate_metric(result, recall)}")

f1-score: 0.011794400431858306
Precision: 0.010165935925696219
Recall: 0.01589876703439325
