In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [2]:
wpt = WordPunctTokenizer()
tbwt = TreebankWordTokenizer()
p_stem = PorterStemmer()
w_lem = WordNetLemmatizer()

In [3]:
# Load training dataset to dataframe 'df'
df = pd.read_csv('train.csv')

In [7]:
# Create dataframe with only neutral comments (not labeled for any of the toxic category)

df['total']= df.toxic + df.severe_toxic + df.obscene + df.threat + df.insult + df.identity_hate 
df_neutral = df[df.total==0]

In [8]:
# Based on dataframe with only neutral comments, generate a list of unique neutral words (5-word)

neutral_list_5 = []
    
for num in range(len(df_neutral.comment_text)):

    place_holder = []

    for word in wpt.tokenize(df_neutral.comment_text.iloc[num]):

        word_low = word.lower()
        place_holder.append(word_low)
        
        if len(place_holder) == 5:
            phrase = " ".join(place_holder)  
            
            neutral_list_5.append(phrase)            

            del place_holder[0]  

neutral_list_5_unique = []

for pair in list(Counter(neutral_list_5).most_common(1000)):
    neutral_list_5_unique.append(pair[0])

In [9]:
# Based on dataframe with only neutral comments, generate a list of unique neutral words (6-word)

neutral_list_6 = []
    
for num in range(len(df_neutral.comment_text)):

    place_holder = []

    for word in wpt.tokenize(df_neutral.comment_text.iloc[num]):

        word_low = word.lower()
        place_holder.append(word_low)
        
        if len(place_holder) == 6:
            phrase = " ".join(place_holder)  
            
            neutral_list_6.append(phrase)            

            del place_holder[0]  

neutral_list_6_unique = []

for pair in list(Counter(neutral_list_6).most_common(1000)):
    neutral_list_6_unique.append(pair[0])

In [10]:
# Create dataframes containing toxic comments based on each type of labeled toxic category. 
# Comments labeled/categorized as multiple toxic categories are not accounted for at this point.
df_toxic = df[df.toxic == 1]
df_severe_toxic = df[df.severe_toxic == 1]
df_obscene = df[df.obscene == 1]
df_threat = df[df.threat == 1]
df_insult = df[df.insult == 1]
df_identity_hate = df[df.identity_hate == 1]

In [11]:
# Approach --- for each toxic category, the comments would contain both toxic words + neutral words. By removing
# the neutral words generated by neutral comments, the toxic words would remain.
# Example, for each toxic category, [toxic words + neutral words] - [neutral words] = toxic words

In [12]:
# Toxic words can be 1 word (single-word) or phrasese (2- , 3-, 4-words)

In [13]:
# Function to generate toxic words by 5-word and count the frequencies.

def word_count_5(dataframe,name):
    
    global_list_cnt = []

    for num in range(len(dataframe.comment_text)):

        place_holder = []
        local_list_cnt = []

        for word in wpt.tokenize(dataframe.comment_text.iloc[num]):

            word_low = word.lower()

            #place_holder.append(p_stem.stem(word_low))
            place_holder.append(word_low)

            if len(place_holder) == 5:
                phrase = " ".join(place_holder)  

                # Taking out the neutral 5-word phrase
                if phrase not in neutral_list_5_unique:

                    if phrase not in local_list_cnt:
                        local_list_cnt.append(phrase)

                    del place_holder[0]     

        global_list_cnt.extend(local_list_cnt)

    list_cnt = Counter(global_list_cnt)
    
    df_wcnt = pd.DataFrame.from_dict(dict(list_cnt),orient='index',columns=['count'])
    df_wcnt['% of Total'] = round(df_wcnt['count'] / df_wcnt['count'].sum() * 100,2)

    rank = []
    for num in range(20):
        rank.append(num+1)    
   
    df_wcnt = df_wcnt.sort_values('count',ascending=False).head(20).reset_index()
    df_wcnt['rank']=rank
    df_wcnt_ranked = df_wcnt.set_index('rank')
    df_wcnt_ranked = df_wcnt_ranked.rename(columns={"index": name})    
    
    return (df_wcnt_ranked)

In [14]:
# Function to generate toxic words by 6-word and count the frequencies.

def word_count_6(dataframe,name):
    
    global_list_cnt = []

    for num in range(len(dataframe.comment_text)):

        place_holder = []
        local_list_cnt = []

        for word in wpt.tokenize(dataframe.comment_text.iloc[num]):

            word_low = word.lower()

            #place_holder.append(p_stem.stem(word_low))
            place_holder.append(word_low)

            if len(place_holder) == 6:
                phrase = " ".join(place_holder)  

                # Taking out the neutral 6-word phrase
                if phrase not in neutral_list_6_unique:

                    if phrase not in local_list_cnt:
                        local_list_cnt.append(phrase)

                    del place_holder[0]     

        global_list_cnt.extend(local_list_cnt)

    list_cnt = Counter(global_list_cnt)
    
    df_wcnt = pd.DataFrame.from_dict(dict(list_cnt),orient='index',columns=['count'])
    df_wcnt['% of Total'] = round(df_wcnt['count'] / df_wcnt['count'].sum() * 100,2)

    rank = []
    for num in range(20):
        rank.append(num+1)    
   
    df_wcnt = df_wcnt.sort_values('count',ascending=False).head(20).reset_index()
    df_wcnt['rank']=rank
    df_wcnt_ranked = df_wcnt.set_index('rank')
    df_wcnt_ranked = df_wcnt_ranked.rename(columns={"index": name})    
    
    return (df_wcnt_ranked)

In [15]:
# Execute function to generate toxic words by 5-word count
df_toxic_5wcnt = word_count_5(df_toxic,'toxic')
df_severe_toxic_5wcnt = word_count_5(df_severe_toxic,'severe_toxic')
df_obscene_5wcnt = word_count_5(df_obscene,'obscene')
df_threat_5wcnt = word_count_5(df_threat,'threat')
df_insult_5wcnt = word_count_5(df_insult,'insult')
df_identity_hate_5wcnt = word_count_5(df_identity_hate,'identity_hate')

In [16]:
# Execute function to generate toxic words by 6-word count
df_toxic_6wcnt = word_count_6(df_toxic,'toxic')
df_severe_toxic_6wcnt = word_count_6(df_severe_toxic,'severe_toxic')
df_obscene_6wcnt = word_count_6(df_obscene,'obscene')
df_threat_6wcnt = word_count_6(df_threat,'threat')
df_insult_6wcnt = word_count_6(df_insult,'insult')
df_identity_hate_6wcnt = word_count_6(df_identity_hate,'identity_hate')

In [17]:
# Combine the results of all six toxic categories in one table
master_list_5wc = pd.concat([df_toxic_5wcnt, df_severe_toxic_5wcnt,df_obscene_5wcnt,df_threat_5wcnt,df_insult_5wcnt,df_identity_hate_5wcnt],axis=1)
master_list_6wc = pd.concat([df_toxic_6wcnt, df_severe_toxic_6wcnt,df_obscene_6wcnt,df_threat_6wcnt,df_insult_6wcnt,df_identity_hate_6wcnt],axis=1)

In [18]:
# Results of toxic word count by 5-word
master_list_5wc

Unnamed: 0_level_0,toxic,count,% of Total,severe_toxic,count,% of Total,obscene,count,% of Total,threat,count,% of Total,insult,count,% of Total,identity_hate,count,% of Total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,don ' t give a,53,0.01,you son of a bitch,17,0.04,you ' re a fucking,46,0.02,i am going to kill,7,0.04,you ' re a fucking,43,0.02,. 28 . 54 .,6,0.01
2,you ' re a fucking,49,0.01,you ' re a fucking,15,0.04,don ' t give a,36,0.01,am going to kill you,7,0.04,", you ' re a",33,0.01,28 . 54 . 73,6,0.01
3,do you think you are,49,0.01,don ' t give a,8,0.02,do you think you are,34,0.01,i know where you live,6,0.04,you son of a bitch,33,0.01,. why don ' t,5,0.01
4,. you ' re a,45,0.01,"son of a bitch ,",7,0.02,you son of a bitch,33,0.01,i hope you die in,6,0.04,. you ' re a,30,0.01,you ' re a fucking,5,0.01
5,", you ' re a",42,0.01,do you think you are,7,0.02,", you ' re a",28,0.01,and i hope you die,6,0.04,do you think you are,27,0.01,", you ' re a",5,0.01
6,. why don ' t,39,0.01,"motherfucker , go fuck ur",6,0.01,. you ' re a,28,0.01,in the head and laugh,5,0.03,you ' re an idiot,26,0.01,nigger nigger nigger nigger nigger,5,0.01
7,", why don ' t",38,0.01,", motherfucker , go fuck",6,0.01,", why don ' t",27,0.01,am going to shoot you,5,0.03,. why don ' t,25,0.01,. i ' m a,4,0.01
8,you son of a bitch,33,0.01,"fuck ur mothers cunt ,",6,0.01,i don ' t give,25,0.01,come to your house and,5,0.03,don ' t give a,24,0.01,you son of a bitch,4,0.01
9,i don ' t give,32,0.01,go fuck ur mothers cunt,6,0.01,. why don ' t,24,0.01,going to shoot you in,5,0.03,", why don ' t",20,0.01,", why don ' t",4,0.01
10,", you don ' t",30,0.01,", go fuck ur mothers",6,0.01,don ' t be a,21,0.01,to shoot you in the,5,0.03,don ' t be a,18,0.01,you are german cock sucker,4,0.01


In [19]:
# Results of toxic word count by 6-word
master_list_6wc

Unnamed: 0_level_0,toxic,count,% of Total,severe_toxic,count,% of Total,obscene,count,% of Total,threat,count,% of Total,insult,count,% of Total,identity_hate,count,% of Total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,i don ' t give a,68,0.01,i don ' t give a,12,0.03,i don ' t give a,56,0.02,i am going to kill you,6,0.03,i don ' t give a,33,0.01,. why don ' t you,6,0.01
2,", why don ' t you",43,0.01,"motherfucker , go fuck ur mothers",6,0.01,don ' t give a shit,30,0.01,going to shoot you in the,5,0.03,. why don ' t you,26,0.01,. 28 . 54 . 73,6,0.01
3,. why don ' t you,41,0.01,"bitch , asshole , motherfucker ,",6,0.01,", why don ' t you",30,0.01,your brains splatter onto the ground,5,0.03,", why don ' t you",23,0.01,nigger nigger nigger nigger nigger nigger,5,0.01
4,don ' t give a shit,35,0.01,", motherfucker , go fuck ur",6,0.01,. why don ' t you,26,0.01,as your brains splatter onto the,5,0.03,why don ' t you go,16,0.01,why don ' t you go,5,0.01
5,why don ' t you go,24,0.0,"asshole , motherfucker , go fuck",6,0.01,don ' t give a fuck,19,0.01,laugh as your brains splatter onto,5,0.03,don ' t give a shit,15,0.01,211 . 28 . 54 .,4,0.01
6,", i ' m going to",22,0.0,"fuck ur mothers cunt , trade",6,0.01,why don ' t you go,14,0.0,and laugh as your brains splatter,5,0.03,don ' t give a fuck,14,0.0,", why don ' t you",4,0.01
7,. i don ' t care,21,0.0,"ur mothers cunt , trade ur",6,0.01,. i don ' t give,14,0.0,head and laugh as your brains,5,0.03,you ' re an idiot .,14,0.0,have ! i wish they were,3,0.01
8,don ' t give a fuck,20,0.0,"mothers cunt , trade ur wife",6,0.01,the fuck do you think you,14,0.0,the head and laugh as your,5,0.03,why don ' t you just,12,0.0,cock sucker and fucker mother fucker,3,0.01
9,who do you think you are,20,0.0,", go fuck ur mothers cunt",6,0.01,don ' t be a dick,13,0.0,in the head and laugh as,5,0.03,", aren ' t you ?",11,0.0,were in hell with the other,3,0.01
10,do you think you are ?,18,0.0,", asshole , motherfucker , go",6,0.01,fuck do you think you are,12,0.0,shoot you in the head and,5,0.03,the fuck do you think you,10,0.0,they were in hell with the,3,0.01
