In [88]:
import nltk
import pandas as pd
import numpy as np
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [89]:
wpt = WordPunctTokenizer()
tbwt = TreebankWordTokenizer()
p_stem = PorterStemmer()
w_lem = WordNetLemmatizer()

In [90]:
# Load training dataset to dataframe 'df'
df = pd.read_csv('train.csv')

In [91]:
len(df)

159571

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [93]:
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [94]:
# Create dataframe with only neutral comments (not labeled for any of the toxic category)

df['total']= df.toxic + df.severe_toxic + df.obscene + df.threat + df.insult + df.identity_hate 
df_neutral = df[df.total==0]

In [95]:
# Based on dataframe with only neutral comments, generate a list of unique neutral words (1-word)

neutral_list_1 = []
    
for num in range(len(df_neutral.comment_text)):

    place_holder = []

    for word in wpt.tokenize(df_neutral.comment_text.iloc[num]):

        word_low = word.lower()
        place_holder.append(word_low)
        
        if len(place_holder) == 1:
            phrase = " ".join(place_holder)  
            
            neutral_list_1.append(phrase)            

            del place_holder[0]  

neutral_list_1_unique = []

for pair in list(Counter(neutral_list_1).most_common(1000)):
    neutral_list_1_unique.append(pair[0])

In [96]:
# Based on dataframe with only neutral comments, generate a list of unique neutral words (2-word)

neutral_list_2 = []
    
for num in range(len(df_neutral.comment_text)):

    place_holder = []

    for word in wpt.tokenize(df_neutral.comment_text.iloc[num]):

        word_low = word.lower()
        place_holder.append(word_low)
        
        if len(place_holder) == 2:
            phrase = " ".join(place_holder)  
            
            neutral_list_2.append(phrase)            

            del place_holder[0]  

neutral_list_2_unique = []

for pair in list(Counter(neutral_list_2).most_common(1000)):
    neutral_list_2_unique.append(pair[0])

In [97]:
# Based on dataframe with only neutral comments, generate a list of unique neutral words (3-word)

neutral_list_3 = []
    
for num in range(len(df_neutral.comment_text)):

    place_holder = []

    for word in wpt.tokenize(df_neutral.comment_text.iloc[num]):

        word_low = word.lower()
        place_holder.append(word_low)
        
        if len(place_holder) == 3:
            phrase = " ".join(place_holder)  
            
            neutral_list_3.append(phrase)            

            del place_holder[0]  

neutral_list_3_unique = []

for pair in list(Counter(neutral_list_3).most_common(1000)):
    neutral_list_3_unique.append(pair[0])

In [98]:
# Based on dataframe with only neutral comments, generate a list of unique neutral words (4-word)

neutral_list_4 = []
    
for num in range(len(df_neutral.comment_text)):

    place_holder = []

    for word in wpt.tokenize(df_neutral.comment_text.iloc[num]):

        word_low = word.lower()
        place_holder.append(word_low)
        
        if len(place_holder) == 4:
            phrase = " ".join(place_holder)  
            
            neutral_list_4.append(phrase)            

            del place_holder[0]  

neutral_list_4_unique = []

for pair in list(Counter(neutral_list_4).most_common(1000)):
    neutral_list_4_unique.append(pair[0])

In [99]:
# Create dataframes containing toxic comments based on each type of labeled toxic category. 
# Comments labeled/categorized as multiple toxic categories are not accounted for at this point.
df_toxic = df[df.toxic == 1]
df_severe_toxic = df[df.severe_toxic == 1]
df_obscene = df[df.obscene == 1]
df_threat = df[df.threat == 1]
df_insult = df[df.insult == 1]
df_identity_hate = df[df.identity_hate == 1]

In [100]:
# Approach --- for each toxic category, the comments would contain both toxic words + neutral words. By removing
# the neutral words generated by neutral comments, the toxic words would remain.
# Example, for each toxic category, [toxic words + neutral words] - [neutral words] = toxic words

In [101]:
# Toxic words can be 1 word (single-word) or phrasese (2- , 3-, 4-words)

In [102]:
# Function to generate toxic words by 1-word and count the frequencies.

def word_count_1(dataframe,name):
    
    global_list_cnt = []

    for num in range(len(dataframe.comment_text)):

        place_holder = []
        local_list_cnt = []

        for word in wpt.tokenize(dataframe.comment_text.iloc[num]):

            word_low = word.lower()

            # Taking out the neutral words (only for 1-word count)
            if word_low not in neutral_list_1_unique:

                place_holder.append(p_stem.stem(word_low))
                #place_holder.append(word_low)

                if len(place_holder) == 1:
                    phrase = " ".join(place_holder)  

                    if phrase not in local_list_cnt:
                        local_list_cnt.append(phrase)

                    del place_holder[0]     

        global_list_cnt.extend(local_list_cnt)

    list_cnt = Counter(global_list_cnt)
    
    df_wcnt = pd.DataFrame.from_dict(dict(list_cnt),orient='index',columns=['count'])
    df_wcnt['% of Total'] = round(df_wcnt['count'] / df_wcnt['count'].sum() * 100,2)

    rank = []
    for num in range(20):
        rank.append(num+1)    
   
    df_wcnt = df_wcnt.sort_values('count',ascending=False).head(20).reset_index()
    df_wcnt['rank']=rank
    df_wcnt_ranked = df_wcnt.set_index('rank')
    df_wcnt_ranked = df_wcnt_ranked.rename(columns={"index": name})    
    
    return (df_wcnt_ranked)

In [103]:
# Function to generate toxic words by 2-word and count the frequencies.

def word_count_2(dataframe,name):
    
    global_list_cnt = []

    for num in range(len(dataframe.comment_text)):

        place_holder = []
        local_list_cnt = []

        for word in wpt.tokenize(dataframe.comment_text.iloc[num]):

            word_low = word.lower()

            #place_holder.append(p_stem.stem(word_low))
            place_holder.append(word_low)

            if len(place_holder) == 2:
                phrase = " ".join(place_holder)  

                # Taking out the neutral 2-word phrase
                if phrase not in neutral_list_2_unique:

                    if phrase not in local_list_cnt:
                        local_list_cnt.append(phrase)

                    del place_holder[0]     

        global_list_cnt.extend(local_list_cnt)

    list_cnt = Counter(global_list_cnt)
    
    df_wcnt = pd.DataFrame.from_dict(dict(list_cnt),orient='index',columns=['count'])
    df_wcnt['% of Total'] = round(df_wcnt['count'] / df_wcnt['count'].sum() * 100,2)

    rank = []
    for num in range(20):
        rank.append(num+1)    
   
    df_wcnt = df_wcnt.sort_values('count',ascending=False).head(20).reset_index()
    df_wcnt['rank']=rank
    df_wcnt_ranked = df_wcnt.set_index('rank')
    df_wcnt_ranked = df_wcnt_ranked.rename(columns={"index": name})    
    
    return (df_wcnt_ranked)

In [104]:
# Function to generate toxic words by 3-word and count the frequencies.

def word_count_3(dataframe,name):
    
    global_list_cnt = []

    for num in range(len(dataframe.comment_text)):

        place_holder = []
        local_list_cnt = []

        for word in wpt.tokenize(dataframe.comment_text.iloc[num]):

            word_low = word.lower()

            #place_holder.append(p_stem.stem(word_low))
            place_holder.append(word_low)

            if len(place_holder) == 3:
                phrase = " ".join(place_holder)  

                # Taking out the neutral 3-word phrase
                if phrase not in neutral_list_3_unique:

                    if phrase not in local_list_cnt:
                        local_list_cnt.append(phrase)

                    del place_holder[0]     

        global_list_cnt.extend(local_list_cnt)

    list_cnt = Counter(global_list_cnt)
    
    df_wcnt = pd.DataFrame.from_dict(dict(list_cnt),orient='index',columns=['count'])
    df_wcnt['% of Total'] = round(df_wcnt['count'] / df_wcnt['count'].sum() * 100,2)

    rank = []
    for num in range(20):
        rank.append(num+1)    
   
    df_wcnt = df_wcnt.sort_values('count',ascending=False).head(20).reset_index()
    df_wcnt['rank']=rank
    df_wcnt_ranked = df_wcnt.set_index('rank')
    df_wcnt_ranked = df_wcnt_ranked.rename(columns={"index": name})    
    
    return (df_wcnt_ranked)

In [105]:
# Function to generate toxic words by 4-word and count the frequencies.

def word_count_4(dataframe,name):
    
    global_list_cnt = []

    for num in range(len(dataframe.comment_text)):

        place_holder = []
        local_list_cnt = []

        for word in wpt.tokenize(dataframe.comment_text.iloc[num]):

            word_low = word.lower()

            #place_holder.append(p_stem.stem(word_low))
            place_holder.append(word_low)

            if len(place_holder) == 4:
                phrase = " ".join(place_holder)  

                # Taking out the neutral 3-word phrase
                if phrase not in neutral_list_4_unique:

                    if phrase not in local_list_cnt:
                        local_list_cnt.append(phrase)

                    del place_holder[0]     

        global_list_cnt.extend(local_list_cnt)

    list_cnt = Counter(global_list_cnt)
    
    df_wcnt = pd.DataFrame.from_dict(dict(list_cnt),orient='index',columns=['count'])
    df_wcnt['% of Total'] = round(df_wcnt['count'] / df_wcnt['count'].sum() * 100,2)

    rank = []
    for num in range(20):
        rank.append(num+1)    
   
    df_wcnt = df_wcnt.sort_values('count',ascending=False).head(20).reset_index()
    df_wcnt['rank']=rank
    df_wcnt_ranked = df_wcnt.set_index('rank')
    df_wcnt_ranked = df_wcnt_ranked.rename(columns={"index": name})    
    
    return (df_wcnt_ranked)

In [106]:
# Execute function to generate toxic words by 1-word count
df_toxic_1wcnt = word_count_1(df_toxic,'toxic')
df_severe_toxic_1wcnt = word_count_1(df_severe_toxic,'severe_toxic')
df_obscene_1wcnt = word_count_1(df_obscene,'obscene')
df_threat_1wcnt = word_count_1(df_threat,'threat')
df_insult_1wcnt = word_count_1(df_insult,'insult')
df_identity_hate_1wcnt = word_count_1(df_identity_hate,'identity_hate')

In [107]:
# Execute function to generate toxic words by 2-word count
df_toxic_2wcnt = word_count_2(df_toxic,'toxic')
df_severe_toxic_2wcnt = word_count_2(df_severe_toxic,'severe_toxic')
df_obscene_2wcnt = word_count_2(df_obscene,'obscene')
df_threat_2wcnt = word_count_2(df_threat,'threat')
df_insult_2wcnt = word_count_2(df_insult,'insult')
df_identity_hate_2wcnt = word_count_2(df_identity_hate,'identity_hate')   

In [108]:
# Execute function to generate toxic words by 3-word count
df_toxic_3wcnt = word_count_3(df_toxic,'toxic')
df_severe_toxic_3wcnt = word_count_3(df_severe_toxic,'severe_toxic')
df_obscene_3wcnt = word_count_3(df_obscene,'obscene')
df_threat_3wcnt = word_count_3(df_threat,'threat')
df_insult_3wcnt = word_count_3(df_insult,'insult')
df_identity_hate_3wcnt = word_count_3(df_identity_hate,'identity_hate')

In [109]:
# Execute function to generate toxic words by 4-word count
df_toxic_4wcnt = word_count_4(df_toxic,'toxic')
df_severe_toxic_4wcnt = word_count_4(df_severe_toxic,'severe_toxic')
df_obscene_4wcnt = word_count_4(df_obscene,'obscene')
df_threat_4wcnt = word_count_4(df_threat,'threat')
df_insult_4wcnt = word_count_4(df_insult,'insult')
df_identity_hate_4wcnt = word_count_4(df_identity_hate,'identity_hate')

In [110]:
# Combine the results of all six toxic categories in one table
master_list_1wc = pd.concat([df_toxic_1wcnt, df_severe_toxic_1wcnt,df_obscene_1wcnt,df_threat_1wcnt,df_insult_1wcnt,df_identity_hate_1wcnt],axis=1)
master_list_2wc = pd.concat([df_toxic_2wcnt, df_severe_toxic_2wcnt,df_obscene_2wcnt,df_threat_2wcnt,df_insult_2wcnt,df_identity_hate_2wcnt],axis=1)
master_list_3wc = pd.concat([df_toxic_3wcnt, df_severe_toxic_3wcnt,df_obscene_3wcnt,df_threat_3wcnt,df_insult_3wcnt,df_identity_hate_3wcnt],axis=1)
master_list_4wc = pd.concat([df_toxic_4wcnt, df_severe_toxic_4wcnt,df_obscene_4wcnt,df_threat_4wcnt,df_insult_4wcnt,df_identity_hate_4wcnt],axis=1)

In [111]:
# Results of toxic word count by 1-word
master_list_1wc

Unnamed: 0_level_0,toxic,count,% of Total,severe_toxic,count,% of Total,obscene,count,% of Total,threat,count,% of Total,insult,count,% of Total,identity_hate,count,% of Total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,fuck,3719,2.32,fuck,972,6.9,fuck,3581,4.19,fuck,148,3.02,fuck,2734,3.5,fuck,473,3.0
2,shit,1298,0.81,shit,258,1.83,shit,1112,1.3,kill,130,2.66,shit,839,1.07,gay,276,1.75
3,stupid,1011,0.63,bitch,247,1.75,ass,828,0.97,die,123,2.51,bitch,701,0.9,faggot,179,1.13
4,suck,940,0.59,suck,215,1.53,suck,795,0.93,shit,58,1.18,ass,672,0.86,nigger,163,1.03
5,ass,930,0.58,ass,194,1.38,bitch,767,0.9,ass,56,1.14,stupid,665,0.85,shit,163,1.03
6,idiot,810,0.5,asshol,155,1.1,asshol,585,0.68,rape,40,0.82,suck,662,0.85,ass,130,0.82
7,bitch,793,0.49,dick,150,1.07,stupid,556,0.65,bitch,39,0.8,idiot,610,0.78,bitch,125,0.79
8,asshol,626,0.39,cunt,127,0.9,dick,511,0.6,hell,34,0.69,asshol,529,0.68,suck,93,0.59
9,hell,599,0.37,faggot,121,0.86,cunt,451,0.53,!!!,31,0.63,dick,411,0.53,hate,89,0.56
10,dick,568,0.35,!!!,96,0.68,faggot,413,0.48,piec,30,0.61,faggot,410,0.53,stupid,88,0.56


In [112]:
# Results of toxic word count by 2-word
master_list_2wc

Unnamed: 0_level_0,toxic,count,% of Total,severe_toxic,count,% of Total,obscene,count,% of Total,threat,count,% of Total,insult,count,% of Total,identity_hate,count,% of Total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,fuck you,492,0.8,fuck you,245,2.98,fuck you,492,1.35,fuck you,20,1.2,fuck you,452,1.34,fuck you,74,1.15
2,fuck off,171,0.28,fuck off,65,0.79,fuck off,170,0.47,of shit,9,0.54,fuck off,132,0.39,go fuck,21,0.33
3,""" you",149,0.24,go fuck,59,0.72,go fuck,132,0.36,piece of,8,0.48,go fuck,125,0.37,you fucking,20,0.31
4,go fuck,132,0.21,fuck yourself,50,0.61,fuck yourself,102,0.28,you fucking,8,0.48,fuck yourself,98,0.29,fuck off,19,0.29
5,fuck yourself,102,0.17,suck my,46,0.56,the fuck,90,0.25,! if,7,0.42,you fucking,83,0.25,is gay,18,0.28
6,""" """"",95,0.15,you fucking,40,0.49,you fucking,85,0.23,and die,6,0.36,you suck,73,0.22,piece of,17,0.26
7,the fuck,91,0.15,piece of,33,0.4,you suck,80,0.22,death to,6,0.36,your a,71,0.21,fuck yourself,15,0.23
8,i hate,90,0.15,you you,32,0.39,suck my,78,0.21,im going,6,0.36,suck my,70,0.21,your a,14,0.22
9,you fucking,86,0.14,the fuck,31,0.38,of shit,74,0.2,kill yourself,5,0.3,piece of,68,0.2,of shit,14,0.22
10,you suck,82,0.13,of shit,31,0.38,piece of,70,0.19,i hate,5,0.3,of shit,64,0.19,faggot .,13,0.2


In [113]:
# Results of toxic word count by 3-word
master_list_3wc

Unnamed: 0_level_0,toxic,count,% of Total,severe_toxic,count,% of Total,obscene,count,% of Total,threat,count,% of Total,insult,count,% of Total,identity_hate,count,% of Total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,go fuck yourself,141,0.07,go fuck yourself,57,0.28,go fuck yourself,139,0.13,am going to,17,0.24,go fuck yourself,128,0.12,piece of shit,21,0.1
2,get a life,121,0.06,piece of shit,47,0.23,piece of shit,113,0.1,i am going,14,0.2,piece of shit,106,0.1,go fuck yourself,18,0.09
3,piece of shit,118,0.06,"fuck you ,",30,0.15,what the fuck,92,0.08,piece of shit,12,0.17,get a life,86,0.08,you are gay,13,0.06
4,what the fuck,97,0.05,son of a,28,0.14,get a life,85,0.08,going to kill,10,0.14,"fuck you ,",70,0.07,get a life,13,0.06
5,why don ',81,0.04,fuck you you,28,0.14,"fuck you ,",73,0.07,will kill you,9,0.13,son of a,68,0.07,", are you",11,0.05
6,son of a,76,0.04,of a bitch,26,0.13,son of a,71,0.06,i will kill,9,0.13,of a bitch,61,0.06,a bunch of,11,0.05
7,what the hell,73,0.04,suck my dick,25,0.12,of a bitch,62,0.06,to kill you,9,0.13,what the fuck,56,0.05,is a gay,11,0.05
8,"fuck you ,",73,0.04,get a life,21,0.1,fuck you .,60,0.05,im going to,9,0.13,why don ',55,0.05,son of a,11,0.05
9,why are you,70,0.03,fuck off you,20,0.1,why don ',54,0.05,! if you,7,0.1,fuck you .,54,0.05,"fuck you ,",10,0.05
10,i hate you,69,0.03,the fuck up,19,0.09,of shit .,50,0.05,or i will,7,0.1,of shit .,44,0.04,of a bitch,9,0.04


In [114]:
# Results of toxic word count by 4-word
master_list_4wc

Unnamed: 0_level_0,toxic,count,% of Total,severe_toxic,count,% of Total,obscene,count,% of Total,threat,count,% of Total,insult,count,% of Total,identity_hate,count,% of Total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,you ' re a,186,0.05,you ' re a,34,0.1,you ' re a,127,0.06,i will kill you,13,0.11,you ' re a,145,0.07,you ' re a,20,0.05
2,. you are a,79,0.02,son of a bitch,30,0.09,son of a bitch,74,0.04,i hope you die,11,0.09,son of a bitch,70,0.04,. you are a,11,0.03
3,son of a bitch,75,0.02,shut the fuck up,23,0.07,piece of shit .,57,0.03,to your house and,7,0.06,. you are a,63,0.03,son of a bitch,11,0.03
4,piece of shit .,64,0.02,piece of shit .,23,0.07,shut the fuck up,55,0.03,son of a bitch,6,0.05,piece of shit .,55,0.03,piece of shit .,10,0.03
5,", you are a",60,0.01,you son of a,17,0.05,. you are a,46,0.02,hunt you down and,6,0.05,shut the fuck up,45,0.02,", you are a",7,0.02
6,shut the fuck up,55,0.01,fuck you fuck you,17,0.05,you are a fucking,40,0.02,", i am going",5,0.04,", you are a",42,0.02,shut the fuck up,7,0.02
7,you think you are,54,0.01,"fuck you , you",15,0.05,the fuck are you,39,0.02,i know where you,5,0.04,you are a fucking,36,0.02,you are a fucking,7,0.02
8,do you think you,53,0.01,you are a fucking,15,0.05,do you think you,34,0.02,know where you live,5,0.04,you son of a,34,0.02,28 . 54 .,6,0.02
9,you are a fucking,40,0.01,go fuck yourself you,13,0.04,", you are a",34,0.02,you ' re a,5,0.04,you ' re an,33,0.02,. 54 . 73,6,0.02
10,the fuck are you,39,0.01,you piece of shit,12,0.04,you son of a,34,0.02,you are a fucking,5,0.04,you think you are,31,0.02,. 28 . 54,6,0.02
