In [1]:
import nltk
import pandas as pd
import numpy as np

In [2]:
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [3]:
# NLP helper objects, created once at module level and shared by later cells.
wpt = WordPunctTokenizer()  # tokenizer used to split each comment in word_count
p_stem = PorterStemmer()  # stemmer applied to every token that word_count keeps
w_lem = WordNetLemmatizer()  # lemmatizer; not called by any active code in the visible cells

In [4]:
# Load the labelled comments; 'train.csv' is resolved relative to the
# notebook's working directory.
df = pd.read_csv('train.csv')

In [5]:
# Number of rows loaded from train.csv.
len(df)

159571

In [109]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [6]:
# Preview the first ten rows: comment text plus the six 0/1 label columns.
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [7]:
# One sub-frame per label, keeping only the rows where that label is set to 1.
# A row with several labels set appears in several of these frames.
df_toxic = df.loc[df['toxic'] == 1]
df_severe_toxic = df.loc[df['severe_toxic'] == 1]
df_obscene = df.loc[df['obscene'] == 1]
df_threat = df.loc[df['threat'] == 1]
df_insult = df.loc[df['insult'] == 1]
df_identity_hate = df.loc[df['identity_hate'] == 1]
# Lower-cased tokens that word_count ignores: a few corpus-specific filler
# words, quote/punctuation tokens, and common English function words
# (including the fragments left behind when contractions are split).
meaningless_words = [
    # corpus-specific filler
    'wikipedia', 'hi', 'like', 'u',
    # quote and punctuation tokens
    '"', '""', "'", '/', ':', '-', '(', ')', '?', '.', ',',
    # common English function words
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
    'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
    'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
    'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
    'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm',
    'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn',
    'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
    'weren', 'won', 'wouldn',
]

In [86]:
def word_count(dataframe, name, top_n=15):
    """Rank the most frequent stemmed tokens in a frame's comments.

    Each value of ``dataframe.comment_text`` is split with the module-level
    ``wpt`` tokenizer and lower-cased. The tokens ``'!!'`` and ``'!!!'`` are
    folded into ``'!'``, tokens listed in ``meaningless_words`` are dropped,
    and every remaining token is stemmed with ``p_stem`` before counting.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame exposing a ``comment_text`` column of strings.
    name : str
        Column label given to the stems in the result.
    top_n : int, default 15
        Maximum number of ranked rows to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``rank`` (1 = most frequent) with the columns ``name``
        (the stem) and ``count``. It holds fewer than ``top_n`` rows, possibly
        none, when the comments contain fewer distinct stems.
    """
    # Set lookup is constant-time; the check runs once per token.
    stop_words = set(meaningless_words)

    stem_counts = Counter()
    for comment in dataframe.comment_text:
        for word in wpt.tokenize(comment):
            word_low = word.lower()
            # Only runs of exactly two or three '!' are folded into '!'; any
            # other token the tokenizer emits is counted unchanged.
            if word_low == '!!' or word_low == '!!!':
                word_low = '!'

            # Filtering happens on the raw lower-cased token, before stemming.
            if word_low not in stop_words:
                stem_counts[p_stem.stem(word_low)] += 1

    df_wcnt = pd.DataFrame.from_dict(dict(stem_counts), orient='index', columns=['count'])
    df_wcnt = df_wcnt.sort_values('count', ascending=False).head(top_n).reset_index()

    # Size the ranks to the rows actually present: a fixed-length rank list
    # raises a length-mismatch error when fewer than top_n stems exist.
    df_wcnt['rank'] = list(range(1, len(df_wcnt) + 1))

    df_wcnt_ranked = df_wcnt.set_index('rank')
    df_wcnt_ranked = df_wcnt_ranked.rename(columns={"index": name})

    return df_wcnt_ranked

In [87]:
# Build one top-token ranking per label. Each pair below is a filtered frame
# and the column label its tokens get in the resulting table.
(
    df_toxic_wcnt,
    df_severe_toxic_wcnt,
    df_obscene_wcnt,
    df_threat_wcnt,
    df_insult_wcnt,
    df_identity_hate_wcnt,
) = (
    word_count(frame, label)
    for frame, label in (
        (df_toxic, 'toxic'),
        (df_severe_toxic, 'severe_toxic'),
        (df_obscene, 'obscene'),
        (df_threat, 'threat'),
        (df_insult, 'insult'),
        (df_identity_hate, 'identity_hate'),
    )
)

In [88]:
# Show the six rankings side by side; rows line up on the shared `rank` index.
pd.concat(
    [
        df_toxic_wcnt,
        df_severe_toxic_wcnt,
        df_obscene_wcnt,
        df_threat_wcnt,
        df_insult_wcnt,
        df_identity_hate_wcnt,
    ],
    axis='columns',
)

Unnamed: 0_level_0,toxic,count,severe_toxic,count,obscene,count,threat,count,insult,count,identity_hate,count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,!,13895,fuck,8801,fuck,13247,die,1178,fuck,10862,nigger,3089
2,fuck,13443,!,5906,!,9578,ass,772,!,9064,jew,1498
3,suck,4806,suck,3522,suck,4196,!,672,suck,3839,fuck,1437
4,go,3813,ass,2120,shit,3233,kill,522,nigger,2892,!,1352
5,shit,3651,shit,1889,ass,2873,go,376,go,2563,fat,1322
6,nigger,3428,faggot,1576,nigger,2839,fuck,312,faggot,2533,gay,939
7,ass,2976,bitch,1544,go,2723,block,188,ass,2231,die,885
8,hate,2763,go,1514,bitch,2285,must,167,bitch,2210,faggot,761
9,faggot,2694,die,1165,faggot,1894,wale,158,fat,1926,suck,523
10,get,2653,cunt,1039,cunt,1786,jim,158,shit,1708,nigga,512
