In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
import spacy
# Load the spaCy pipeline named "en_core_web_sm" once; later cells call `nlp(...)` to parse comments.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the training data from train.csv (path is relative to the working directory)
# and preview the first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Work on a copy so the cleaning steps below leave the raw `train` frame untouched.
train2 = train.copy()

In [4]:
# Patterns are raw strings so regex escapes such as \[ and \d reach `re` unchanged
# instead of being parsed as (invalid) Python string escapes.
NEWLINE_RE = re.compile(r'\n')
# "[[User" and everything after it, up to the end of the (already single-line) comment.
USER_LINK_RE = re.compile(r'\[\[User.*')
# Dotted quads such as 192.168.0.1.
IP_ADDRESS_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
# A link plus the single whitespace character that follows it, if any.
# `https?` also catches https:// links, which a plain http:// pattern misses.
URL_RE = re.compile(r'https?://\S*\s?')


def clean_comment(text):
    """Return `text` with wiki/markup noise removed.

    The value is coerced with ``str`` first, so non-string cells (e.g. NaN)
    are cleaned as their string form rather than raising.

    Steps, in order:
      1. replace every newline with a space;
      2. drop everything from the first ``[[User`` to the end of the comment;
      3. drop dotted-quad sequences (IP addresses);
      4. drop http:// and https:// links together with one trailing
         whitespace character.
    """
    text = str(text)
    text = NEWLINE_RE.sub(' ', text)
    text = USER_LINK_RE.sub('', text)
    text = IP_ADDRESS_RE.sub('', text)
    text = URL_RE.sub('', text)
    return text


# One pass over the column instead of four separate map() calls.
train2['comment_text'] = train2['comment_text'].map(clean_comment)

In [5]:
# Length, in characters, of each cleaned comment.
train2['char_total'] = train2['comment_text'].map(len)

In [6]:
# Preview the cleaned comments together with the new char_total column.
train2.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,char_total
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0,252
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0,622
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67


In [7]:
# Keep only the comments whose cleaned text is shorter than 2000 characters.
no_spams_df = train2.loc[train2['char_total'].lt(2000)]

In [17]:
# Raw text of the comment at index label 92, taken from the untouched `train` frame.
origtestcomment = train.loc[92, 'comment_text']
origtestcomment

'"\nAlmost got me too; I had to look it up to see if it was real. ...  talk "'

In [18]:
# The same comment (index label 92) after cleaning, for comparison with the raw text.
testcomment = train2.loc[92, 'comment_text']
testcomment

'" Almost got me too; I had to look it up to see if it was real. ...  talk "'

In [19]:
# Parse the cleaned test comment and print one line per token showing its text,
# lemma, coarse and fine POS tags, dependency label, shape, and the
# is_alpha / is_stop flags.
doc = nlp(testcomment)
for token in doc:
    token_fields = (
        token.text, token.lemma_, token.pos_, token.tag_,
        token.dep_, token.shape_, token.is_alpha, token.is_stop,
    )
    print(*token_fields)


" " PUNCT `` punct " False False
Almost almost ADV RB advmod Xxxxx True False
got get VERB VBD ccomp xxx True False
me -PRON- PRON PRP dobj xx True True
too too ADV RB advmod xxx True True
; ; PUNCT : punct ; False False
I -PRON- PRON PRP nsubj X True False
had have VERB VBD relcl xxx True True
to to PART TO aux xx True True
look look VERB VB xcomp xxxx True False
it -PRON- PRON PRP dobj xx True True
up up PART RP prt xx True True
to to PART TO aux xx True True
see see VERB VB advcl xxx True True
if if ADP IN mark xx True True
it -PRON- PRON PRP nsubj xx True True
was be VERB VBD ccomp xxx True True
real real ADJ JJ acomp xxxx True False
. . PUNCT . punct . False False
... ... PUNCT NFP punct ... False False
    SPACE     False False
talk talk NOUN NN ROOT xxxx True False
" " PUNCT '' punct " False False


In [20]:
# Collect the lemma of every token in `doc` that is not a pronoun placeholder
# ('-PRON-'), stop word, punctuation, digit string or whitespace, and whose
# lemma does not start with an apostrophe (clitics such as 's).
# Flags are tested with `not` rather than `== False`.
lems = [
    token.lemma_
    for token in doc
    if token.lemma_ != '-PRON-'
    and not token.is_stop
    and not token.is_punct
    and not token.is_digit
    and not token.is_space
    and not token.lemma_.startswith("'")
]
lems

['almost', 'get', 'look', 'real', 'talk']

In [21]:
# Fit a TF-IDF vectorizer on the lemmas of the single test comment (each lemma
# is passed as its own document). Tokens are alphabetic runs with an optional
# apostrophe suffix; 'english' stop words are dropped.
tfid = TfidfVectorizer(
    token_pattern="([a-zA-Z]+(?:'[a-z]+)?)",
    stop_words='english',
)
test = tfid.fit_transform(lems)
tfid.get_feature_names()

['look', 'real', 'talk']

## Test on multiple comments instead of one test comment

In [13]:
def lemmatize(comment):
    """Return the content-bearing lemmas of a parsed comment.

    Parameters
    ----------
    comment : iterable of tokens
        Typically the object returned by ``nlp(text)``. Each token must expose
        ``lemma_``, ``is_stop``, ``is_punct``, ``is_digit`` and ``is_space``.

    Returns
    -------
    list of str
        Lemmas in their original order, skipping tokens that are a pronoun
        placeholder (lemma ``'-PRON-'``), a stop word, punctuation, a digit
        string or whitespace, or whose lemma starts with an apostrophe
        (clitics such as ``'s``).
    """
    return [
        token.lemma_
        for token in comment
        if token.lemma_ != '-PRON-'
        and not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.is_space
        and not token.lemma_.startswith("'")
    ]

In [14]:
# Draw 1000 comments; random_state pins the sample so a rerun reproduces the same rows.
test_bunch = no_spams_df['comment_text'].sample(1000, random_state=42)

# nlp.pipe streams the texts through spaCy in batches instead of one nlp() call per comment.
# Every stage is materialised as a list: a lazy map() chain can be consumed only once,
# so re-running a downstream cell on its own would otherwise see an empty iterator.
test_bunch_nlp = list(nlp.pipe(test_bunch))
test_bunch_nlp_lem = [lemmatize(parsed) for parsed in test_bunch_nlp]
test_bunch_nlp_lem_joined = [' '.join(lemmas) for lemmas in test_bunch_nlp_lem]

In [16]:
# Fit a TF-IDF vocabulary on the lemmatized sample of comments. Tokens are
# alphabetic runs with an optional apostrophe suffix; 'english' stop words are
# dropped.
tfid = TfidfVectorizer(
    token_pattern="([a-zA-Z]+(?:'[a-z]+)?)",
    stop_words='english',
)
tfid.fit(test_bunch_nlp_lem_joined)
tfid.get_feature_names()

['aa',
 'aadmi',
 'aafs',
 'aam',
 'aardvark',
 'abandon',
 'abate',
 'abc',
 'abdul',
 'aberration',
 'abet',
 'abhiramdasji',
 'abhor',
 'abi',
 'abide',
 'ability',
 'able',
 'aboriginal',
 'abortion',
 'abraham',
 'abroad',
 'absence',
 'absent',
 'absolutely',
 'absurd',
 'abuse',
 'abusive',
 'academic',
 'academy',
 'acc',
 'accept',
 'acceptable',
 'acceptance',
 'access',
 'accessory',
 'accident',
 'accidently',
 'accomplice',
 'accomplish',
 'accomplished',
 'accomplishment',
 'accord',
 'accordance',
 'accordingly',
 'account',
 'accounts',
 'accuracy',
 'accurate',
 'accurately',
 'accusation',
 'accuse',
 'ace',
 'achaemenids',
 'ache',
 'achieve',
 'achievement',
 'acknowledge',
 'acount',
 'acquire',
 'acquis',
 'act',
 'action',
 'active',
 'activist',
 'activity',
 'acts',
 'actual',
 'actually',
 'actualy',
 'acually',
 'ad',
 'adam',
 'adams',
 'add',
 'adding',
 'addition',
 'additional',
 'additionally',
 'address',
 'adequately',
 'adhere',
 'adjacent',
 'adjecti