In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# remove '\\n'
train['comment_text'] = train['comment_text'].map(lambda x: re.sub('\\n',' ',str(x)))
    
# remove any text starting with User... 
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
# remove IP addresses or user IDs
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

In [5]:
def remove_all_digits(comment):
    result = ''.join([i for i in comment if not i.isdigit()])
    return result

In [6]:
train['comment_text'] = train['comment_text'].apply(remove_all_digits)

In [9]:
train['comment_text'] = train['comment_text'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [10]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,000113f07ec002fd,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,More I cant make any real suggestions on impr...,0,0,0,0,0,0
4,0001d958c54c6e35,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0


In [11]:
train['total_chars'] = train['comment_text'].map(lambda x: len(x))

In [12]:
train['total_chars'].sum()

59443622

In [26]:
testcomment = train['comment_text'][1011]
testcomment

'CNN says mother died   Its on their home page'

In [27]:
doc = nlp(testcomment)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

CNN cnn PROPN NNP nsubj XXX True False
says say VERB VBZ ROOT xxxx True False
mother mother NOUN NN nsubj xxxx True False
died die VERB VBD ccomp xxxx True False
      SPACE _SP     False False
Its -PRON- ADJ PRP$ dobj Xxx True False
on on ADP IN prep xx True True
their -PRON- ADJ PRP$ poss xxxx True True
home home NOUN NN compound xxxx True False
page page NOUN NN pobj xxxx True False


In [None]:
lems = [token.lemma_ for token in doc if (token.lemma_ != '-PRON-' and token.is_stop==False and 
                                   token.is_punct==False and token.is_digit==False and token.is_space==False 
                                          and not token.lemma_.startswith("'"))]
lems

In [None]:
tfid = TfidfVectorizer(stop_words='english', token_pattern="([a-zA-Z]+(?:'[a-z]+)?)")
test = tfid.fit_transform(lems)
tfid.get_feature_names()

In [28]:
def lemmatize(comment):
    lems = [token.lemma_ for token in comment if (token.lemma_ != '-PRON-' and token.is_stop==False and 
                                   token.is_punct==False and token.is_digit==False and token.is_space==False 
                                          and not token.lemma_.startswith("'"))]
    return lems

In [43]:
test_bunch = train['comment_text'].sample(1000)
test_bunch_nlp = map(nlp, test_bunch)
test_bunch_nlp_lem = map(lemmatize, test_bunch_nlp)
test_bunch_nlp_lem_joined = map(' '.join, test_bunch_nlp_lem)

In [44]:
tfid = TfidfVectorizer(stop_words='english')
tfid.fit(test_bunch_nlp_lem_joined)
tfid.get_feature_names()

['aafia',
 'aaron',
 'ab',
 'abbreviate',
 'abbreviation',
 'aberrantly',
 'abeter',
 'abhishekitmtruth',
 'abhorrent',
 'ability',
 'able',
 'abominable',
 'aboriginal',
 'aborigines',
 'abortion',
 'abovein',
 'abram',
 'abrasive',
 'absalom',
 'absence',
 'absent',
 'absolom',
 'absolute',
 'absolutely',
 'absolutist',
 'absurd',
 'absurdity',
 'abu',
 'abundantly',
 'abuse',
 'abusive',
 'academe',
 'academia',
 'academic',
 'academicians',
 'academies',
 'academy',
 'acceleration',
 'accept',
 'acceptable',
 'acceptance',
 'access',
 'accessable',
 'accessible',
 'accident',
 'accidental',
 'accidentally',
 'acclaim',
 'accommodate',
 'accomodat',
 'accomplished',
 'accomplishment',
 'accord',
 'accordance',
 'accordingly',
 'account',
 'accounting',
 'accsuation',
 'accuracy',
 'accurate',
 'accusation',
 'accuse',
 'accuser',
 'acetylene',
 'achieve',
 'achievement',
 'acknowledge',
 'acknowledgement',
 'acolyte',
 'acquaint',
 'acquire',
 'acquisition',
 'acronym',
 'act',
 'ac