In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing import text, sequence


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Widen pandas' display limits so long comments and more rows are shown
# when frames are rendered below.
for _option, _value in (('display.max_colwidth', 362), ('display.max_rows', 100)):
    pd.set_option(_option, _value)

In [3]:
# The six label columns; used below as the model targets and to flag
# comments that carry at least one positive label.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [4]:
PATH = '~/data/toxic/data/'

# Load the preprocessed train/test CSVs. The original cell also kept
# commented-out alternatives pointing at the raw files ('train.csv',
# 'test.csv') and the cleaned files ('cleaned_train.csv', 'cleaned_test.csv').
train, test = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train_preprocessed.csv', 'test_preprocessed.csv')
)

In [5]:
# Attach the raw comment (as `comment_text_ori`) and the cleaned comment
# (`comment_text_cleaned`) to every preprocessed training row, joined on `id`.
train_ori = pd.read_csv(PATH + 'train.csv')
train_cleaned = pd.read_csv(PATH + 'cleaned_train.csv')
train = (
    train
    .merge(train_ori[['comment_text', 'id']], on='id', suffixes=('', '_ori'))
    .merge(train_cleaned[['comment_text_cleaned', 'id']], on='id')
)

In [6]:
# Attach the raw comment (as `comment_text_ori`) and the cleaned comment
# (`comment_text_cleaned`) to every preprocessed test row, joined on `id`.
test_ori = pd.read_csv(PATH + 'test.csv')
test_cleaned = pd.read_csv(PATH + 'cleaned_test.csv')
test = (
    test
    .merge(test_ori[['comment_text', 'id']], on='id', suffixes=('', '_ori'))
    .merge(test_cleaned[['comment_text_cleaned', 'id']], on='id')
)

In [7]:
# The preprocessed test file contains the label columns filled with NaN, so
# drop them. Rebinding instead of inplace=True, together with errors='ignore',
# keeps this cell safe to re-run (a second in-place drop would raise KeyError).
test = test.drop(label_cols, axis=1, errors='ignore')

In [8]:
# embeddings_index_lex, embed_size = word2Vec('lex')

# embeddings_index_glc, embed_size = word2Vec('gl-common')

In [9]:
max_features = 100000  # handed to the Tokenizer as `num_words`
maxlen = 150           # handed to pad_sequences as `maxlen`

In [10]:
comment_col = 'comment_text'  # alternative: 'comment_text_cleaned'

# Lower-case the comments; missing ones become 'something', a word of
# neutral sentiment.
X_train = train[comment_col].str.lower().fillna('something')
X_test = test[comment_col].str.lower().fillna('something')

# Reuse `label_cols` rather than repeating the six names, so the target
# column order cannot drift from the rest of the notebook.
y_train = train[label_cols].values

In [11]:
# Fit one tokenizer on train + test together so both share a word index.
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train) + list(X_test))

# Texts -> integer sequences (X_train / X_test are rebound from the text
# Series to the sequence lists), then bring every sequence to `maxlen`.
X_train, X_test = tok.texts_to_sequences(X_train), tok.texts_to_sequences(X_test)
x_train, x_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test)
)

In [12]:
# Number of positive labels per comment (non-zero means at least one label is
# set). Vectorised column sum instead of a Python-level row-by-row apply;
# skipna=False keeps the NaN propagation of the builtin sum() used before.
train['having_pos_label'] = train[label_cols].sum(axis=1, skipna=False)

In [13]:
# Whitespace-tokenise every comment into a list of words; missing comments
# are replaced by the single word 'something' first.
train['splited'] = train['comment_text'].fillna('something').apply(str.split)
test['splited'] = test['comment_text'].fillna('something').apply(str.split)

In [14]:
def found_word(word_list, word):
    """Return True when `word` is among the distinct items of `word_list`.

    `word_list` is converted to a set before the lookup, so its items must
    be hashable.
    """
    return word in set(word_list)

In [15]:
# Store the lower-cased, NaN-filled text back on the test frame ...
test[comment_col] = test[comment_col].str.lower().fillna('something')

# ... and rebuild the token lists from 'comment_text'.
test['splited'] = test['comment_text'].apply(str.split)

In [16]:
def analyze_word(the_word, verbose=True):
    """Summarise where `the_word` occurs in the train and test comments.

    Side effect: (re)writes a boolean ``found`` column on the global ``train``
    and ``test`` frames, marking the rows whose ``splited`` token list
    contains `the_word`. Later cells filter on that column.

    Parameters
    ----------
    the_word : str
        Token to look up.
    verbose : bool, default True
        Print a human-readable summary when True.

    Returns
    -------
    tuple
        ``(appear_total_times, appearsin_comments_in_train, negative_comments,
        negative_ratio, appearsin_comments_in_test)`` where ``negative_ratio``
        is the fraction (0..1) of matching training comments that carry at
        least one positive label, or 0 when the word matches no training
        comment.
    """
    # A word the tokenizer never saw has no entry in word_counts; report 0
    # occurrences instead of raising KeyError.
    appear_total_times = tok.word_counts.get(the_word, 0)

    train['found'] = train['splited'].apply(lambda x: found_word(x, the_word))
    appearsin_comments_in_train = int(train['found'].sum())

    # having_pos_label is a per-row label count; compare against 0 explicitly
    # rather than relying on an implicit float -> bool coercion in `&`.
    has_label = train['having_pos_label'] > 0
    negative_comments = int((train['found'] & has_label).sum())

    if appearsin_comments_in_train != 0:
        negative_ratio = negative_comments / appearsin_comments_in_train
    else:
        negative_ratio = 0

    test['found'] = test['splited'].apply(lambda x: found_word(x, the_word))
    appearsin_comments_in_test = int(test['found'].sum())

    if verbose:
        print('the word "{}" appears {} times in training and testing'.format(the_word, appear_total_times))  # number of times the word appears
        print('it appears in {} comments in training data'.format(appearsin_comments_in_train))  # number of documents the word appears in (train)
        print('{} of these comments have at least one positive labels (negative comment). Negative ratio: {:.3f}%'.format(negative_comments, negative_ratio*100))
        print('it appears in {} comments in testing data'.format(appearsin_comments_in_test))  # number of documents the word appears in (test)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

    return appear_total_times, appearsin_comments_in_train, negative_comments, negative_ratio, appearsin_comments_in_test

In [61]:
# Look at the training comments that contain one specific word.
# analyze_word() refreshes the `found` column that the filter below uses.
the_word = "buncha"
analyze_word(the_word)
train.loc[train.found, ['comment_text_ori'] + label_cols].head(10)

the word "buncha" appears 12 times in training and testing
it appears in 1 comments in training data
1 of these comments have at least one positive labels (negative comment). Negative ratio: 100.000%
it appears in 11 comments in testing data
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$


Unnamed: 0,comment_text_ori,toxic,severe_toxic,obscene,threat,insult,identity_hate
51506,The TNA fags need to stop \n\nBuncha pussies,1.0,0.0,1.0,0.0,1.0,0.0


In [62]:
# Test comments flagged by the last analyze_word() call, in all three text variants.
test.loc[test.found, ['comment_text', 'comment_text_cleaned', 'comment_text_ori']].head(10)

Unnamed: 0,comment_text,comment_text_cleaned,comment_text_ori
13973,campbeltown loch oh campbeltown loch i wish ye we re whisky i whid drink ye dry whilst snivelling intae my british army surplus blanket over my ain ingrained inadequacy ah m a teuchter fuck dinnae criticise meh furrit in mah phoenitical weakness ahm a twat o nationalism an a ll suffer furrit ah d tek a strachur shinty stick against a fuck in pa...,campbeltown loch ... oh campbeltown loch i wish ye we be whisky i whid drink ye dry ... whilst snivel intae my british army surplus blanket over my ain ingrain inadequacy . ah'm a teuchter fuck ... dinnae criticise meh furrit in mah phoenitical weakness ahm a twat o'nationalism an ' a'll suffer furrit . ah'd tek a strachur shinty stick against a fuck ' pan...,"== Campbeltown Loch.... == \n\n Oh, Campbeltown Loch, I wish ye we're whisky, I whid drink ye dry....whilst snivelling intae my British Army surplus blanket over my ain ingrained inadequacy. \n\n Ah'm a teuchter fuck...dinnae criticise meh furrit, in mah phoenitical weakness ahm a twat o'nationalism, an' a'll suffer furrit. Ah'd tek a Strachur shinty stick..."
20589,damnedest weirdest lookin buncha black folks i ve ever seen yikes p,damnedest ( weirdest ) lookin ' buncha black folk i have ever see . yikes ! p,Damnedest (weirdest) lookin' buncha black folks I've ever seen. Yikes! p
61335,ah but you see it s you brits who gave us yanks our congenital distrust of authority back in don t cha know so it s really all your fault we re a buncha paranoid libertarian types grinning ducking and running i am trying to be funny in case anyone misunderstands,: ah but you see it be you brit who give u yank our congenital distrust of authority back in 1776 do not ' cha know ... so it be really all your fault we be a buncha paranoid libertarian type ! ( grin duck and run ... ; ) ) ( i be try to be funny in case anyone misunderstand ... ),""":Ah, but you see, it's you Brits who gave us Yanks our congenital distrust of authority back in 1776, don't 'cha know... so it's really all your fault we're a buncha paranoid libertarian types! (grinning, ducking and running... ;-) ) (I am trying to be funny, in case anyone misunderstands...) \n\n """
68936,yep elinord is one dumb fuck fits in well with the rest of these morons i guess only the borderline retard ed who want to tell themselves they re intelligent set take this place seriously whot a buncha losers,:: yep . elinord be one dumb fuck . fit in well with the rest of these moron i guess . only the borderline retard who want to tell themselves they be intelligent set take this place seriously . whot a buncha loser !,"::Yep. ElinorD is one dumb fuck. Fits in well with the rest of these morons, I guess. Only the borderline-retarded-who-want-to-tell-themselves-they're-intelligent set take this place seriously. Whot a buncha losers!"
70011,it still reads like a buncha faggot ry,: it still read like a buncha faggotry,:it still reads like a buncha faggotry
71852,bitch what the fuck star wars is an epic space opera franchise that s the first fuck ing sentence of this locked article fuck you all i speak only english and have not the first fuck ing idea what any portion of that gibberish means you re all a buncha fuck in gay boy idiot s go fuck yourselves i m simply full to the fuck ing gills with you pig ...,bitch what the fuck ? star war be an epic space opera franchise ? that be the first fuck sentence of this lock article ? fuck you all . i speak only english and have not the first fuck idea what any portion of that gibberish mean . you be all a buncha fuck ' gay boy idiot . go fuck yourselves . i be simply full to the fuck gill with you pig fuck homo who c...,"== bitch, what the fuck? == \n\n Star Wars is an epic space opera franchise? \n\n that's the first fucking sentence of this locked article? fuck you all. i speak only english and have not the first fucking idea what any portion of that gibberish means. you're all a buncha fuckin' gay-boy idiots. go fuck yourselves. i'm simply full to the fucking gills wit..."
112486,damnedest weirdest lookin buncha black folks i ve ever seen yikes p,:d amnedest ( weirdest ) lookin ' buncha black folk i have ever see . yikes ! p,:Damnedest (weirdest) lookin' buncha black folks I've ever seen. Yikes! p
118906,why don t we all just agree to disagree they re just a buncha sand nigger s anyways,why do not we all just agree to disagree ? they be just a buncha sandniggers anyways .,Why don't we all just agree to disagree? They're just a buncha sandniggers anyways.
122626,i think that any of you faggot s talkin shit should at least try to duplicate the skill they have before opening your mouth about how bad they suck buncha lames,i think that any of you faggot talkin shit should at least try to duplicate the skill they have before open your mouth about how bad they suck ! buncha lam,I think that any of you faggots talkin shit should at least try to duplicate the skill they have before opening your mouth about how bad they suck! buncha lames
124187,i think this doesn t befit the granduer of wikipedia i mean are you serious a monkey man this is stupid common are there any right minded scientists out there this is shiz s t yahya al shiddazi and i added this to the article itself lies seems like a buncha lies to me i dink dat dis article needs to be cleaned up in order to meet wikipedi ass ...,i think this do not befit the granduer of wikipedia . i mean be you serious ? a monkey man ? this be stupid . common be there any right mind scientist out there ? this be shiz ( s * * t ) yahya al shiddazi and i add this to the article itself : lie ! seem like a buncha lie to me ! i dink dat dis article need to be clean up in order to meet wikipedia's sill...,"I think this doesn't befit the granduer of Wikipedia. I mean, are you serious? a monkey man? This is stupid. Common, are there any right-minded scientists out there? This is shiz (s**t) \n\n -Yahya Al-Shiddazi \n\n And I added this to the article itself: \n\n Lies! Seems like a buncha lies to me! I dink dat dis article needs to be cleaned up, in order to m..."


In [21]:
def isEnglish(s):
    """Return True if the string `s` contains only ASCII characters.

    Despite the name, this is a character-set check and not language
    detection: ASCII text in any language passes, and English text with a
    single non-ASCII character (an accent, a typographic quote) fails.

    Encoding straight to ASCII also returns False for strings holding lone
    surrogates, which the former utf-8 round trip let escape as an uncaught
    UnicodeEncodeError.
    """
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

# Try isEnglish on a few samples: accented Latin text, plain ASCII,
# non-Latin script, mixed ASCII / non-ASCII, and ASCII punctuation.
for sample in (
    'slabiky, ale liší se podle významu',
    'English',
    'ގެ ފުރަތަމަ ދެ އަކުރު ކަ',
    'how about this one : 通 asfަ',
    '?fd4))45s&',
):
    print(isEnglish(sample))

False
True
False
False
True


In [30]:
# Check one more sample taken from a comment.
sample_comment = "\n\nCongratulations from me as well, use the tools well.  · talk "
print(isEnglish(sample_comment))

True


In [22]:
# Independent copy of the first 1000 training rows to experiment on.
train_p = train.iloc[:1000].copy()

In [23]:
# Show the first row of the sample frame.
train_p.iloc[:1]

Unnamed: 0,comment_text,id,identity_hate,insult,obscene,set,severe_toxic,threat,toxic,toxicity,comment_text_ori,comment_text_cleaned,having_pos_label,splited,found
0,explanation why the edits made under my username hardcore metallica fan were reverted they weren t vandalisms just closure on some gas after i voted at new york dolls fac and please don t remove the template from the talk page since i m retired now,0000997932d777bf,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",explanation why the edit make under my username hardcore metallica fan be revert ? they be not vandalism just closure on some gas after i vote at new york doll fac . and please do not remove the template from the talk page since i be retire now .,0.0,"[explanation, why, the, edits, made, under, my, username, hardcore, metallica, fan, were, reverted, they, weren, t, vandalisms, just, closure, on, some, gas, after, i, voted, at, new, york, dolls, fac, and, please, don, t, remove, the, template, from, the, talk, page, since, i, m, retired, now]",False


In [None]:
# Preview only the first rows; rendering the whole frame floods the notebook
# with very wide text columns.
train.head()

In [32]:
# Flag training comments whose raw text passes the isEnglish check.
train['isEnglish'] = train['comment_text_ori'].apply(isEnglish)

In [None]:
# Training comments that fail the isEnglish check and carry at least one
# positive label (rows 500 onward of that subset). A single .loc call with an
# explicit boolean mask replaces `== False`, the implicit float mask and the
# chained indexing.
train.loc[~train['isEnglish'] & (train['having_pos_label'] > 0), ['comment_text_ori']].iloc[500:]

In [37]:
# Flag test comments whose raw text passes the isEnglish check.
test['isEnglish'] = test['comment_text_ori'].apply(isEnglish)

In [38]:
# How many test comments fail the isEnglish check?
# (A slice such as .iloc[100:200] can be appended to the selection to browse them.)
len(test.loc[~test['isEnglish']])

17915

In [None]:
# Raw vs. preprocessed text for the test comments that fail the isEnglish check.
test.loc[~test['isEnglish'], ['comment_text_ori', 'comment_text']]

In [60]:
# Raw vs. preprocessed text for the row labelled 152819 (a label slice, so the
# result stays a one-row frame).
test.loc[152819:152819, ['comment_text_ori', 'comment_text']]

Unnamed: 0,comment_text_ori,comment_text
152819,== 蹩脚英语 == \n 英语好烂呀！最近跟维基用户有事要说，无奈几乎不会英语，说不了话。你得帮帮我呀！这次要删除文章，你告诉他行不行？,crappy english english is so rotten recently with the wiki user something to say helpless almost no english can not say anything you have to help me this time to delete the article you tell him not work


In [None]:
# Map from misspelled / concatenated tokens to a canonical replacement word.
# Presumably applied during text normalisation; the consumer is not visible in
# this part of the notebook. The duplicated 'sexsex' key has been removed; a
# repeated key in a dict literal is silently overwritten.
bad_word_dict1 = {
    'niggors': 'nigger',  # stem + spell check
    'mothjer': 'mother',  # spell check (stem is ok)
    'faggt': 'faggot',
    'cucks': 'cocks',  # spell check
    'sexsex': 'sex',
    'peenus': 'penis',
    'fuckerucker': 'fucker',
    'sockpuppet': 'alias',
    'sockpuppetry': 'alias',
    'semite': 'semitic',
    'donkeysex': 'dick',
    'homopetersymonds': 'homo',
    'headsdick': 'dick',
    'peni': 'penis',
    'asspie': 'ass',
    'nonense': 'nonsense',
    'vagpenis': 'penis',
    'diedres': 'crap',
    'niggetr': 'nigger',
}

# Second map from misspelled / truncated / quote-wrapped tokens to a canonical
# replacement word. Several keys also appear in bad_word_dict1 with the same
# target. Insertion order is kept as in the original literal.
bad_word_dict2 = {
    "youfuck": "fuck",
    "niggors": "nigger",
    "bitchbot": "bitch",
    "donkeysex": "dick",
    "motherfuc": "motherfucker",
    "sexualit": "sexuality",
    "sexuall": "sexual",
    "cocain": "cocaine",
    "fuck'": "fuck",
    "valentin": "valentine",
    "decease": "deceased",
    "deth": "death",
    "'ass": "ass",
    "gayy": "gay",
    "sexe": "sex",
    "bith": "bitch",
    "bithc": "bitch",
    "bithces": "bitch",
    "fool'": "fool",
    "cuck": "cock",
    "'shit": "shit",
    "penis'": "penis",
    "bitch'": "bitch",
    "'stupid": "stupid",
    "fuckn": "fuck",
    "choked": "choke",
    "headsdick": "dick",
    "peni": "penis",
    "asspie": "ass",
    "nonense": "nonsense",
    "nonsesnse": "nonsense",
    "zdick": "dick",
    "asse": "ass",
    "vagpenis": "penis",
    "niggetr": "nigger",
    "mother's": "mother",
    "pornn": "porn",
    "fuock": "fuck",
    "dik": "dick",
    "maoth": "mouth",
    "horsecock": "cock",
    "pneis": "penis",
    "mothjer": "mother",
}