In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing import text, sequence


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Raise pandas' display limits: show up to 362 characters per cell and up to 100 rows.
pd.options.display.max_colwidth = 362
pd.options.display.max_rows = 100

In [3]:
# Names of the target (label) columns used throughout the notebook.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [4]:
# Directory (under the user's home) that holds every CSV read by this notebook.
PATH = '~/data/toxic/data/'
# Alternative inputs, kept for reference only: the raw files ...
# train = pd.read_csv(PATH + 'train.csv')
# test = pd.read_csv(PATH + 'test.csv')
# ... and the "cleaned" files.
# train = pd.read_csv(PATH + 'cleaned_train.csv')
# test = pd.read_csv(PATH + 'cleaned_test.csv')
# Active inputs: the preprocessed files.
train = pd.read_csv(PATH + 'train_preprocessed.csv')
test = pd.read_csv(PATH + 'test_preprocessed.csv')

In [5]:
# Bring the raw comment text alongside the preprocessed one, matched on `id`;
# the clashing `comment_text` column from the raw file gets the `_ori` suffix.
train_ori = pd.read_csv(PATH + 'train.csv')
train = pd.merge(train, train_ori[['comment_text', 'id']], on='id', suffixes=('', '_ori'))

# Same join for the `comment_text_cleaned` column of the cleaned file.
train_cleaned = pd.read_csv(PATH + 'cleaned_train.csv')
train = pd.merge(train, train_cleaned[['comment_text_cleaned', 'id']], on='id')

In [6]:
# Bring the raw comment text alongside the preprocessed one, matched on `id`;
# the clashing `comment_text` column from the raw file gets the `_ori` suffix.
test_ori = pd.read_csv(PATH + 'test.csv')
test = pd.merge(test, test_ori[['comment_text', 'id']], on='id', suffixes=('', '_ori'))

# Same join for the `comment_text_cleaned` column of the cleaned file.
test_cleaned = pd.read_csv(PATH + 'cleaned_test.csv')
test = pd.merge(test, test_cleaned[['comment_text_cleaned', 'id']], on='id')

In [7]:
# The preprocessed test file carries the label columns filled with NaN; remove them.
test.drop(columns=label_cols, inplace=True)

In [8]:
# embeddings_index_lex, embed_size = word2Vec('lex')

# embeddings_index_glc, embed_size = word2Vec('gl-common')

In [9]:
# Passed as `num_words` when the Keras tokenizer is built.
max_features = 100000
# Passed as `maxlen` to `pad_sequences` for both the train and test sequences.
maxlen = 150

In [10]:
# Text column fed to the tokenizer; 'comment_text_cleaned' is the alternative.
comment_col = 'comment_text'

# Lower-case the text and replace missing comments with 'something',
# a placeholder word of neutral sentiment.
X_train = train[comment_col].str.lower().fillna('something')
# Reuse `label_cols` rather than repeating the six names, so the target matrix
# cannot drift out of sync with the label list used by the rest of the notebook.
y_train = train[label_cols].values

X_test = test[comment_col].str.lower().fillna('something')

In [11]:
# Fit a single tokenizer on the train and test comments together.
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train) + list(X_test))

# NOTE: from here on X_train / X_test are rebound to the tokenizer's sequence
# output instead of the text Series, so this cell must not be re-run on its own.
X_train = tok.texts_to_sequences(X_train)
X_test = tok.texts_to_sequences(X_test)

# Padded versions of the sequences, with `maxlen` passed through.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [12]:
# Number of positive labels carried by each training comment.
# A single vectorised column sum replaces the row-by-row `apply`, which is far
# slower on a large frame; skipna=False keeps the NaN propagation that the
# original Python `sum` had.
train['having_pos_label'] = train[label_cols].sum(axis=1, skipna=False)

In [13]:
# Whitespace-split every comment into a token list (missing comments become ['something']).
train['splited'] = train['comment_text'].fillna('something').map(str.split)
test['splited'] = test['comment_text'].fillna('something').map(str.split)

In [14]:
def found_word(word_list, word):
    """Return True when `word` is one of the items of `word_list`, otherwise False."""
    unique_tokens = set(word_list)
    return word in unique_tokens

In [15]:
# Overwrite the test text column in place with its lower-cased, NaN-filled version.
test[comment_col] = test[comment_col].str.lower().fillna('something')

# Recompute the token lists from the lower-cased text.
# NOTE(review): this reads the hard-coded 'comment_text' column while the line
# above writes `comment_col`; the two only agree while comment_col == 'comment_text'.
# The train frame is not lower-cased in place the same way - confirm that is intended.
test['splited'] = test['comment_text'].apply(lambda x: x.split())

In [16]:
def analyze_word(the_word, verbose=True):
    """Summarise where `the_word` occurs in the training and testing comments.

    Relies on notebook globals: `tok` (the fitted tokenizer), `train` and `test`
    (DataFrames with a `splited` token-list column; `train` also needs
    `having_pos_label`) and `found_word`. As a side effect the boolean column
    `found` is (re)written on both `train` and `test`, so the matching rows can
    be inspected after the call.

    Args:
        the_word: token to look for (exact match against the items of `splited`).
        verbose: when True, print a summary of the counts.

    Returns:
        A 5-tuple:
        (occurrences according to `tok.word_counts`,
         number of training comments containing the word,
         how many of those have at least one positive label,
         that share as a fraction (0 when the word is in no training comment),
         number of testing comments containing the word).
    """
    # A word the tokenizer never saw counts as 0 occurrences instead of raising KeyError.
    appear_total_times = tok.word_counts.get(the_word, 0)

    train['found'] = train['splited'].apply(lambda tokens: found_word(tokens, the_word))
    appearsin_comments_in_train = int(train['found'].sum())

    # `having_pos_label` is a label *count*, not a flag. Compare it with 0 before
    # AND-ing: a bitwise `True & 2` is 0, which silently dropped every comment
    # carrying an even number of labels.
    has_positive_label = train['having_pos_label'] > 0
    negative_comments = int((train['found'] & has_positive_label).sum())

    if appearsin_comments_in_train != 0:
        negative_ratio = negative_comments / appearsin_comments_in_train
    else:
        negative_ratio = 0

    test['found'] = test['splited'].apply(lambda tokens: found_word(tokens, the_word))
    appearsin_comments_in_test = int(test['found'].sum())

    if verbose:
        print('the word "{}" appears {} times in training and testing'.format(the_word, appear_total_times)) # number of times the word appears
        print('it appears in {} comments in training data'.format(appearsin_comments_in_train))# number of document the word appears in train
        print('{} of these comments have at least one positive labels (negative comment). Negative ratio: {:.3f}%'.format(negative_comments, negative_ratio*100))
        print('it appears in {} comments in testing data'.format(appearsin_comments_in_test)) # number of document the word appears in test
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

    return appear_total_times, appearsin_comments_in_train, negative_comments, negative_ratio, appearsin_comments_in_test

In [19]:
# Look at the training comments that contain one specific word.
the_word = "fuock"
analyze_word(the_word)
train.loc[train['found'], ['comment_text_ori'] + label_cols].head(10)

the word "fuock" appears 17 times in training and testing
it appears in 0 comments in training data
0 of these comments have at least one positive labels (negative comment). Negative ratio: 0.000%
it appears in 1 comments in testing data
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$


Unnamed: 0,comment_text_ori,toxic,severe_toxic,obscene,threat,insult,identity_hate


In [20]:
# Test comments flagged by the last analyze_word() call, shown in all three text variants.
test.loc[test['found'], ['comment_text', 'comment_text_cleaned', 'comment_text_ori']].head(10)

Unnamed: 0,comment_text,comment_text_cleaned,comment_text_ori
95388,fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuoc...,fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuock you ponyo let your mom play with my dik in her maoth fuoc...,fuOck you Ponyo let your mom play with my dik in her maoth \n\n fuOck you Ponyo let your mom play with my dik in her maoth \n\n fuOck you Ponyo let your mom play with my dik in her maoth \n\n fuOck you Ponyo let your mom play with my dik in her maoth \n\n fuOck you Ponyo let your mom play with my dik in her maoth \n\n fuOck you Ponyo let your mom play with...


In [None]:
# Hand-written corrections: misspelled or run-together token -> replacement word.
# The 'sexsex' key was listed twice in the literal; the redundant entry is removed
# (the resulting dict is unchanged).
bad_word_dict1 = {
    'niggors': 'nigger', # stem + spell check
    'mothjer': 'mother', # spell check (stem is ok)
    'faggt':'faggot',
    'cucks':'cocks', # spell check 
    'sexsex': 'sex',
    'peenus': 'penis',
    'fuckerucker': 'fucker', 
    'sockpuppet':'alias',
    'sockpuppetry': 'alias',
    'semite':'semitic',
    'donkeysex':'dick',
    'homopetersymonds': 'homo',
    'headsdick': 'dick',
    'peni':'penis',
    'asspie':'ass',
    'nonense': 'nonsense',
    'vagpenis':'penis',
    'diedres': 'crap',
    'niggetr': 'nigger'
}

# Second hand-written correction table: noisy token -> replacement word.
# Covers typos, run-together words and tokens with a stray leading/trailing quote.
bad_word_dict2 = {
    'youfuck':   'fuck',
    'niggors':   'nigger',
    'bitchbot':  'bitch',
    'donkeysex': 'dick',
    'motherfuc': 'motherfucker',
    'sexualit':  'sexuality',
    'sexuall':   'sexual',
    'cocain':    'cocaine',
    "fuck'":     'fuck',
    'valentin':  'valentine',
    'decease':   'deceased',
    'deth':      'death',
    "'ass":      'ass',
    'gayy':      'gay',
    'sexe':      'sex',
    'bith':      'bitch',
    'bithc':     'bitch',
    'bithces':   'bitch',
    "fool'":     'fool',
    'cuck':      'cock',
    "'shit":     'shit',
    "penis'":    'penis',
    "bitch'":    'bitch',
    "'stupid":   'stupid',
    'fuckn':     'fuck',
    'choked':    'choke',
    'headsdick': 'dick',
    'peni':      'penis',
    'asspie':    'ass',
    'nonense':   'nonsense',
    'nonsesnse': 'nonsense',
    'zdick':     'dick',
    'asse':      'ass',
    'vagpenis':  'penis',
    'niggetr':   'nigger',
    "mother's":  'mother',
    'pornn':     'porn',
    'fuock':     'fuck',
    'dik':       'dick',
    'maoth':     'mouth',
    'horsecock': 'cock',
    'pneis':     'penis',
    'mothjer':   'mother',
}