In [166]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc
from scipy.sparse import csr_matrix, hstack

In [473]:
train.comment_text[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [474]:
train.comment_text_cleaned[0]

'explanation why the edit make under my username hardcore metallica fan be revert ? they be not vandalism just closure on some gas after i vote at new york doll fac . and please do not remove the template from the talk page since i be retire now .'

In [3]:
PATH = '~/data/toxic/data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')



train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

text = pd.concat([train_sentence, test_sentence])

print(train.shape)
print(test.shape)

(159571, 27)
(153164, 21)


In [4]:
# CountVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), analyzer=’word’, max_df=1.0,\
#                 min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>)

# TfidfVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, analyzer=’word’, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,\
#                 vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>, norm=’l2’, use_idf=True,\
#                 smooth_idf=True, sublinear_tf=False)

# min_df=0, max_features=None, strip_accents='unicode',lowercase =True,
# analyzer='word', token_pattern=r'\w{3,}', ngram_range=(1,1),
# use_idf=True,smooth_idf=True, sublinear_tf=True, stop_words = "english"

print('getting tfidf')
phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
char_vectorizer = TfidfVectorizer(ngram_range=(1,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

print('fitting char')
char_vectorizer.fit(text.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)
print('transforming train char')
train_char = char_vectorizer.transform(train_sentence.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)
print('transforming test char')
test_char = char_vectorizer.transform(test_sentence.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)

train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

getting tfidf
fitting char
fitting phrase
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 158654373 stored elements in Compressed Sparse Row format>

In [5]:
import pdb

In [413]:
def pr(y_i, y, train_features):
    p = train_features[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

def get_nblogreg_model(label_cols, train_features, train, test_features):
    #pdb.set_trace()
    preds = np.zeros((test.shape[0], len(label_cols)))
    train_preds = np.zeros((train.shape[0], len(label_cols)))
    for i, j in enumerate(label_cols):
        if i == i:
            print('fit', j)
            y = train[j].values
            r = np.log(pr(1, y, train_features) / pr(0, y, train_features))
            model = LogisticRegression()
            x_nb = train_features.multiply(r).tocsr()
            # split
            index_to_split = int(x_nb.shape[0]*0.9)
            x_train = x_nb[:index_to_split,:]
            y_train = y[:index_to_split]
            x_val = x_nb[index_to_split:,:]
            y_val = y[index_to_split:]
            val_preds = np.zeros((x_val.shape[0],len(label_cols)))
            print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)
            # fit
            model.fit(x_train, y_train)
            # validation
            val_preds[:, i] = model.predict(x_val)
            val_prob =  model.predict_proba(x_val)[:, 1]
            print('accuracy is {}'.format(accuracy_score(y_val, val_preds[:, i])))
            print('roc_auc is {}'.format(roc_auc_score(y_val, val_prob)))
            
            #preds[:, i] = model.predict_proba(test_features.multiply(r))[:, 1]
            #train_preds[:, i] = model.predict_proba(x_nb)[:, 1]
            #print('accuracy is {}'.format(roc_auc_score(y, train_preds[:, i])))
    return preds, val_preds, val_prob, y_val, index_to_split

def save(y_test, label_cols, path):
    import time
    BUILD_ID = int(time.time())
    print('BUILD_ID: {}'.format(BUILD_ID))
    submission = pd.read_csv(path + 'sample_submission.csv')
    submission[label_cols] = y_test
    submission.to_csv('/home/kai/data/shiyi/toxic/submissions/sub_' + str(BUILD_ID) + '.csv', index=False)
    
print('done')

done


In [364]:
print('predicting')
y_test, val_preds, val_prob, y_val, index_to_split = get_nblogreg_model(label_cols, train_tfidf, train, test_tfidf)
#print('total score is {}'.format(roc_auc_score(train[label_cols], y_train)))

predicting
fit toxic
(143613, 300000) (143613,) (15958, 300000) (15958,)
accuracy is 0.9659105151021431
roc_auc is 0.9844540447511267


In [102]:
confusion_matrix(y_val, val_preds[:,0]) # 0 is toxic

array([[129372,    474],
       [  5760,   8008]])

In [None]:
submission = pd.read_csv('/home/kai/data/shiyi/toxic/submissions/sub_1518559907.csv')

In [418]:
submission.head(2)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.0,0.223519,0.999997,0.01139,0.99538,0.271654
1,0000247867823ef7,0.003152,0.00068,0.001581,0.000194,0.004644,0.000764


In [420]:
sum(test.id == submission.id) == test.shape[0]

True

In [427]:
test[['id','comment_text_cleaned','word_count']].head(2) # test data from csv

Unnamed: 0,id,comment_text_cleaned,word_count
0,00001cee341fdb12,yo bitch ja rule be more succesful then you will ever be whats up with you and hat you sad mofuckas ... i should bitch slap you be pethedic white face and get you to kiss my as you guy sicken me . ja rule be about pride in da music man . dont dis that shit on him . and nothin be wrong bein like tupac he be a brother too ... fuck white boy get thing right next time .,72
1,0000247867823ef7,from rfc the title be fine a it be imo .,12


In [477]:
print(list(step_roc_dict))

['step<0.01>_words<1>_length<150>', 'step<0.01>_words<1>_length<200>', 'step<0.01>_words<1>_length<250>', 'step<0.01>_words<2>_length<150>', 'step<0.01>_words<2>_length<200>', 'step<0.01>_words<2>_length<250>', 'step<0.01>_words<3>_length<150>', 'step<0.01>_words<3>_length<200>', 'step<0.01>_words<3>_length<250>', 'step<0.01>_words<4>_length<150>', 'step<0.01>_words<4>_length<200>', 'step<0.01>_words<4>_length<250>', 'step<0.01>_words<5>_length<150>', 'step<0.01>_words<5>_length<200>', 'step<0.01>_words<5>_length<250>', 'step<0.02>_words<1>_length<150>', 'step<0.02>_words<1>_length<200>', 'step<0.02>_words<1>_length<250>', 'step<0.02>_words<2>_length<150>', 'step<0.02>_words<2>_length<200>', 'step<0.02>_words<2>_length<250>', 'step<0.02>_words<3>_length<150>', 'step<0.02>_words<3>_length<200>', 'step<0.02>_words<3>_length<250>', 'step<0.02>_words<4>_length<150>', 'step<0.02>_words<4>_length<200>', 'step<0.02>_words<4>_length<250>', 'step<0.02>_words<5>_length<150>', 'step<0.02>_words<5

In [472]:
for i in tqdm(range(1, len(step_roc_dict), 100)):
    #print(i)
    m = re.search('step<(.+?)>_words<(.+?)>_length<(.+?)>', list(step_roc_dict)[i])
    step, words, length = m.group(1), m.group(2), m.group(3)
    
    #print(step, words, length)
    sub_w_rule = submission.merge(test[['id','comment_text_cleaned','word_count']],on='id')
    toxic_words = toxic_words_poll_reordered[:int(words)+1]
    
    sub_w_rule['toxic_word_exist'] = sub_w_rule['comment_text_cleaned'].apply(lambda x: toxic_words_detector(toxic_words, x))            
    #print('{} \t comments containing any toxic words.'.format(sum(sub_w_rule['toxic_word_exist'])))
    
    sub_w_rule['toxic'] = sub_w_rule.apply(lambda r: r.toxic+float(step) if r.toxic_word_exist and r.word_count < int(length) else r.toxic, axis=1)
    
    sub_final = sub_w_rule.drop(['comment_text_cleaned','toxic_word_exist','word_count'], axis=1)
    assert sub_final.shape == submission.shape
    
    import time
    BUILD_ID = '{}_{}_{}_{}'.format(step, words, length, int(time.time()))
    sub_final.to_csv('/home/kai/data/shiyi/toxic/submissions/sub_w_rule_' + str(BUILD_ID) + '.csv', index=False)

100%|██████████| 8/8 [00:28<00:00,  3.58s/it]


In [433]:
sum(sub_w_rule.toxic>0.9), sum(sub_w_rule.toxic<0.1)

(19581, 108561)

In [434]:
sum(submission.toxic>0.9), sum(submission.toxic<0.1)

(19578, 108561)

In [435]:
sub_w_rule.columns

Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate', 'comment_text_cleaned', 'word_count',
       'toxic_word_exist'],
      dtype='object')

In [369]:
val_table = train[index_to_split:]
val_table.shape

(15958, 27)

In [10]:
print(val_preds.shape)
val_preds_pd = pd.DataFrame(val_preds, columns=[col+'_p' for col in label_cols])
val_preds_pd.head()

(15958, 6)


Unnamed: 0,toxic_p,severe_toxic_p,obscene_p,threat_p,insult_p,identity_hate_p
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
val_check = pd.concat([val_table.reset_index(), val_preds_pd], axis=1)
val_check.shape

(15958, 34)

In [370]:
print(val_prob.shape)
val_prob_pd = pd.DataFrame(val_prob, columns=['toxic_p'])
val_prob_pd.head()

(15958,)


Unnamed: 0,toxic_p
0,0.035369
1,0.014648
2,0.190522
3,0.005046
4,0.015192


In [371]:
val_check_prob = pd.concat([val_table.reset_index(), val_prob_pd], axis=1)
val_check_prob.shape

(15958, 29)

In [52]:
from sklearn.metrics import confusion_matrix

In [142]:
# 0.85612811958451318  is the original val set auc(based on prediction, not raw prob)
# tpr = 0.7200259235255995
# fpr = 0.007769684356573014
# confusion matrix:
# array([[14303,   112],
#        [  432,  1111]])

In [16]:
val_check['toxic_pred'] = np.where(val_check['toxic']==val_check['toxic_p'],True, False)

In [17]:
val_check['toxic_pred'].value_counts()

True     15414
False      544
Name: toxic_pred, dtype: int64

In [14]:
544/15958

0.034089484897856875

In [37]:
val_check[val_check['toxic_pred']==False][['toxic','toxic_p','comment_text']].to_csv('/home/kai/data/shiyi/toxic/nbsvm_toxic_predict_wrong.csv',index=False)

In [38]:
val_check[val_check['toxic_pred']==True][['toxic','toxic_p','comment_text']].to_csv('/home/kai/data/shiyi/toxic/nbsvm_toxic_predict_right.csv',index=False)

In [39]:
pd.set_option('display.max_colwidth', -1)

In [97]:
def toxic_words_detector(toxic_words, comment):
    for w in toxic_words:
        if w in comment:
            return True
    return False

In [229]:
def count_1s_and_0s_in_pred(preds, th):
    count_1 = 0
    count_0 = 0
    for i, pred in enumerate(list(preds)):
        if pred>=th:
            count_1+=1
        else:
            count_0+=1
    return count_1, count_0

train['toxic_word_exist'] = train['comment_text_cleaned'].apply(lambda x: toxic_words_detector(toxic_words, x))
print('{} \t comments containing any toxic words.'.format(sum(train['toxic_word_exist'])))
print('{} \t comments containing any toxic words but not labeled as toxic'.format(train[(train['toxic_word_exist'])&(train['toxic']==0)].shape[0]))

In [None]:
print(toxic_words)
val_check['toxic_word_exist'] = val_check['comment_text_cleaned'].apply(lambda x: toxic_words_detector(toxic_words, x))
print('{} \t comments containing any toxic words.'.format(sum(val_check['toxic_word_exist'])))
print('{} \t comments containing any toxic words but not labeled as toxic'.format(val_check[(val_check['toxic_word_exist'])&(val_check['toxic']==0)].shape[0]))
print('# of 1s and 0s before applying rule')
print(count_1s_and_0s_in_pred(val_check.toxic_p, 1)) # 1223, 14735 before consider toxic words exist
val_check['toxic_p'] = val_check.apply(lambda r: 1 if r.toxic_word_exist else r.toxic_p, axis=1)
print('# of 1s and 0s after applying rule')
print(count_1s_and_0s_in_pred(val_check.toxic_p, 1))
print('confusion matrix')
print(confusion_matrix(val_check.toxic.values, val_check.toxic_p.values))
fpr, tpr, thresholds = roc_curve(val_check.toxic.values, val_check.toxic_p.values, pos_label=1.0)
print('fpr: {0:.4f}, tpr: {1:.4f}'.format(fpr[1], tpr[1]))
print('auc: {}'.format(auc(fpr, tpr)))
print('acc: {}'.format(accuracy_score(val_check.toxic.values, val_check.toxic_p.values)))
print('roc_auc: {}'.format(roc_auc_score(val_check.toxic.values, val_check.toxic_p.values)))

In [283]:
toxic_words_poll = ['motherfucker', 'crap', 'fuck', 'prick', 'piss', 'hell', 'nerd', 'life', 'sick', 'jerk', 'racist', 'goddamn', 'wtf', 'shove', 'sucker', 'dude', 'motherfucking', 'mom', 'dare', 'stop', 'filthy', 'ignorant', 'fuckhead', 'youre', 'stupid', 'gonna', 'disgust', 'kiss', 'yo', 'get', 'pathetic', 'stupidity', 'liar', 'queer', 'fuk', 'bitch', 'retard', 'arsehole', 'screw', 'turd', 'suck', 'little', 'anus', 'stink', 'hole', 'dick', 'fuckwit', 'fucken', 'quit', 'scumbag', 'asshole', 'wank', 'fucktard', 'smelly', 'idiotic', 'jackass', 'sad', 'mother', 'wanna', 'disgrace', 'ya', 'cum', 'maggot', 'basement', 'lame', 'idiot', 'nazi', 'bunch', 'cking', 'ha', 'dumb', 'ban', 'go', 'hypocrite', 'commie', 'monkey', 'arrogant', 'butt', 'garbage', 'burn', 'homosexual', 'cant', 'cunt', 'stfu', 'douchebag', 'fuckface', 'bastard', 'cuz', 'stinky', 'masturbate', 'mommy', 'thats', 'clown', 'ever', 'hoe', 'pervert', 'man', 'laugh', 'bully', 'scumbags']

In [274]:
toxic_words = toxic_words_poll[:10] # ['shit']#,'swine','niggers', 'niggers', 'stupid', 'dick', 'cunt']

In [265]:
print(toxic_words)
val_check_prob['toxic_word_exist'] = val_check_prob['comment_text_cleaned'].apply(lambda x: toxic_words_detector(toxic_words, x))

['motherfucker', 'crap', 'fuck', 'prick', 'piss', 'hell', 'nerd', 'life', 'sick', 'jerk']


In [310]:
toxic_word_dict = {}

for i in range(50):
#     toxic_words = toxic_words_poll[i]
#     print(toxic_words)
#     train['toxic_word_exist'] = train['comment_text_cleaned'].apply(lambda x: toxic_words_detector([toxic_words], x))

    toxic_words = toxic_words_poll_reordered[:i+1]
    #print(toxic_words)
    val_check_prob['toxic_word_exist'] = val_check_prob['comment_text_cleaned'].apply(lambda x: toxic_words_detector(toxic_words, x))
    
    total_contain = sum(val_check_prob['toxic_word_exist'])
    contain_but_not_toxic = val_check_prob[(val_check_prob['toxic_word_exist'])&(val_check_prob['toxic']==0)].shape[0]
    not_toxic_ratio_contain = contain_but_not_toxic/total_contain
    
    #toxic_word_dict[toxic_words] = not_toxic_ratio_contain#'{}/{}'.format(contain_but_not_toxic, total_contain)
    print('{} \t= \t{}/{}'.format(not_toxic_ratio_contain, contain_but_not_toxic, total_contain))
    #print('{} \t comments containing any toxic word.'.format(total_contain))
    #print('{} \t comments containing any toxic word but not labeled as toxic'.format(contain_but_not_toxic))


0.0 	= 	0/6
0.0 	= 	0/9
0.0 	= 	0/10
0.0 	= 	0/30
0.0 	= 	0/35
0.0 	= 	0/36
0.06864988558352403 	= 	30/437
0.06864988558352403 	= 	30/437
0.08085106382978724 	= 	38/470
0.087890625 	= 	45/512
0.08687258687258688 	= 	45/518
0.09276437847866419 	= 	50/539
0.09259259259259259 	= 	50/540
0.10383386581469649 	= 	65/626
0.1035031847133758 	= 	65/628
0.10333863275039745 	= 	65/629
0.10476190476190476 	= 	66/630
0.11059907834101383 	= 	72/651
0.11280487804878049 	= 	74/656
0.11280487804878049 	= 	74/656
0.11596385542168675 	= 	77/664
0.11544227886056972 	= 	77/667
0.12223858615611193 	= 	83/679
0.1259150805270864 	= 	86/683
0.15240641711229946 	= 	114/748
0.163659793814433 	= 	127/776
0.16560509554140126 	= 	130/785
0.16750629722921914 	= 	133/794
0.19069239500567536 	= 	168/881
0.19047619047619047 	= 	168/882
0.1919191919191919 	= 	171/891
0.21244635193133046 	= 	198/932
0.21321961620469082 	= 	200/938
0.2554194156456173 	= 	271/1061
0.2608695652173913 	= 	282/1081
0.26916058394160586 	= 	295

In [311]:
toxic_words = toxic_words_poll_reordered[:10]
val_check_prob['toxic_word_exist'] = val_check_prob['comment_text_cleaned'].apply(lambda x: toxic_words_detector(toxic_words, x))
total_contain = sum(val_check_prob['toxic_word_exist'])
contain_but_not_toxic = val_check_prob[(val_check_prob['toxic_word_exist'])&(val_check_prob['toxic']==0)].shape[0]
not_toxic_ratio_contain = contain_but_not_toxic/total_contain
    

In [476]:
print(toxic_words_poll_reordered)

['motherfucking', 'fuckhead', 'fuckface', 'motherfucker', 'fucken', 'fucktard', 'fuck', 'fuckwit', 'asshole', 'bitch', 'sucker', 'cunt', 'arsehole', 'suck', 'stinky', 'scumbags', 'smelly', 'bastard', 'goddamn', 'maggot', 'wank', 'douchebag', 'fuk', 'scumbag', 'dick', 'retard', 'prick', 'filthy', 'idiot', 'masturbate', 'jackass', 'dumb', 'queer', 'stupid', 'nerd', 'cking', 'shove', 'jerk', 'stink', 'mommy', 'youre', 'wtf', 'piss', 'pathetic', 'hypocrite', 'commie', 'stfu', 'mother', 'basement', 'stupidity', 'kiss', 'idiotic', 'pervert', 'clown', 'crap', 'disgust', 'disgrace', 'cuz', 'sick', 'homosexual', 'screw', 'racist', 'gonna', 'garbage', 'monkey', 'ignorant', 'wanna', 'dare', 'dude', 'arrogant', 'nazi', 'anus', 'bunch', 'hole', 'burn', 'bully', 'life', 'turd', 'thats', 'laugh', 'butt', 'lame', 'stop', 'little', 'hoe', 'mom', 'sad', 'hell', 'yo', 'liar', 'get', 'ban', 'go', 'ya', 'cum', 'cant', 'ever', 'quit', 'ha', 'man']


In [317]:
print(toxic_words)

['motherfucking', 'fuckhead', 'fuckface', 'motherfucker', 'fucken', 'fucktard', 'fuck', 'fuckwit', 'asshole', 'bitch']


In [325]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic_p']<0.6)][['comment_text','word_count','toxic_word_exist','toxic','toxic_p']].iloc[3:5]

Unnamed: 0,comment_text,word_count,toxic_word_exist,toxic,toxic_p
1592,"Man, why are you so angry? That word (FUCKING) which you posted on my talk page was REALLY UNNECESSARY! With that kind of words, I must say that you are completely unable to cooperate with other users on WP. I'll wont post new messages to you, nor I'll respond to yours, because I didn't came to WP to be insulted, but to improve articles. As for stubbornness, it's also one of my characteristics, so if you want to engage in an edit war with me, I'm here!",87,True,0,0.489867
2201,"""For those who want to bitch about me any behaviour ih which they dont like which seem to be so distruptive to wikipedia, or if you want to see personal attacks and uncivil behaviour made and condone by admins and the fellow supporters, like (talk???contribs), you can see a comment he made here, or by the person that blocked me (talk???contribs) here please go to WP:AN#Boothy443_Indef._blocked, and that goes for admins and non admins. Also if youy want to bitch about me in another public forum, go to here Wikipedia:Requests for comment/Boothy443, but dont expect a responce from me, as i dont see the effectness of displaying ones opinions on other usesr as being any bit of constructive in buliding an Encylopedia. It should aslo be know that i have been put behind an indefiendt block, imposed by one adminstrator Jtkiefer that has less to do with policy violiations is is just a way to supress me for as i see it, not playing by their """"rules"""", or to put it, falling in line like a good Wikipedian, and that i called out an admin (talk???contribs) for what i percived as his biased enforcment of a 3rr block on the pages , Philadelphia County, Pennsylvania, Philadelphia, Pennsylvania, Category:Philadelphia County, Pennsylvania, i which i will freely admit that i did, in order to keep as i see it the """"concensus"""", what ever that word means, of the editors on the pages in question that their was to be no merge of the artciles or categoreies at present that was trying to be forced by (talk???contribs), and an exam of his comments on talk:Philadelphia, Pennsylvania as compared to his comments on Wikipedia:Mediation_Cabal#Philadelphia_County_and_other_Philadelphia_Pages only show that he is pushing for a merge, but that (talk???contribs) also violated the 3rr on the page as well after he was blocked by using two anon accounts (talk???contribs) and (talk???contribs), yet neiter were blocked by (talk???contribs) nor was the block on (talk???contribs) extended an addation 48 hours, yes thats right 48 not 24 an arbitatry decision and another reason in which i belive that no admin be alowed to block a user for over 24 hours with out arbcom decison and even then i am not sure that they should. And also that he, (talk???contribs), posted a comment for (talk???contribs) on the Wikipedia:Mediation_Cabal#Philadelphia_County_and_other_Philadelphia_Pages, why couldnt (talk???contribs) waith tthe 48 hours like he was supposed to to post his comments, which to me would show that their is a question of the impartiality of (talk???contribs) becuase of his actions. I did email him, and in what could be construed as a nasty letter, to (talk???contribs) in which i told him that i do belive that he biased in his actions that that i was considering further action aginst him in the matter, with the possibility of releaving his of his adminstrative pridvlidges. His reaction was to increast the length of my block to addational 72 hours, which i can only see as a supression of any pending action agnist him, i would really not be supprised to know if he confired with (talk???contribs) off site which would lead to [[User:Jtkiefer|Jt",552,True,0,0.010364


In [328]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic_p']<0.5) & (val_check_prob['toxic']==1)][['comment_text','word_count','toxic_word_exist','toxic','toxic_p']].shape

(10, 5)

In [331]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic']==1) & (val_check_prob['toxic_p']<0.5)]['toxic_p'].

count    10.000000
mean     0.267718 
std      0.164455 
min      0.044202 
25%      0.141259 
50%      0.230452 
75%      0.412062 
max      0.485913 
Name: toxic_p, dtype: float64

In [349]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic']==1) & (val_check_prob.toxic_p<0.5) & (val_check_prob.word_count < 200)][['comment_text','toxic','toxic_p']]

Unnamed: 0,comment_text,toxic,toxic_p
1399,"Tony Sidaway\nThe article Jim Nussle has been repeatedly vandalized for partisan purposes by right-wing Republicans, or rather one such Republican and his various sock puppets. Then this right wing goon of an admin intrudes himself. To state it mildly, I am apoplectically angry at the sonofabitch. To respond to your message\nWas your partial blanking of User talk:Tony Sidaway intentional, or an accident? [7]. MONGO \nI would seem to have been.",1,0.044202
5777,"""\n\nNo, I FUCKING REWROTE THE WHOLE ARTICLE already (first as an anomymous user). It was """"a mess of the article"""", before I did, starting last year or early this year, I don't remember. I provided most of reception, I rewrote most of everything else. Hello? """,1,0.187472
6897,"""\nThe album cover is fucking atrocious. Gaga is simply throwing out random ideas. But then, I love how kickass the music is on """"Judas"""". The production is the best I've heard among all songs releaed this year. And woooof, """"Judas"""" cleanup was hectic. ? (talk2me) """,1,0.273432
7190,"""\n\n Dingley. \n\nJust to put you in the picture, Trident13: the tone is nothing unusual for Dingley. He's got a long history. Being sarcastic/patronizing/abusive/contemptuous makes him feel good. He does it a lot. Funny thing is though, when he decides he's a victim of it he runs, crying like a toddler who's just dropped his ice-cream, to the """"notice board"""", demanding people be banned. He's always telling others how to behave, but he doesn't think the rules apply to him. Self-esteem problems. He won't read WP:AGF, of course, because, as you've probably noticed, he already knows everything. Take no notice of the sad fucker.""",1,0.37376
10863,"""\n\n Irresponsible \n\nIf you checked, I was not spamming Wikipedia, but my link is the first to come up when you search """"BMtron"""" on Google, not a spam link. I advise you to pack your bags for Jersey, mo'fucka!!""",1,0.485913
11223,ChrisO is fucken Homo and proven criminals\n\nMore on ChrisO find out on www.ChrisO.homo.com\n\nThe case of ChrisO is give to FBI do not consider him as normal. It is a case of mental retired person accused for many crimes.,1,0.424829
15865,"""\nThank you. Given the misuse of tools here and the itchy trigger finger as exemplified by the thread above, this """"admin"""" needs to be examined by Arbcom. I'm not surprised to see a thread already open at ANI discussing it's recent behavioural trends. I won't be initiating anything but will most certainly be presenting evidence at the appropriate time. In the meantime someone needs to tell this """"admin"""" to calm the fuck down and stop removing comments mad by editors on their own talk pages. That's a blockable offence, especially when they were told to never post here again. INVOLVED much? """,1,0.130198


In [354]:
submission.columns

Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [355]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic']==0) & (val_check_prob.toxic_p>0.5) & (val_check_prob.word_count < 200)][['comment_text','toxic','toxic_p','toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate']]

Unnamed: 0,comment_text,toxic,toxic_p,toxic.1,severe_toxic,obscene,threat,insult,identity_hate
1512,"dispute \n\nFirst i am restoring the censored information this user chose to remove from his page as it is relevent to this post\n\ncensored-information\nPlease be aware i made 1 edit and 2 reverts please learn to count, it takes 3 reverts to break 3rr now please fuck off and go back to your nazi policy enforcement and supporting of slave labor\n\nyou cunt94.168.204.89 (talk) \ncensored-information-\n\nAs to your comment on my talk page i once again state i have only made two reverts not three so once again learn to count\nsecondly if you would like users to be civil do not go on to there talk pages and accuse them of things they have not done and make idle threats. finally if you feel something needs a citation please add the [citation needed] tags instead of vandalizing the page for a second time also if you again choose to censor this information i will repost it adnauseum as you clearly act like and are a nazi piece of shit94.168.204.89",0,1.0,0,0,0,0,0,0
3722,"Case in point, ban this SB asshole who says on his user page he is some Barack Obama lackey and who seems to think my suggestions need to be hidden from view.",0,0.529833,0,0,0,0,0,0
4035,"""""""Sly SOB"""" is a somewhat common teasing expression, not a way to seriously call somebody a son of a bitch. Please learn the vernacular before commenting on it in the future, it will save you from some embarrassment. 204.69.190.254 \n""",0,0.570823,0,0,1,0,0,0
6017,"As I said, you're an asshole and that's all there is to it.",0,0.995265,0,0,1,0,1,0
6031,"I am in awe of your template usage, true fucking hero you are, you twinkle-queen",0,0.996679,0,0,1,0,0,0
6520,"At least I'm trying, your M.O. basically seems to be an asshole to people who are trying to contribute. What have you done?",0,0.82429,0,0,1,0,1,0
6536,"Non-notable KrakatoaKatie==\nYou and this other turd Riana dumped the article about Alison Crawford. Odds are you're not even in the UK to know who she is. It's hard to get over the fact that non-notable maggots like you can go around policing at Wikipedia, playing at and pretending to be notable yourselves, and fuck up the experience for the rest of us and make Wikipedia generally less effective and useful for Wikipedia visitors in general.\n\n==",0,0.640182,0,0,1,0,1,0
6937,"I also find it somewhat unimpressive that you closed bith discussions about abuse with comments directed at me rather than the problem of the abuse I submitted. Nicely played, no one wants to enforce abusive admins. And regardless of whether he used his tools abusively in this situation, he has done so many times in the past and he is an abusive editor as are the others I mentioned in the discussion. It would be nice if for once someone was actually willing to do something about that and their passive aggressive bullshit arguments and insults rather than persecute the submitter for being blunt when submitting them to ANI. They are saying the same things (that is calling me an asshole) they are just doing it in a different way. Typical for this site these days.",0,1.0,0,0,0,0,0,0
7565,"""\n\nappologies for the racist remarks, let the general moronics and asshole-itude of the cockbites on the """"dubya"""" page get to me. I've learned my lesson. Apparently the majority is always unbiased and right on wikipedia, will bring re-ienforcments to unbias the article next time. i guess this is yet another case of democrasy failing us. -steve oh and RyanFreisling is a mother humping, father felating dubya fonding, assmuncher of the highest order, right below catholic priests, rush limbaugh, and michael jackson on the fag-o-meter.""",0,0.798128,0,0,0,0,1,0
8317,"Congratulations! \n\nCongratulations, Floquenbeam, you have won the asshole of the year award! You have won this award by saying mean things about and to other people! In order to claim your award, you must call 1-800-Asshole(1-800-277-4653)! Once again, congratulations for your win! 142.162.13.40",0,0.783293,0,0,0,0,0,0


In [338]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic']==0)]['word_count'].describe()

count    45.000000 
mean     214.200000
std      207.796513
min      13.000000 
25%      61.000000 
50%      150.000000
75%      291.000000
max      823.000000
Name: word_count, dtype: float64

In [339]:
train['word_count'].describe()

count    159571.000000
mean     67.273527    
std      99.230702    
min      1.000000     
25%      17.000000    
50%      36.000000    
75%      75.000000    
max      1411.000000  
Name: word_count, dtype: float64

In [330]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic']==0)]['toxic_p'].describe()

count    45.000000
mean     0.571508 
std      0.391617 
min      0.006257 
25%      0.138093 
50%      0.640182 
75%      0.995265 
max      1.000000 
Name: toxic_p, dtype: float64

In [327]:
val_check_prob[(val_check_prob['toxic_word_exist']) & (val_check_prob['toxic']==1)][['comment_text','word_count','toxic_word_exist','toxic','toxic_p']].shape

(467, 5)

In [299]:
print(toxic_word_dict)

{'motherfucker': 0.011695906432748537, 'crap': 0.59298618490967059, 'fuck': 0.058385933273219115, 'prick': 0.2878787878787879, 'piss': 0.47816593886462883, 'hell': 0.85888378664782761, 'nerd': 0.39035087719298245, 'life': 0.76529756915339475, 'sick': 0.60550458715596334, 'jerk': 0.41358024691358025, 'racist': 0.63720316622691298, 'goddamn': 0.18292682926829268, 'wtf': 0.47029702970297027, 'shove': 0.40601503759398494, 'sucker': 0.10810810810810811, 'dude': 0.66946308724832215, 'motherfucking': 0.0, 'mom': 0.84592809977989725, 'dare': 0.66373626373626371, 'stop': 0.83016402961242564, 'filthy': 0.30769230769230771, 'ignorant': 0.65977443609022557, 'fuckhead': 0.0, 'youre': 0.46923076923076923, 'stupid': 0.3880952380952381, 'gonna': 0.6407942238267148, 'disgust': 0.5941558441558441, 'kiss': 0.53711790393013104, 'yo': 0.87423701225301809, 'get': 0.87997593502456628, 'pathetic': 0.47935368043087973, 'stupidity': 0.53061224489795922, 'liar': 0.87887130075705433, 'queer': 0.37634408602150538,

In [304]:
toxic_words_poll_reordered = []
for i, w in enumerate(sorted(toxic_word_dict, key=toxic_word_dict.get)):
    if i<10:
        print(w, toxic_word_dict[w])
    toxic_words_poll_reordered.append(w)

motherfucking 0.0
fuckhead 0.0
fuckface 0.0
motherfucker 0.0116959064327
fucken 0.037037037037
fucktard 0.0384615384615
fuck 0.0583859332732
fuckwit 0.0689655172414
asshole 0.0967283072546
bitch 0.098233995585


In [267]:
899/1505

0.59734219269103

In [375]:
print('roc_auc: {}'.format(roc_auc_score(val_check_prob.toxic.values, val_check_prob.toxic_p.values))) 
# original: 0.9844540447511267

roc_auc: 0.9844540447511267


In [378]:
sum(val_check_prob.toxic_word_exist)

6

In [406]:
step_roc_dict = {}
for step in tqdm(range(1,50)):
    step = step/100
    for i in range(50):
        for length in [150, 200, 250]:
            toxic_words = toxic_words_poll_reordered[:i+1]
            #if i<5:
            #    print(toxic_words)
            val_check_prob['toxic_word_exist'] = val_check_prob['comment_text_cleaned'].apply(lambda x: toxic_words_detector(toxic_words, x))
            #print('# of 1s and 0s before applying rule. Threashold = 0.5')
            #print(count_1s_and_0s_in_pred(val_check_prob.toxic_p, 0.5)) 
            val_check_prob['toxic_p_step'] = val_check_prob.apply(lambda r: 1 if r.toxic_word_exist and r.word_count < length else r.toxic_p, axis=1)
            #print('# of 1s and 0s after applying rule. Threashold = 0.5')
            #print(count_1s_and_0s_in_pred(val_check_prob.toxic_p_step, 0.5))
            roc = roc_auc_score(val_check_prob.toxic.values, val_check_prob.toxic_p_step.values)
            #print('roc_auc: {}'.format(roc))
            if roc > 0.9844540447511267:
                key = 'step<{}>_words<{}>_length<{}>'.format(step,i,length)
                step_roc_dict[key] = roc

100%|██████████| 49/49 [41:59<00:00, 51.43s/it]


In [408]:
len(step_roc_dict)

735

In [442]:
for i, w in enumerate(sorted(step_roc_dict, key=step_roc_dict.get, reverse=True)):
    print(w, step_roc_dict[w])

step<0.01>_words<4>_length<150> 0.98446609384
step<0.01>_words<4>_length<200> 0.98446609384
step<0.01>_words<4>_length<250> 0.98446609384
step<0.01>_words<5>_length<150> 0.98446609384
step<0.01>_words<5>_length<200> 0.98446609384
step<0.01>_words<5>_length<250> 0.98446609384
step<0.02>_words<4>_length<150> 0.98446609384
step<0.02>_words<4>_length<200> 0.98446609384
step<0.02>_words<4>_length<250> 0.98446609384
step<0.02>_words<5>_length<150> 0.98446609384
step<0.02>_words<5>_length<200> 0.98446609384
step<0.02>_words<5>_length<250> 0.98446609384
step<0.03>_words<4>_length<150> 0.98446609384
step<0.03>_words<4>_length<200> 0.98446609384
step<0.03>_words<4>_length<250> 0.98446609384
step<0.03>_words<5>_length<150> 0.98446609384
step<0.03>_words<5>_length<200> 0.98446609384
step<0.03>_words<5>_length<250> 0.98446609384
step<0.04>_words<4>_length<150> 0.98446609384
step<0.04>_words<4>_length<200> 0.98446609384
step<0.04>_words<4>_length<250> 0.98446609384
step<0.04>_words<5>_length<150> 0.

In [235]:
#fpr, tpr, thresholds = roc_curve(val_check_prob.toxic.values, val_check_prob.toxic_p.values, pos_label=1.0)
#print('fpr: {0:.4f}, tpr: {1:.4f}'.format(fpr[1], tpr[1]))
#print('auc: {}'.format(auc(fpr, tpr)))
#print('roc_auc: {}'.format(roc_auc_score(val_check_prob.toxic.values, val_check_prob.toxic_p.values)))

fpr: 0.0109, tpr: 0.2495
auc: 0.9775029566351928
roc_auc: 0.9775029566351928


In [None]:
toxic_pred_wrong = pd.read_csv('/home/kai/data/shiyi/toxic/nbsvm_toxic_predict_wrong.csv')

In [291]:
toxic_pred_wrong[toxic_pred_wrong.toxic==1].iloc[:10]

Unnamed: 0,toxic,toxic_p,comment_text
0,1,0.0,Lots of that other shit does not have a category for themselves. Terri does.
2,1,0.0,"I agree with the previous assessment about that swine Bill Connolley. He blocked me earlier this week for doing next to nothing. I was just attempting to make a page better, and he abused his power to reinforce his personal point of view. Among administrators, he has some of the worst judgement on all of wikipedia, and I regard him as a swine of the lowest order."
3,1,0.0,"""\n\nExcuse me for coming on hard, but the thought of going through this crap again displeases me. Quoting Rushton is equivalent to using Ann Coulter as a prominent reference in the Kwanzaa article, or (if you still do not get the point), allowing a well educated klan member (on the basis that the concerned klan member is well-known) to claim that """"niggers are monkeys"""". What is the difference? Should every stupid opinion be included merely for the sake of balance? Perhaps yes, but there are (or should be) limits. What annoys me most is that Kobrakid (and company) probably do not intend it, but succeed so well in disrupting this article. """
7,1,0.0,"""\n\n I just wanted to say . . . \n\nthat you were a horrid admin and really mean person. You lacked focus, commitment and neutrality. I'm jolly glad you're semi-retired and I'll dance the day you're de-sysopped. But, in a spirit of fairness, it'll be a dance of your choice. \n\nI've seen your dancing, it's crap. Why make a sad day even sadder? My only consolation is that you'll likely get blocked before Jennavecia gets desysopped. She's one of the few administrators I think wikipedia would actually miss. Fatuorum \n\nI was actually a really good admin. And I may have my mean moments... I don't know who you are, but I get the impression that you're an idiot, so I suppose that's your justification for calling me mean. I was not required to be focused here, to say I lacked commitment is just stupid as hell, and neutrality... well, I suppose it depended on the situation. I couldn't care less if you're jolly, but you can save the dancing for the day I retire, which may be today, and I recommend holding your breath until I'm desysoped... since it won't happen. I've never abused my tools, nor would I. Not to mention, it's practically impossible to desysop an admin here. I do plan to resign though, so you can dance for that too. But just know this... As I walk away from this place, you can kiss the view. vecia \n\nI just wanted to say that Jennavecia is the most awesome person in the world, and anyone that says otherwise has their cranium up their anus. We love you Jennavecia! <3 .talk.contribs \n\nHang on, let's not go overboard here. She did pretty well as an admin, but that's hardly a qualification for sanctification. """"The most awesome person in the world"""" comment is interesting as well. How many of the Earth's 6.5 billion population do you actually know? Hardly any I'd guess, because your parents aren't happy to allow you out after it gets dark. Pleeezee. Fatuorum \n\nO, Malleus, you know when you are presented with one extreme, it's always appropriate to respond with the opposite extreme! vecia \n. 92.11.217.44 \nI think my irony detectors may have failed me, Sorry Jayron32. Fatuorum \nYour mother. Fierce! \nI don't know who the eff JeanLatore is. Was I involved in his banning? Doesn't ring a bell and I don't give a shit, so I'm not going to check. vecia \n\n Hi Lara \n\nI dropped a question at Ioeth's talk page. I saw the semi-retired heading, and the links on your user page as being wiki-gnome and wiki-fairy. That's why I asked at his page. Anyway - I see that you're still active, and I didn't want you to think I was avoiding talking to you personally or anything. Hope life is treating you well .. \nReplied on Ioeth's talk. vecia \n\n Dear Lara \n\nNever had a chance to leave a message on your talk page (I think) since I've been here, so thought I'll use my final opportunity to do so. <3 Thank you for all your wonderful and valuable contributions. <3 For the limited period of time that I'm still stuck here, will miss ya. Take care, ([["
8,1,0.0,Wouldn't it be a great thing to have a good hard sniff of Maria Sharapova's fanny.
11,1,0.0,Get over yourself \n\nYour mad because I'm actually fighting back for whats right and I'm not going to let some random nimrod push me around. You know I'm right so report me all you want i couldn't care less.
12,1,0.0,"Might we be confusing stupid with stubborn or inflexible? I've known some technically intelligent people (bordering on genius) who were simply unable to acknowledge that any position other than theirs might be correct. Or their intelligence might be so narrowly focused as to be useless in normal conversation. Personally, I find the terminally stubborn much harder to deal with than someone who's 'stupid.'"
14,1,0.0,Personal note:Please read WP:SPOILER before making a dick move like that.
15,1,0.0,"""\n""""oo9"""", if you have ANY actual evidence, show it. You claim something found on May 23, 2013 - I can find no links that back you up. Either come up with evidence or go away you trolling little tea retard."""
16,1,0.0,"""\n\n YOUR A QUEER \n\n SOME PEOPLE WERE DROPPED AS A BABY... WERE YOU THROWN AT A WALL? Why would you do that you N00B ) """


with open('toxic_predict_wrong.txt','w') as f:
    for comment in list(val_check[val_check['toxic_pred']==False]['comment_text']): # 544 in total
        f.write(comment)
        f.write("\n==========================================\n")
        f.write("==========================================\n")

with open('toxic_predict_right.txt','w') as f:
    for comment in list(val_check[val_check['toxic_pred']==True]['comment_text'][:544]):
        f.write(comment)
        f.write("\n==========================================\n")
        f.write("==========================================\n")

In [15]:
print('saving files')
#model_name = 'nblogreg'
save(y_test, label_cols, PATH)

print('done')

saving files
BUILD_ID: 1518559907
done


predicting
fit toxic
accuracy is 0.992991831252086
fit severe_toxic
accuracy is 0.9991327419600897
fit obscene
accuracy is 0.9980276145314022
fit threat
accuracy is 0.9999371962826826
fit insult
accuracy is 0.9956056176608415
fit identity_hate
accuracy is 0.9996076220294341
total score is 0.9975504372860895
saving files
done
