In [49]:
import os
import pickle
import re
import sys
import time

import numpy as np
import pandas as pd
from langdetect import detect
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from dictionary import contraction_map, unnecessary_patterns

# 5-fold splitter (shuffled, fixed seed) passed as `cv=` to the cross_val_score calls below.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# NLTK English stop-word list; used by `tokenizer` and handed to the vectorizers.
stop_words = stopwords.words('english')
# Lemmatizer instance applied to every token in `tokenizer`.
wnl = WordNetLemmatizer()

In [2]:
# Load the train / test / sample-submission CSVs from the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

# Label matrix: every column from position 2 onward, cast to int32.
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` works on old and new versions.
train_y = train.iloc[:, 2:].values.astype('int32')

In [21]:
# One alternation over all contraction keys. expand_contraction looks the matched text up in
# contraction_map, so the keys have to match literally: escape them, and try longer keys first
# so that a short key can never shadow a longer key that starts with it.
contraction_object = re.compile(
    "|".join(re.escape(key) for key in sorted(contraction_map, key=len, reverse=True))
)
# Alternation over the clean-up patterns, joined unescaped.
# NOTE(review): presumably these are regex fragments — confirm in dictionary.py.
sub_patterns = '|'.join(unnecessary_patterns)

def expand_contraction(sentence, sub_map=contraction_map, sub_object=contraction_object):
    """Return `sentence` with every match of `sub_object` replaced by its `sub_map` entry.

    The matched text itself is used as the lookup key, so a match that is not a key of
    `sub_map` raises KeyError.
    """
    return sub_object.sub(lambda found: sub_map[found.group(0)], sentence)


def cleaning_text(text, sub_patterns=sub_patterns):
    """Blank out everything matching `sub_patterns`, mask digit runs, and trim the result.

    Each match of `sub_patterns` becomes a single space, each run of digits becomes the
    literal 'NUM', and leading/trailing whitespace is stripped.
    """
    without_noise = re.compile(sub_patterns).sub(' ', text)
    digits_masked = re.compile('[0-9]+').sub('NUM', without_noise)
    return digits_masked.strip()


def tokenizer(sentence):
    """Tokenize `sentence` with NLTK, skip stop words, and return the lemmatized tokens."""
    lemmas = []
    for word in word_tokenize(sentence):
        if word in stop_words:
            continue
        lemmas.append(wnl.lemmatize(word))
    return lemmas


def preprocessing(dataset):
    """Return the cleaned text of every entry in `dataset.comment_text`, in order.

    Each comment goes through expand_contraction first and cleaning_text second.
    """
    return [cleaning_text(expand_contraction(raw)) for raw in dataset.comment_text]


def persistence(fname, mode='load', obj=None):
    """Load a pickled object from `fname`, or pickle `obj` into it.

    Parameters
    ----------
    fname : path of the pickle file.
    mode : 'load' (default) to read the file, 'save' to write it.
    obj : object to pickle when mode is 'save'; ignored when loading.

    Returns
    -------
    The unpickled object for mode='load'; None for mode='save'.

    Raises
    ------
    ValueError
        If `mode` is neither 'load' nor 'save', or if mode='save' is used with obj=None.
        Both cases used to return None without touching the file, which hid typos such as
        mode='Save' and silently skipped the write.
    """
    if mode == 'load':
        # SECURITY: unpickling can execute arbitrary code; only load files you created yourself.
        with open(fname, 'rb') as f:
            return pickle.load(f)
    if mode == 'save':
        if obj is None:
            raise ValueError("mode='save' needs an object to pickle (obj is None)")
        with open(fname, 'wb') as f:
            pickle.dump(obj, f)
        return None
    raise ValueError("mode must be 'load' or 'save', got {!r}".format(mode))
            

def make_submission(sample_sub, fname, prediction):
    """Write `prediction` to submissions/<fname>.csv in the layout of `sample_sub`.

    Parameters
    ----------
    sample_sub : DataFrame whose 'id' column supplies the row ids and whose remaining
        columns (everything after the first) supply the output column names.
    fname : file name without extension.
    prediction : 2-D array-like with one row per id and one column per output column.
    """
    # Use the frame that was passed in. The original body read the notebook-global `sample`
    # and silently ignored this parameter.
    idx = pd.Index(sample_sub['id'], name='id')
    columns = sample_sub.columns.tolist()[1:]
    sub = pd.DataFrame(prediction, index=idx, columns=columns)
    # to_csv does not create missing directories.
    os.makedirs('submissions', exist_ok=True)
    sub.to_csv('submissions/{}.csv'.format(fname), index=True)

In [4]:
# Reload the pickled comment lists from the working directory.
# NOTE(review): presumably these are the outputs of preprocessing(train) / preprocessing(test);
# the cell that wrote them is not in this notebook — confirm.
train_comment = persistence('train_comment.pkl', 'load')
test_comment = persistence('test_comment.pkl', 'load')

In [5]:
# TF-IDF features using the custom tokenizer; terms must appear in at least 50 comments
# and in at most half of them.
tfidf = TfidfVectorizer(tokenizer=tokenizer, min_df=50, max_df=0.5, stop_words=stop_words, lowercase=True)
# Keep the sparse matrix returned by fit_transform. `.todense()` materialises every zero as an
# np.matrix (about 159571 x 7145 float64 values here), and recent scikit-learn releases reject
# np.matrix input. The estimators used below accept sparse input directly.
train_x = tfidf.fit_transform(train_comment)

print(train_x.shape)

(159571, 7145)


In [5]:
# Restore a previously pickled vectorizer; this replaces any `tfidf` already in the kernel.
tfidf = persistence('tfidf_model.pkl', 'load')

In [9]:
# Column index assigned to the term 'suck' in the fitted vocabulary.
tfidf.vocabulary_['suck']

6166

In [11]:
# Keep the sparse result: `.todense()` would allocate the full dense np.matrix, and
# `tfidf_matrix[i, j]` indexing works on the sparse matrix as well.
tfidf_matrix = tfidf.transform(train_comment)

In [6]:
# Transform the test comments with the vocabulary fitted on the training set. The result stays
# sparse: the dense np.matrix from `.todense()` is far larger and is rejected by recent
# scikit-learn releases.
test_x = tfidf.transform(test_comment)

print(test_x.shape)

(153164, 7145)


In [7]:
# Pickle the vectorizer so later sessions can reload it instead of refitting.
persistence('tfidf_model.pkl', 'save', tfidf)

In [8]:
# One-vs-rest logistic regression with balanced class weights.
model = OneVsRestClassifier(LogisticRegression(class_weight='balanced'))
# Cross-validated ROC-AUC check, currently disabled:
# score = cross_val_score(model, train_x, train_y, scoring="roc_auc", cv=cv)
# print(round(np.mean(score), 4))

# Fit on the full training features and labels.
model.fit(train_x, train_y)


(153164, 6)


In [10]:
# Class probabilities for the test features, one column per label.
prediction = model.predict_proba(test_x)

print(prediction.shape)

(153164, 6)


In [11]:
# Writes submissions/tfidf_lr_balanced.csv.
make_submission(sample, 'tfidf_lr_balanced', prediction)

In [None]:
# One-vs-rest decision trees with default settings, scored by ROC-AUC over the `cv` folds.
# This cell has no execution count, i.e. it was never run in the recorded session.
model = OneVsRestClassifier(DecisionTreeClassifier())
score = cross_val_score(model, train_x, train_y, scoring="roc_auc", cv=cv)
print(score)

## GloVe Embedding

In [12]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

# NOTE(review): absolute, machine-specific path to the GloVe vectors; it must be edited before
# this cell can run anywhere else. gensim's `datapath` helper is presumably meant for gensim's
# bundled test data — confirm it is intended here.
glove_file = datapath('/Users/youncheol/Documents/projects/toxic-comment-classification-challenge/embedding/glove.twitter.27B.100d.txt')
# Destination path for the converted file, provided by gensim's get_tmpfile.
tmp_file = get_tmpfile('glove_model.txt')

# Convert glove_file into tmp_file so that load_word2vec_format below can read it.
glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(tmp_file)

In [22]:
# Lower-case and tokenize every training comment; the running count is echoed on one
# console line after every 1000 comments.
train_tokens = []
cnt = 0

for cnt, comment in enumerate(train_comment, start=1):
    comment = comment.lower()
    train_tokens.append(tokenizer(comment))
    if not cnt % 1000:
        sys.stdout.write('\r{}'.format(cnt))

159000

In [64]:
# Lower-case and tokenize every test comment; the running count is echoed on one
# console line after every 1000 comments.
test_tokens = []
cnt = 0

for cnt, comment in enumerate(test_comment, start=1):
    comment = comment.lower()
    test_tokens.append(tokenizer(comment))
    if not cnt % 1000:
        sys.stdout.write('\r{}'.format(cnt))

153000

In [24]:
# TF-IDF over the whole vocabulary (no min_df / max_df pruning); its weights are read by
# make_weighted_sentence_vector.
tfidf_all = TfidfVectorizer(tokenizer=tokenizer, stop_words=stop_words, lowercase=True)
# Keep the sparse matrix: an unpruned vocabulary makes the dense form enormous, and
# `tfidf_matrix[i, j]` element lookups work on the sparse matrix too.
tfidf_matrix = tfidf_all.fit_transform(train_comment)

In [37]:
# TF-IDF weight of the term 'as' in the first training comment.
# NOTE(review): the recorded run raised KeyError: 'as', i.e. the term is not in
# tfidf_all.vocabulary_ — presumably it was dropped as a stop word; confirm.
tfidf_matrix[0, tfidf_all.vocabulary_['as']]

KeyError: 'as'

In [None]:
def make_sentence_vector(sentence, dim, vectors=None):
    """Average the embedding vectors of the tokens in `sentence`.

    Parameters
    ----------
    sentence : iterable of tokens.
    dim : length of the embedding vectors.
    vectors : object with a `get_vector(token)` method that raises KeyError for unknown
        tokens. Defaults to the notebook-global `glove_model`.

    Returns
    -------
    numpy array of length `dim`: the mean of the vectors of all tokens that have one, or
    all zeros when no token has a vector.
    """
    if vectors is None:
        vectors = glove_model
    sentence_vector = np.zeros(dim)
    num_of_tokens = 0
    for token in sentence:
        try:
            sentence_vector = np.add(sentence_vector, vectors.get_vector(token))
            num_of_tokens += 1
        except KeyError:
            # Only a missing token is skipped. The previous bare `except:` also hid real
            # errors (undefined model, wrong `dim`, KeyboardInterrupt) and returned zeros.
            continue
    if num_of_tokens > 0:
        sentence_vector = np.divide(sentence_vector, num_of_tokens)
    return sentence_vector

In [38]:
def make_weighted_sentence_vector(sentence, dim, i, weights=None, vocabulary=None, vectors=None):
    """TF-IDF-weighted average of the embedding vectors of the tokens in `sentence`.

    Parameters
    ----------
    sentence : iterable of tokens.
    dim : length of the embedding vectors.
    i : row of `weights` that belongs to this sentence.
    weights : 2-D matrix indexed as weights[i, column]. Defaults to the notebook-global
        `tfidf_matrix`. Pass the matrix of the data set being embedded: row `i` of the
        training matrix says nothing about test comment `i`.
    vocabulary : mapping token -> column of `weights`. Defaults to `tfidf_all.vocabulary_`.
    vectors : object with a `get_vector(token)` method that raises KeyError for unknown
        tokens. Defaults to the notebook-global `glove_model`.

    Returns
    -------
    numpy array of length `dim`: sum(weight * vector) / sum(weight) over the tokens known to
    both `vocabulary` and `vectors`, or all zeros when the total weight is not positive.
    """
    if weights is None:
        weights = tfidf_matrix
    if vocabulary is None:
        vocabulary = tfidf_all.vocabulary_
    if vectors is None:
        vectors = glove_model
    sentence_vector = np.zeros(dim)
    sum_of_weights = 0
    for token in sentence:
        try:
            tfidf_weight = weights[i, vocabulary[token]]
            sentence_vector = np.add(sentence_vector, tfidf_weight * vectors.get_vector(token))
            sum_of_weights += tfidf_weight
        except KeyError:
            # Token missing from the vocabulary or from the embeddings: skip it. The previous
            # bare `except:` also hid out-of-range rows and every other error.
            continue
    if sum_of_weights > 0:
        sentence_vector = np.divide(sentence_vector, sum_of_weights)
    return sentence_vector

In [39]:
# Embedding length passed to make_weighted_sentence_vector (the GloVe file loaded above is the
# 100d one).
dim = 100

# One row per training comment, filled in below.
train_x = np.zeros([len(train_tokens), dim])

# Row i is the weighted sentence vector of training comment i.
for i, sentence in enumerate(train_tokens):
    train_x[i] = make_weighted_sentence_vector(sentence, dim, i)

In [65]:
dim = 100

test_x = np.zeros([len(test_tokens), dim])

# make_weighted_sentence_vector reads its weights from the notebook-global `tfidf_matrix`,
# which was built from the TRAINING comments. Row i of that matrix is unrelated to test
# comment i, so the test vectors were weighted with the wrong document's TF-IDF values.
# Point the global at the test-set matrix while building test_x, then restore it.
train_tfidf_matrix = tfidf_matrix
tfidf_matrix = tfidf_all.transform(test_comment)
try:
    for i, sentence in enumerate(test_tokens):
        test_x[i] = make_weighted_sentence_vector(sentence, dim, i)
finally:
    tfidf_matrix = train_tfidf_matrix

In [66]:
# Sanity check: one row per test comment, `dim` columns.
test_x.shape

(153164, 100)

In [43]:
# Balanced one-vs-rest logistic regression on the sentence vectors; ROC-AUC per `cv` fold.
model = OneVsRestClassifier(LogisticRegression(class_weight='balanced'))
score = cross_val_score(model, train_x, train_y, scoring="roc_auc", cv=cv)
print(score)

[0.95532909 0.95491295 0.95404358 0.95282175 0.95314141]


In [44]:
from sklearn.tree import DecisionTreeClassifier

# Depth-8 balanced decision trees, one per label (fitted with 3 parallel jobs); ROC-AUC per fold.
model = OneVsRestClassifier(DecisionTreeClassifier(max_depth=8, class_weight='balanced'), n_jobs=3)
score = cross_val_score(model, train_x, train_y, scoring="roc_auc", cv=cv)
print(score)

[0.83593426 0.84916013 0.85917871 0.85198862 0.85241564]


In [48]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split lives in
# sklearn.model_selection, which this notebook already imports KFold from.
from sklearn.model_selection import train_test_split

# Hold-out split with the default proportions. The seed makes the split (and the scores
# printed below) reproducible across runs; it matches the seed used for `cv`.
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, random_state=0)

In [53]:
# NOTE(review): despite the names, each `depth` value is passed as n_estimators; max_depth is
# fixed at 9 for every run.
depth_list = [30, 40, 50]

for depth in depth_list:
    model = OneVsRestClassifier(RandomForestClassifier(n_estimators=depth, max_depth=9, class_weight='balanced'), n_jobs=3)
    model.fit(x_train, y_train)
    pred = model.predict_proba(x_test)
    # ROC-AUC on the hold-out split for this setting.
    print(roc_auc_score(y_test, pred))

0.9485343842498016
0.9505735267664162
0.9514172787341217


In [67]:
# Refit the 50-tree, depth-9 balanced random forest on all training rows and score the test set.
model = OneVsRestClassifier(RandomForestClassifier(n_estimators=50, max_depth=9, class_weight='balanced'), n_jobs=3)
model.fit(train_x, train_y)
prediction = model.predict_proba(test_x)

In [69]:
# Writes submissions/100vector_rf.csv.
make_submission(sample, '100vector_rf', prediction)

In [54]:
from sklearn.ensemble import GradientBoostingClassifier

# Bare name: only displays the class object; it has no effect on later cells.
GradientBoostingClassifier

In [55]:
# Default gradient boosting, one model per label, scored on the hold-out split.
# The recorded run was interrupted (KeyboardInterrupt), so no score was produced.
model = OneVsRestClassifier(GradientBoostingClassifier(), n_jobs=3)
model.fit(x_train, y_train)
pred = model.predict_proba(x_test)
print(roc_auc_score(y_test, pred))

KeyboardInterrupt: 

In [46]:
from sklearn.ensemble import RandomForestClassifier

# Depth-10 balanced random forest with the default number of trees; ROC-AUC per `cv` fold.
model = OneVsRestClassifier(RandomForestClassifier(max_depth=10, class_weight='balanced'), n_jobs=3)
score = cross_val_score(model, train_x, train_y, scoring="roc_auc", cv=cv)
print(score)

[0.92976895 0.92987358 0.93314031 0.93257189 0.92671809]


In [61]:
from sklearn.svm import SVC

In [63]:
# Polynomial-kernel SVC with probability estimates, one model per label, on the hold-out split.
# The recorded run was interrupted (KeyboardInterrupt) before finishing.
model = OneVsRestClassifier(SVC(kernel='poly', probability=True, class_weight='balanced', verbose=True), n_jobs=3)
model.fit(x_train, y_train)
pred = model.predict_proba(x_test)

roc_auc_score(y_test, pred)
# Cross-validated alternative, currently disabled:
# score = cross_val_score(model, train_x, train_y, scoring="roc_auc", cv=cv)
# print(score)

KeyboardInterrupt: 

In [None]:
# Truncated SVD keeping one component fewer than the number of feature columns in train_x.
svd = TruncatedSVD(n_components=train_x.shape[1]-1)
svd.fit(train_x)

In [15]:
# Project the training features onto the fitted SVD components.
X = svd.transform(train_x)

In [22]:
# Total fraction of variance explained by the kept components.
np.sum(svd.explained_variance_ratio_)

0.1514199642787458

In [None]:
# Same check as the previous cell; this copy has no execution count (never run).
np.sum(svd.explained_variance_ratio_)

In [19]:
# Default one-vs-rest logistic regression on the SVD projection; `score` is not printed here.
model = OneVsRestClassifier(LogisticRegression())
score = cross_val_score(model, X, train_y, scoring="roc_auc", cv=cv)

In [21]:
from sklearn.svm import LinearSVC

# Default one-vs-rest LinearSVC on the SVD projection; ROC-AUC per `cv` fold.
model = OneVsRestClassifier(LinearSVC())
score = cross_val_score(model, X, train_y, scoring="roc_auc", cv=cv)

print(score)

[0.9486816  0.94820201 0.9493849  0.94904128 0.94804557]


In [36]:


# min_df thresholds swept for every vectorizer / classifier pairing below.
min_df_list = [50, 60, 70, 80, 90, 100]

# (report title, vectorizer class, base estimator class) — one entry per sweep. This replaces
# four copy-pasted loops that differed only in these three values.
experiments = [
    ('MultinomialNB + BOW', CountVectorizer, MultinomialNB),
    ('LogisticRegression + BOW', CountVectorizer, LogisticRegression),
    ('MultinomialNB + TF-IDF', TfidfVectorizer, MultinomialNB),
    ('LogisticRegression + TF-IDF', TfidfVectorizer, LogisticRegression),
]

for position, (title, vectorizer_cls, estimator_cls) in enumerate(experiments):
    if position > 0:
        print()  # blank line between sweeps, as in the original report
    print(title)
    for min_df in min_df_list:
        bow = vectorizer_cls(tokenizer=tokenizer, min_df=min_df, max_df=0.5, stop_words=stop_words, lowercase=True)
        X = bow.fit_transform(train_comment)
        model = OneVsRestClassifier(estimator_cls())
        score = cross_val_score(model, X, train_y, scoring="roc_auc", cv=cv)
        # Mean ROC-AUC over the folds plus a wall-clock timestamp for this setting.
        print('min_df: {}, vacabulary: {}, score: {} {}'.format(min_df, len(bow.vocabulary_), round(np.mean(score), 4), time.ctime()))

MultinomialNB + BOW
min_df: 50, vacabulary: 7145, score: 0.9428 Mon Jun 25 00:32:06 2018
min_df: 60, vacabulary: 6369, score: 0.9431 Mon Jun 25 00:33:54 2018
min_df: 70, vacabulary: 5804, score: 0.9434 Mon Jun 25 00:35:44 2018
min_df: 80, vacabulary: 5340, score: 0.943 Mon Jun 25 03:00:30 2018
min_df: 90, vacabulary: 4942, score: 0.9433 Mon Jun 25 04:49:35 2018
min_df: 100, vacabulary: 4605, score: 0.9431 Mon Jun 25 06:35:14 2018

LogisticRegression + BOW
min_df: 50, vacabulary: 7145, score: 0.9425 Mon Jun 25 06:39:46 2018
min_df: 60, vacabulary: 6369, score: 0.9423 Mon Jun 25 06:44:08 2018
min_df: 70, vacabulary: 5804, score: 0.9426 Mon Jun 25 06:48:31 2018
min_df: 80, vacabulary: 5340, score: 0.9421 Mon Jun 25 06:52:54 2018
min_df: 90, vacabulary: 4942, score: 0.9412 Mon Jun 25 06:57:15 2018
min_df: 100, vacabulary: 4605, score: 0.9408 Mon Jun 25 07:01:32 2018

MultinomialNB + TF-IDF
min_df: 50, vacabulary: 7145, score: 0.9551 Mon Jun 25 07:03:20 2018
min_df: 60, vacabulary: 6369, sc

In [50]:
# TF-IDF over unigrams and bigrams (min_df=50, max_df=0.5) with default logistic regression.
bow = TfidfVectorizer(tokenizer=tokenizer, min_df=50, max_df=0.5, ngram_range=(1, 2), stop_words=stop_words, lowercase=True)
X = bow.fit_transform(train_comment)
model = OneVsRestClassifier(LogisticRegression())
# Cross-validated ROC-AUC check, currently disabled:
# score = cross_val_score(model, X, train_y, scoring="roc_auc", cv=cv)
# print(score)
# print(round(np.mean(score), 4))

# Fit on all training rows.
model.fit(X, train_y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [45]:
# Same pipeline with n-grams up to length 3; prints ROC-AUC per fold and the rounded mean.
bow = TfidfVectorizer(tokenizer=tokenizer, min_df=50, max_df=0.5, ngram_range=(1, 3), stop_words=stop_words, lowercase=True)
X = bow.fit_transform(train_comment)
model = OneVsRestClassifier(LogisticRegression())
score = cross_val_score(model, X, train_y, scoring="roc_auc", cv=cv)
print(score)
print(round(np.mean(score), 4))

[0.97754792 0.97777691 0.97635416 0.97919322 0.97691912]
0.9776


In [46]:

# Number of terms kept by the fitted vectorizer.
len(bow.vocabulary_)
# Persisting the vectorizer is currently disabled:
# joblib.dump(bow, 'bow_model.pkl') 

15326

In [12]:


# Save the training feature matrix with np.save under the name 'bow_train'.
# NOTE(review): np.save is meant for dense arrays; if train_x is a scipy sparse matrix it would
# be stored as a pickled object array — confirm which form train_x has at this point.
np.save('bow_train', train_x)

In [14]:
# (rows, feature columns) of the training matrix currently held in the kernel.
train_x.shape

(159571, 19575)

In [16]:
# Keep the sparse matrix returned by transform; `.todense()` would allocate the full dense
# np.matrix, which recent scikit-learn releases reject as estimator input.
test_x = bow.transform(test_comment)

In [13]:


# NOTE(review): `clf` is not created in any visible cell; the recorded output shows it is a
# OneVsRestClassifier wrapping MultinomialNB. Its definition must be restored for a clean run.
clf.fit(train_x, train_y)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1)

In [14]:
# NOTE(review): sklearn.externals.joblib is not available in recent scikit-learn releases,
# where the standalone `joblib` package is used instead — confirm against the pinned version.
from sklearn.externals import joblib

# Persist the fitted classifier next to the notebook.
joblib.dump(clf, 'bow_nb.pkl') 

['bow_nb.pkl']

In [35]:
# NOTE(review): `bow_feature` is not defined in any visible cell; the cells that compare
# pred_proba[6] with train_y[6] suggest it holds the training features — confirm.
pred_proba = clf.predict_proba(bow_feature)

In [47]:
# Reload the pickled test comments (the same file that persistence('test_comment.pkl', 'load')
# reads in an earlier cell).
with open('test_comment.pkl', 'rb') as f:
    test_comment = pickle.load(f)
    
# Reloading the saved vectorizer / classifier is currently disabled:
# bow = joblib.load('bow_model.pkl')
# clf = joblib.load('bow_nb.pkl')

In [14]:
# Keep the sparse matrix returned by transform; `.todense()` would allocate the full dense
# np.matrix, which recent scikit-learn releases reject as estimator input.
test_x = bow.transform(test_comment)

In [15]:
# Class probabilities for the test features from `clf`.
prediction = clf.predict_proba(test_x)

In [16]:
# Sanity check: rows = test comments, columns = labels.
prediction.shape

(153164, 6)

In [37]:
# Predicted probabilities for row 6; compare with train_y[6] in the next cell.
pred_proba[6]

array([0.99984853, 0.92641029, 0.99933135, 0.00160257, 0.99823054,
       0.00928929])

In [38]:
# True labels for row 6.
train_y[6]

array([1, 1, 1, 0, 1, 0], dtype=int32)

In [34]:
# NOTE(review): `pred` is whatever an earlier cell left in the kernel; confirm it holds hard
# label predictions for the training rows before trusting this number.
accuracy_score(train_y, pred)

0.8939594287182508

In [51]:
# Test features from the current `bow` vectorizer, kept sparse.
test_x = bow.transform(test_comment)

In [52]:
# Class probabilities for the test features from the current `model`.
prediction = model.predict_proba(test_x)

In [53]:
# Sanity check: rows = test comments, columns = labels.
prediction.shape

(153164, 6)

In [49]:
# Every sample-submission column after the first; these are the label names.
sample.columns.tolist()[1:]

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [54]:
# Inline copy of the statements in make_submission: index by the sample ids, name the columns
# after the sample-submission labels, and write submissions/tfidf_lr.csv.
idx = sample['id']
columns = sample.columns.tolist()[1:]
fname = 'tfidf_lr'

sub = pd.DataFrame(prediction, index=idx, columns=columns)
sub.to_csv('submissions/{}.csv'.format(fname), index=True)

In [18]:
# Inspect the first test comment as stored in the pickle.
test_comment[0]

'Yo bitch Ja Rule is more succesful then you will ever be whats up with you and hating you sad mofuckas   i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me  Ja rule is about pride in da music man  dont diss that shit on him  and nothin is wrong bein like tupac he was a brother too   fuckin white boys get things right next time'

In [None]:
from sklearn.svm import SVC

# Default SVC, one model per label; ROC-AUC per `cv` fold on the current `X`.
# This cell has no execution count, i.e. it was never run in the recorded session.
model = OneVsRestClassifier(SVC())

score = cross_val_score(model, X, train_y, scoring="roc_auc", cv=cv)
print(score)

(159571, 13317)

In [12]:
# Spot-check the cleaning pipeline on raw training comment 99.
cleaning_text(expand_contraction(train.comment_text[99]))

'you can do all you are doing right now but if you get a username you will be able to do more and have more impact is what i am saying  and you seem to be very familiar with everything so you probably have a username  just get one  it takes NUM seconds'

In [51]:
# NOTE(review): `tokenizing_stemming` is not defined in any visible cell; presumably an earlier
# name of `tokenizer` — confirm. The recorded output still contains the raw number '10'.
tokenizing_stemming(comment_list[99])

['right',
 'get',
 'username',
 'able',
 'impact',
 'saying',
 'seem',
 'familiar',
 'everything',
 'probably',
 'username',
 'get',
 'one',
 'take',
 '10',
 'second']

In [49]:
# Collect the positions of training comments that contain '(UTC)', stopping once more than
# ten have been found.
cnt = 0
comment_list = []

# Compiled here but not used inside this cell.
pattern = re.compile('REDIRECT Talk:')

for i in range(len(train.comment_text)):
    if '(UTC)' in train.comment_text[i]:
        comment_list.append(i)
        cnt += 1
        
    if cnt > 10:
        break

In [46]:
# Try the hh:mm pattern on a throw-away string.
# NOTE(review): the recorded output lists '18-9', which this pattern cannot match (it requires
# a ':'), so the output is stale and comes from an earlier version of the pattern.
string = 'fadhfkalgjkalfjakl;fjkald 13:0 fafj 18-9 akl;f'
time_pattern = re.compile('[0-9]+:[0-9]+')

re.findall(time_pattern, string)

['13:0', '18-9']

In [75]:
# Raw strings: '\s', '\w', '\(' and '\.' are invalid escape sequences in ordinary string
# literals and trigger a warning on current Python versions. The compiled patterns are unchanged.
# "<digits> <word> <digits> (UTC)", e.g. "19 Aug 2004 (UTC)".
date_pattern = re.compile(r'[0-9]+\s\w+\s[0-9]+\s\(UTC\)')
# "<digits>:<digits>", e.g. "06:15".
time_pattern = re.compile('[0-9]+:[0-9]+')
# Four dot-separated digit groups, e.g. "71.223.125.139".
ip_pattern = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+')
# Anything ending in .jpg / .jpeg / .png (greedy: it takes the text in front of the name too).
fname_pattern = re.compile(r'.+\.jpg|.+\.jpeg|.+\.png')

In [97]:
# Print the cleaned text of the first training comments (stops after 31), each followed by a
# row of asterisks.
cnt = 0

for i in range(len(train.comment_text)):
    comment = cleaning_text(train.comment_text[i])
    print(comment)
    print('*' * 20)
    cnt += 1
        
    if cnt > 30:
        break

explanation why the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.
********************
d'aww! he matches this background colour i'm seemingly stuck with. thanks.   , january 11, 2016 (utc)
********************
hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.
********************
" more i can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for

In [77]:
# Print training comments containing the literal text "(talk|email)", stopping once more than
# ten have been shown.
cnt = 0

for i in range(len(train.comment_text)):
    # Raw string: '\(' and '\|' are invalid escapes in an ordinary string literal and trigger
    # a warning on current Python versions; the regex itself is unchanged.
    if re.findall(r'\(talk\|email\)', train.comment_text[i]):
        print(train.comment_text[i])
        print('*' * 20)
        cnt += 1

    if cnt > 10:
        break

"== TfD nomination of Template:SilentRedirect ==

Template:SilentRedirect has been nominated for deletion. You are invited to comment on the discussion at Wikipedia:Templates for deletion#Template:SilentRedirect. Thank you.   | (talk) 

 Watermarks 

I shall remove the obvious ones, the less obvious ones stay. I am allowed to watermark images with anything whatsoever I wish to, even if it's a contradiction, and if you wish me to use up further WP  bandwidth by re-uploading them, that has no effect on me. I can't do anything until this evening, and if any get deleted, I shall reupload them. Yours without respect (talk|email) 
You could have the grace to reply. I said that I'd deal with them this evening, but only the ones where the watermark is visible from the page. So *** off (talk|email) 
I am planning to re-upload them with less obtrusive watermarks, such as that on Image:Bliss parody.jpg. (talk|email) 
It wasn't vandalism; I used an external link as per your instructions, which inc

In [69]:
# Apply the current `pattern` to the 8th collected comment.
# NOTE(review): the recorded output is a 4-group tuple, so `pattern` was not the
# 'REDIRECT Talk:' pattern compiled in the visible cell when this ran — stale kernel state.
re.findall(pattern, train.comment_text[comment_list[7]])

[('06', '', '', ', ')]

In [70]:
# Show the raw text of the 8th collected comment.
train.comment_text[comment_list[7]]

"06, 29 December 2007 (UTC)\nYep. LOL, the [[Reformist Party (Serbia)|Reformist Party] is having another go (the 20th very last on the parliamentary election, winning less votes than notable to actually be mentioned). ) \nBy the way, here's something very little people have figured out - the new Constitution of Serbia has been brought to enable Kosovo' secession. The 1990 Constitution barred that as a possibility, and after the Kumanovo Military-Technical Agreement was signed between NATO and FRY the SRS broke its coalition with SPS and the government collapsed, causing new elections - because that was unconstitutional, as an act of highest treason, enough to be tried from maximum sentence (which the Radicals demanded from then to his death, to have a trial in Serbia, and to be tried for treason among other reasons). This constitution releases the authorities from that weight, and they won't have to go to prison if they recognize any form of further loss of sovereignty in Kosovo. ;)   

* URL
* REDIRECT Talk:
* (UTC)
* IP address
* Date, Time
* File name

In [28]:
# Example comment made up of a URL followed by an IP address.
train.comment_text[6193]

'http://www.imdb.com/name/nm2551199/filmoseries#tt1327666 71.223.125.139'

In [25]:
# Label columns (position 2 onward) of training row 1373.
train.iloc[1373, 2:]

toxic            1
severe_toxic     0
obscene          1
threat           0
insult           0
identity_hate    0
Name: 1373, dtype: object

In [13]:
# NOTE(review): `not_eng` is not defined in any visible cell; presumably the index of comments
# that langdetect did not classify as English — confirm and restore the cell that builds it.
train.comment_text[not_eng]

17         REDIRECT Talk:Voydan Pop Georgiev- Chernodrinski
62                        REDIRECT Talk:Frank Herbert Mason
87          Oh, it's me vandalising?xD See here. Greetings,
146       Azari or Azerbaijani? \n\nAzari-iranian,azerba...
177          86.29.244.57|86.29.244.57]] 04:21, 14 May 2007
351                 Future Perfect at Sunrise|☼]] 14:59, 16
358               I'm afraid that's a broken link for me. -
592                         REDIRECT Talk:José Manuel Rojas
702       Valerie Poxleitner \n\nValeri Poxleitner, A.K....
758                                  |listas = Manos Family
823       Barnes                  Aus     1             ...
829                                06:15, 19 Aug 2004 (UTC)
852                                 P.S. Are you a /b/tard?
886                 Ion G Nemes|talk]]) 04:08, 21 June 2011
897                         "\nNo problem at all!  (talk) "
899                                     I've just seen that
916       "\nNo problem. Thanks for lett