In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit
from tqdm import tqdm

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [3]:
train = pd.read_csv('../input/train.csv').fillna(' ')
test = pd.read_csv('../input/test.csv').fillna(' ')

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [7]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [8]:
test.describe()

Unnamed: 0,id,comment_text
count,153164,153164
unique,153164,153164
top,53a47a682e1841a3,"== pop culture == \n\n the show dexter, and th..."
freq,1,1


In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
id              153164 non-null object
comment_text    153164 non-null object
dtypes: object(2)
memory usage: 2.3+ MB


In [10]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [11]:
train_text.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [12]:
test_text.head()

0    Yo bitch Ja Rule is more succesful then you'll...
1    == From RfC == \n\n The title is fine as it is...
2    " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3    :If you have a look back at the source, the in...
4            I don't anonymously edit articles at all.
Name: comment_text, dtype: object

In [13]:
all_text.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [14]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf = True,
    strip_accents = 'unicode',
    analyzer = 'word',
    token_pattern= r'\w{1,}',
    ngram_range = (1, 1),
    max_features = 15000
)

In [15]:
word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=15000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
train_word_features = word_vectorizer.transform(train_text)

In [17]:
train_word_features

<159571x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 6838379 stored elements in Compressed Sparse Row format>

In [18]:
print(train_word_features)

  (0, 14633)	0.17682103053236303
  (0, 14348)	0.09611221730177759
  (0, 14286)	0.1889447070318102
  (0, 14285)	0.10207716282923182
  (0, 14121)	0.2003881177896734
  (0, 13929)	0.2676913213172272
  (0, 13868)	0.16213376800995843
  (0, 13673)	0.12226142793792552
  (0, 13167)	0.08734059450386246
  (0, 13123)	0.08233132554959967
  (0, 13040)	0.1428880703354296
  (0, 12939)	0.08349093619277179
  (0, 12898)	0.11370200715870034
  (0, 12236)	0.08896112473816259
  (0, 12035)	0.11240038759012848
  (0, 11213)	0.13727526853794716
  (0, 11180)	0.21279244550633455
  (0, 10998)	0.1271888633952866
  (0, 9961)	0.08393171529553184
  (0, 9523)	0.0781935204610558
  (0, 9304)	0.0611329144564909
  (0, 9125)	0.09855316232468962
  (0, 8945)	0.110134725510256
  (0, 8783)	0.07770475354170067
  (0, 8401)	0.25687024490856886
  :	:
  (159570, 13681)	0.1856917331435626
  (159570, 13176)	0.13064447891285086
  (159570, 13132)	0.14532014137869506
  (159570, 12898)	0.09701269386576174
  (159570, 11296)	0.16396621423905

In [19]:
test_word_features = word_vectorizer.transform(test_text)

In [20]:
test_word_features

<153164x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 5757134 stored elements in Compressed Sparse Row format>

In [21]:
print(test_word_features)

  (0, 14636)	0.10559436854719087
  (0, 14629)	0.15804705955730877
  (0, 14553)	0.103040092394518
  (0, 14453)	0.05550925491862191
  (0, 14326)	0.2180221661261628
  (0, 14302)	0.1687726759844263
  (0, 14198)	0.060357728750088116
  (0, 13838)	0.1519574809793168
  (0, 13816)	0.076674315683588
  (0, 13551)	0.2220810857621476
  (0, 13330)	0.09304408830413483
  (0, 13292)	0.035125173903489834
  (0, 13256)	0.08052050643244624
  (0, 13174)	0.10101497455311528
  (0, 13137)	0.08045700643608131
  (0, 13120)	0.04278665111862687
  (0, 12103)	0.1906990646991488
  (0, 11944)	0.07145696333289152
  (0, 11910)	0.11466384248896573
  (0, 11482)	0.1484126777127306
  (0, 11440)	0.21793520208326064
  (0, 11296)	0.09281330226925529
  (0, 10251)	0.17865863482969443
  (0, 9304)	0.04999034122728002
  (0, 9095)	0.21700659151998997
  :	:
  (153162, 751)	0.17585355169977065
  (153162, 650)	0.08813325406077673
  (153162, 502)	0.05673016008509393
  (153163, 14645)	0.13567621130724186
  (153163, 14636)	0.1592879630551

In [22]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf = True,
    strip_accents = 'unicode',
    analyzer = 'char',
    ngram_range = (1, 5),
    max_features = 20000
)

In [23]:
char_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [24]:
train_char_features = char_vectorizer.transform(train_text)

In [25]:
train_char_features

<159571x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 124714559 stored elements in Compressed Sparse Row format>

In [26]:
print(train_char_features)

  (0, 19818)	0.059935432870824416
  (0, 19814)	0.016230946089790747
  (0, 19723)	0.0530911896236433
  (0, 19722)	0.04533553135916414
  (0, 19718)	0.03001347176575773
  (0, 19715)	0.04881275162112216
  (0, 19714)	0.0468743831147328
  (0, 19712)	0.0394819350019667
  (0, 19705)	0.03357105888664221
  (0, 19703)	0.029077839290743233
  (0, 19699)	0.02432588803539244
  (0, 19537)	0.028091014964652567
  (0, 19536)	0.026765808451599402
  (0, 19516)	0.05489708592303375
  (0, 19514)	0.042662580811455
  (0, 19513)	0.04083445004605923
  (0, 19505)	0.03311608179063437
  (0, 19480)	0.02206157814588419
  (0, 19388)	0.05036057102944087
  (0, 19384)	0.03491636250341324
  (0, 19383)	0.03438984967426136
  (0, 19352)	0.018373346652018823
  (0, 19345)	0.036716267707461045
  (0, 19344)	0.06094598721631299
  (0, 19342)	0.0547617126794972
  :	:
  (159570, 593)	0.020221075936592565
  (159570, 532)	0.06013971205698886
  (159570, 531)	0.05494854405048095
  (159570, 528)	0.024820007522327242
  (159570, 527)	0.0269

In [27]:
test_char_features = char_vectorizer.transform(test_text)

In [28]:
test_char_features

<153164x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 106315375 stored elements in Compressed Sparse Row format>

In [29]:
print(test_char_features)

  (0, 19864)	0.05191729660595573
  (0, 19863)	0.043715706672218
  (0, 19844)	0.047755481282585124
  (0, 19843)	0.031344254718224746
  (0, 19838)	0.031073042360456864
  (0, 19837)	0.03085090391827725
  (0, 19827)	0.039686052416149725
  (0, 19821)	0.026234343297666735
  (0, 19820)	0.03670048519184597
  (0, 19819)	0.03624493809269632
  (0, 19814)	0.03770519002015135
  (0, 19563)	0.03807898109416327
  (0, 19545)	0.020641896006976097
  (0, 19537)	0.011138187388761076
  (0, 19536)	0.02984068540020253
  (0, 19522)	0.04769682185911593
  (0, 19521)	0.0354602774427016
  (0, 19520)	0.029326095496142894
  (0, 19480)	0.01835759889827229
  (0, 19464)	0.037152467881211126
  (0, 19463)	0.03714405409225756
  (0, 19462)	0.0343372992501615
  (0, 19456)	0.02732072348722788
  (0, 19414)	0.020329536806025846
  (0, 19413)	0.019407398246644073
  :	:
  (153163, 886)	0.02666249682085831
  (153163, 834)	0.020014471427728737
  (153163, 827)	0.05736093672652773
  (153163, 824)	0.04671378528309872
  (153163, 816)	0

In [30]:
train_features = hstack([train_char_features, train_word_features])

In [31]:
train_features

<159571x35000 sparse matrix of type '<class 'numpy.float64'>'
	with 131552938 stored elements in COOrdinate format>

In [32]:
print(train_features)

  (0, 19818)	0.059935432870824416
  (0, 19814)	0.016230946089790747
  (0, 19723)	0.0530911896236433
  (0, 19722)	0.04533553135916414
  (0, 19718)	0.03001347176575773
  (0, 19715)	0.04881275162112216
  (0, 19714)	0.0468743831147328
  (0, 19712)	0.0394819350019667
  (0, 19705)	0.03357105888664221
  (0, 19703)	0.029077839290743233
  (0, 19699)	0.02432588803539244
  (0, 19537)	0.028091014964652567
  (0, 19536)	0.026765808451599402
  (0, 19516)	0.05489708592303375
  (0, 19514)	0.042662580811455
  (0, 19513)	0.04083445004605923
  (0, 19505)	0.03311608179063437
  (0, 19480)	0.02206157814588419
  (0, 19388)	0.05036057102944087
  (0, 19384)	0.03491636250341324
  (0, 19383)	0.03438984967426136
  (0, 19352)	0.018373346652018823
  (0, 19345)	0.036716267707461045
  (0, 19344)	0.06094598721631299
  (0, 19342)	0.0547617126794972
  :	:
  (159570, 33681)	0.1856917331435626
  (159570, 33176)	0.13064447891285086
  (159570, 33132)	0.14532014137869506
  (159570, 32898)	0.09701269386576174
  (159570, 31296)

In [33]:
test_features = hstack([test_char_features, test_word_features])

In [34]:
test_features

<153164x35000 sparse matrix of type '<class 'numpy.float64'>'
	with 112072509 stored elements in COOrdinate format>

In [35]:
print(test_features)

  (0, 19864)	0.05191729660595573
  (0, 19863)	0.043715706672218
  (0, 19844)	0.047755481282585124
  (0, 19843)	0.031344254718224746
  (0, 19838)	0.031073042360456864
  (0, 19837)	0.03085090391827725
  (0, 19827)	0.039686052416149725
  (0, 19821)	0.026234343297666735
  (0, 19820)	0.03670048519184597
  (0, 19819)	0.03624493809269632
  (0, 19814)	0.03770519002015135
  (0, 19563)	0.03807898109416327
  (0, 19545)	0.020641896006976097
  (0, 19537)	0.011138187388761076
  (0, 19536)	0.02984068540020253
  (0, 19522)	0.04769682185911593
  (0, 19521)	0.0354602774427016
  (0, 19520)	0.029326095496142894
  (0, 19480)	0.01835759889827229
  (0, 19464)	0.037152467881211126
  (0, 19463)	0.03714405409225756
  (0, 19462)	0.0343372992501615
  (0, 19456)	0.02732072348722788
  (0, 19414)	0.020329536806025846
  (0, 19413)	0.019407398246644073
  :	:
  (153162, 20751)	0.17585355169977065
  (153162, 20650)	0.08813325406077673
  (153162, 20502)	0.05673016008509393
  (153163, 34645)	0.13567621130724186
  (153163,

In [36]:
losses = []
predictions = {'id': test['id']}

In [37]:
for class_name in tqdm(class_names):
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')
    
    cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    classifier.fit(train_features, train_target)
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

  0%|          | 0/6 [00:00<?, ?it/s]

CV score for class toxic is 0.9781569109707159


 17%|█▋        | 1/6 [02:30<12:33, 150.79s/it]

CV score for class severe_toxic is 0.9885459781862073


 33%|███▎      | 2/6 [05:32<11:04, 166.05s/it]

CV score for class obscene is 0.989949127060258


 50%|█████     | 3/6 [08:01<08:01, 160.52s/it]

CV score for class threat is 0.9889193155217902


 67%|██████▋   | 4/6 [11:58<05:59, 179.72s/it]

CV score for class insult is 0.9824692287617731


 83%|████████▎ | 5/6 [14:30<02:54, 174.19s/it]

CV score for class identity_hate is 0.9827104181540816


100%|██████████| 6/6 [17:39<00:00, 176.53s/it]


In [38]:
print('Total CV score is {}'.format(np.mean(losses)))

Total CV score is 0.9851251631091374


In [39]:
submission = pd.DataFrame.from_dict(predictions)

In [40]:
submission.to_csv('../submission/submission_01.csv', index=False)