In [111]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [113]:
df=pd.read_csv('./data/toxic_comment_train.csv.zip',compression='zip')

In [115]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [119]:
targets=['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [121]:
df[targets].sum(axis='rows')

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [123]:
df[targets].sum(axis='rows')/len(df)

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

In [None]:
df['bad_message']=df[targets].any(axis='columns')
df['bad_message'].value_counts()/len(df)

False    0.898321
True     0.101679
Name: bad_message, dtype: float64

In [104]:
from keras.preprocessing.text import Tokenizer

In [105]:
tokenizer=Tokenizer()

In [106]:
tokenizer.fit_on_texts(df['comment_text'])

In [107]:
len(tokenizer.word_index)

AttributeError: 'Tokenizer' object has no attribute 'word_index'

In [108]:
threshold=3
vocab_size=len([el for el in tokenizer.word_counts.items() if el[1] > threshold])

In [109]:
vocab_size

20357

In [110]:
tokenizer=Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['comment_text'])

In [112]:
len(tokenizer.word_index)

210337

In [114]:
sequences=np.array(tokenizer.texts_to_sequences(df['comment_text']))

In [116]:
longest_sequence=max([len(seq) for seq in sequences])

In [118]:
longest_sequence

1400

In [120]:
max_index=max([max(seq) for seq in sequences if len(seq) >1])

In [122]:
max_index

20356

In [124]:
data=df[['bad_message']].copy()
data['seq']=sequences

KeyError: "['bad_message'] not in index"

In [125]:
data.head()

Unnamed: 0,bad_message,seq
0,False,"[688, 75, 1, 126, 130, 177, 29, 672, 4511, 120..."
1,False,"[52, 2635, 13, 555, 3809, 73, 4556, 2706, 21, ..."
2,False,"[412, 437, 73, 134, 14, 249, 2, 71, 314, 78, 5..."
3,False,"[57, 7, 228, 97, 54, 328, 1436, 15, 2133, 7, 6..."
4,False,"[6, 1677, 19, 29, 3516, 54, 1069, 6, 579, 39, ..."


In [126]:
from sklearn.model_selection import train_test_split

In [127]:
data_train,data_test=train_test_split(data,test_size=0.3,random_state=0,stratify=data['bad_message'])

In [128]:
data_train_good_messages=data_train[data_train['bad_message']==False].copy()
data_train_bad_messages=data_train[data_train['bad_message']==True].copy()
data_test_good_messages=data_test[data_test['bad_message']==False].copy()
data_test_bad_messages=data_test[data_test['bad_message']==True].copy()

In [129]:
from keras.preprocessing.sequence import pad_sequences
#Randomly eliminate some elements in the sequence
def random_eliminate(sequences,max_drop=6):
    seqs=[]
    for sequence in sequences:
        r=range(len(sequence))
        keep=max(1,len(r)-np.random.randint(0,max_drop))
        try:
            keep_idx=sorted(np.random.choice(r,keep,replace=False))
            new_sequence=[sequence[k] for k in keep_idx]
        except:
            new_sequence=sequence
            seqs.append(new_sequence)
            return seqs

In [141]:
def batch_generator(good_seq,bad_seq,batch_size=156,random_drop=True):
    half_batch=batch_size//2
    if half_batch > min(len(good_seq),len(bad_seq)):
        raise_exception("Choose a smaller batch size")
    while True:
        good_batch=good_seq.sample(half_batch).values.copy()
        bad_batch=bad_seq.sample(half_batch).values.copy()
        
        if random_drop:
            good_batch=random_eliminate(good_batch)
            bad_batch=random_eliminate(bad_batch)
            
        combined_seq=np.hstack([good_batch,bad_batch])
        X=pad_sequences(combined_seq)
        y=half_batch* [False] + half_batch* [True]  
        
        yield(X,y)
            

In [142]:
batch_size=256
train_gen=batch_generator(data_train_good_messages['seq'],
                          data_train_bad_messages['seq'],
                          batch_size=batch_size)

In [143]:
from keras.models import Sequential
from keras.layers import Embedding,Dense,GRU
from keras.optimizers import Adam


In [144]:
model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=16))
model.add(GRU(32,dropout=0.15,recurrent_dropout=0.15))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer=Adam(lr=0.01),loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 16)          325712    
_________________________________________________________________
gru_6 (GRU)                  (None, 32)                4704      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 330,449
Trainable params: 330,449
Non-trainable params: 0
_________________________________________________________________


In [146]:
for X_val, y_val in batch_generator(
    data_test_good_messages['seq'],
    data_test_bad_messages['seq'],
    batch_size=1024,
    random_drop=False):
Break

In [148]:
h=model.fit_generator(train_gen,
steps_per_epoch=len(data_train_bad_messages)/batch_size,
epochs=10,
verbose=1,
validation_data=(X_val, y_val))

AttributeError: 'bool' object has no attribute 'ndim'