In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

In [2]:
# Read both competition splits from the shared Dataset directory in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Dataset/train.csv', '../Dataset/test.csv')
)

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            153164 non-null  object
 1   comment_text  153164 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [5]:
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Pairwise correlation between the numeric (label) columns only.
# `id` and `comment_text` are object dtype; pandas >= 2.0 no longer drops
# non-numeric columns silently inside `corr()`, so select them out explicitly.
# `select_dtypes` behaves the same on old and new pandas releases.
train.select_dtypes(include='number').corr()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
toxic,1.0,0.308619,0.676515,0.157058,0.647518,0.266009
severe_toxic,0.308619,1.0,0.403014,0.123601,0.375807,0.2016
obscene,0.676515,0.403014,1.0,0.141179,0.741272,0.286867
threat,0.157058,0.123601,0.141179,1.0,0.150022,0.115128
insult,0.647518,0.375807,0.741272,0.150022,1.0,0.337736
identity_hate,0.266009,0.2016,0.286867,0.115128,0.337736,1.0


In [7]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [9]:
# Extract the raw comment strings as arrays, then record the character length
# of every comment. tqdm only wraps the iteration to show progress.
train_comments, test_comments = (
    frame['comment_text'].values for frame in (train, test)
)
train_comments_lengths = list(map(len, tqdm(train_comments)))
test_comments_lengths = list(map(len, tqdm(test_comments)))

100%|██████████| 159571/159571 [00:00<00:00, 1342975.47it/s]
100%|██████████| 153164/153164 [00:00<00:00, 1385210.66it/s]


In [10]:
def explore_comments(arr):
    """Print summary statistics for a collection of comment lengths.

    Args:
        arr: Sequence (list or array) of numeric lengths.

    Prints the maximum, average, minimum and standard deviation (NumPy's
    default, i.e. population std), followed by a "Range" line showing the
    minimum and ``mean + 2 * std`` as a rough upper bound on typical length.
    Prints a short notice and returns early when ``arr`` is empty, instead of
    failing inside ``np.max``.
    """
    lengths = np.asarray(arr)
    if lengths.size == 0:
        print("No comments to explore")
        return

    # Compute each statistic once and reuse it for the "Range" line.
    shortest = np.min(lengths)
    mean = np.average(lengths)
    std = np.std(lengths)

    print("Max-length:", np.max(lengths))
    print("Avg-length:", mean)
    print("Min-length:", shortest)
    print("Standard deviation", std)
    print("Range", shortest, mean + 2 * std)

# Print the same length statistics for each split so they can be compared.
print("---Train---")
explore_comments(train_comments_lengths)

print("---Test---")
# mean + 2*std comes out at roughly 1,550-1,575 characters for both splits,
# so ~1600 looks like a reasonable max sequence length for a char-level RNN.
explore_comments(test_comments_lengths)

---Train---
Max-length: 5000
Avg-length: 394.0732213246768
Min-length: 6
Standard division 590.7184309382144
Range 6 1575.5100832011055
---Test---
Max-length: 5000
Avg-length: 364.8751207855632
Min-length: 1
Standard division 592.4901645516661
Range 1 1549.8554498888955


In [11]:
# Remember how many leading rows belong to train so the stacked data can be
# split back apart later.
nrow_train = len(train)
# Stack train and test comments into one Series (train rows first); any
# missing comment is replaced by the literal string "unknown".
df = (
    pd.concat([train['comment_text'], test['comment_text']], axis=0)
    .fillna("unknown")
)

In [12]:
print(df.shape[0])

312735


In [13]:
df.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [14]:
# TF-IDF bag-of-words over the stacked train+test comments in `df`:
# English stop words are removed and the vocabulary is capped at 50,000 terms.
# NOTE(review): the vocabulary/IDF weights are fitted on test text as well as
# train text — confirm this is intended for the evaluation setup.
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
# `data` is a sparse matrix with one row per comment in `df`.
data = vectorizer.fit_transform(df)

In [15]:
print(data)

  (0, 711)	0.18204445368864222
  (0, 879)	0.20775413789068986
  (0, 605)	0.23993680844003243
  (0, 1318)	0.2235268466647893
  (0, 36305)	0.22284690643148006
  (0, 31306)	0.08188817087124552
  (0, 42143)	0.08743950124331666
  (0, 42515)	0.14963954366164903
  (0, 35873)	0.13319854787482097
  (0, 13540)	0.0913926564059703
  (0, 16014)	0.20891432730918813
  (0, 13498)	0.28591283079888064
  (0, 47786)	0.18517583907832094
  (0, 29407)	0.11534052498326015
  (0, 46065)	0.20985647318821948
  (0, 18058)	0.21674683272909434
  (0, 9077)	0.24494461695137756
  (0, 23677)	0.08616056487030553
  (0, 45415)	0.28033975873604766
  (0, 46648)	0.19787236031077896
  (0, 36385)	0.1437615365077495
  (0, 16224)	0.18397157427013258
  (0, 27413)	0.2690073855580892
  (0, 19760)	0.2380133505872249
  (0, 45208)	0.16979460216797468
  :	:
  (312733, 12331)	0.14836541130115016
  (312733, 28066)	0.1078417187323574
  (312733, 25431)	0.09483013779915266
  (312733, 40694)	0.08764497581028433
  (312733, 39111)	0.09693465616

In [16]:
# The six target labels, in the order they appear in the training frame.
col = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
# Rescale every TF-IDF feature by its maximum absolute value.
X = MaxAbsScaler().fit_transform(data)
# Zero-initialised prediction buffer: one row per test comment, one column
# per label (presumably filled in by a model further down — not shown here).
preds = np.zeros((len(test), len(col)))

In [17]:
print(X)

  (0, 711)	0.29084092782002846
  (0, 879)	0.35152824127566124
  (0, 605)	0.4603353338692267
  (0, 1318)	0.26978455301496723
  (0, 36305)	0.23892339074450406
  (0, 31306)	0.08188817087124552
  (0, 42143)	0.08743950124331666
  (0, 42515)	0.14963954366164903
  (0, 35873)	0.13319854787482097
  (0, 13540)	0.0913926564059703
  (0, 16014)	0.232979260289499
  (0, 13498)	0.41153618372855594
  (0, 47786)	0.2482892800474848
  (0, 29407)	0.11534052498326015
  (0, 46065)	0.23460947709299815
  (0, 18058)	0.21674683272909434
  (0, 9077)	0.37143933583085614
  (0, 23677)	0.08616056487030553
  (0, 45415)	0.30649754459641687
  (0, 46648)	0.2793507839722064
  (0, 36385)	0.1437615365077495
  (0, 16224)	0.2275683259151828
  (0, 27413)	0.39268853302997914
  (0, 19760)	0.3084612023438448
  (0, 45208)	0.1908544860554389
  :	:
  (312733, 12331)	0.14836541130115016
  (312733, 28066)	0.17029198723838934
  (312733, 25431)	0.09483013779915266
  (312733, 40694)	0.1170500282830148
  (312733, 39111)	0.0969346561606953

In [48]:
# number of unique words
# Tokens are produced by splitting each comment on single spaces. They are
# added straight into a set instead of being appended to one huge list that is
# de-duplicated afterwards, which avoids holding every token occurrence in
# memory at once. One loop handles both splits, train first, then test.
total_words = 0
unique_words = set()
for corpus in (train_comments, test_comments):
    for comment in tqdm(corpus):
        words = comment.split(" ")
        total_words += len(words)
        unique_words.update(words)

100%|██████████| 159571/159571 [00:01<00:00, 125723.28it/s]
100%|██████████| 153164/153164 [00:01<00:00, 131313.21it/s]


In [50]:
# Report the counters built in the word-count cell. The running total is
# stored in `total_words`; the earlier `total` name is not defined in any
# visible cell and raised NameError on a fresh kernel.
print("{:,.0f} total words".format(total_words))
print("{:,.0f} unique words".format(len(unique_words)))

20,850,985 total words
1,043,267 unique words


# Embeddings

In [6]:
!ls ../Features/

bad-words.csv              glove.twitter.27B.200d.txt
big.txt                    glove.twitter.27B.25d.txt
cleanwords.txt             glove.twitter.27B.50d.txt
crawl-300d-2M.vec          spellcheck.ipynb
english_words_479k.txt     spellcheckcorrector.ipynb
glove.840B.300d.txt        typo_correction.ipynb
glove.twitter.27B.100d.txt


In [12]:
!ls ..

[34mDataset[m[m          [34mbaselines[m[m        results.txt
[34mEmbeddings[m[m       clean_data.ipynb [34msotoxic[m[m
[34mFeatures[m[m         [34menv[m[m              [34mtools[m[m


In [1]:
import sys
sys.path.append("..")

import sotoxic
from sotoxic.config import dataset_config, model_config
from sotoxic.data_helper import data_loader

In [2]:
# Project-level loader from sotoxic.data_helper; used below to read the
# pre-trained embedding file.
dl = data_loader.DataLoader()

In [3]:
!ls ../Features/

bad-words.csv              glove.twitter.27B.200d.txt
big.txt                    glove.twitter.27B.25d.txt
cleanwords.txt             glove.twitter.27B.50d.txt
crawl-300d-2M.vec          spellcheck.ipynb
english_words_479k.txt     spellcheckcorrector.ipynb
glove.840B.300d.txt        typo_correction.ipynb
glove.twitter.27B.100d.txt


In [4]:
# Load the GloVe 840B / 300d vectors through the project loader. The result is
# indexed by token (e.g. embedding_index['and'] yields a NumPy array).
# The loader prints "Err on [...]" for some lines of the file; presumably those
# entries are skipped — TODO confirm in sotoxic.data_helper.
embedding_index = dl.load_embedding("../Features/glove.840B.300d.txt")

54481it [00:04, 13404.36it/s]Err on  ['.', '.']
129704it [00:09, 12229.42it/s]Err on  ['at', 'name@domain.com']
153377it [00:11, 14315.27it/s]Err on  ['.', '.']
202206it [00:15, 14212.85it/s]Err on  ['to', 'name@domain.com']
212487it [00:16, 14737.02it/s]Err on  ['.', '.']
223141it [00:17, 13473.54it/s]Err on  ['.', '.']
255106it [00:19, 12991.09it/s]Err on  ['email', 'name@domain.com']
368198it [00:27, 13680.47it/s]Err on  ['or', 'name@domain.com']
534556it [00:40, 13132.46it/s]Err on  ['contact', 'name@domain.com']
718663it [00:58, 8588.96it/s]Err on  ['Email', 'name@domain.com']
995488it [01:25, 6032.90it/s]Err on  ['on', 'name@domain.com']
1124470it [01:38, 12424.82it/s]Err on  ['At', 'Killerseats.com']
1150111it [01:41, 8114.20it/s]Err on  ['by', 'name@domain.com']
1353887it [02:05, 10877.11it/s]Err on  ['in', 'mylot.com']
1501292it [02:19, 11466.68it/s]Err on  ['emailing', 'name@domain.com']
1535338it [02:22, 11766.22it/s]Err on  ['Contact', 'name@domain.com']
1902017it [03:01, 1

In [8]:
list(embedding_index.keys())[1:5]

['.', 'the', 'and', 'to']

In [9]:
embedding_index['and']

array([-1.8567e-01,  6.6008e-02, -2.5209e-01, -1.1725e-01,  2.6513e-01,
        6.4908e-02,  1.2291e-01, -9.3979e-02,  2.4321e-02,  2.4926e+00,
       -1.7916e-02, -7.1218e-02, -2.4782e-01, -2.6237e-01, -2.2460e-01,
       -2.1961e-01, -1.2927e-01,  1.0867e+00, -6.6072e-01, -3.1617e-02,
       -5.7328e-02,  5.6903e-02, -2.7939e-01, -3.9825e-01,  1.4251e-01,
       -8.5146e-02, -1.4779e-01,  5.5067e-02, -2.8687e-03, -2.0917e-01,
       -7.0735e-02,  2.2577e-01, -1.5881e-01, -1.0395e-01,  9.7110e-02,
       -5.6251e-01, -3.2929e-01, -2.0853e-01,  9.8711e-03,  4.9777e-02,
        1.4883e-03,  1.5884e-01,  4.2771e-02, -2.6956e-03, -2.4620e-02,
       -1.9213e-01, -2.2556e-01,  1.0838e-01,  9.0086e-02, -1.3291e-01,
        3.2559e-01, -1.7038e-01, -1.0990e-01, -2.3986e-01, -2.4289e-02,
        1.4656e-02, -2.3700e-01,  8.4828e-02, -3.5982e-01, -7.6746e-02,
        4.8909e-02,  1.1431e-01, -2.1013e-01,  2.4765e-01, -1.7531e-02,
       -1.4028e-01,  4.6191e-02,  2.2972e-01,  1.1750e-01,  1.27