# 0. Install dependencies and load the data

In [2]:
!pip install tensorflow pandas matplotlib scikit-learn

Collecting tensorflow
  Using cached tensorflow-2.13.0-cp311-cp311-macosx_10_15_x86_64.whl (216.3 MB)
Collecting pandas
  Using cached pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl (11.6 MB)
Collecting matplotlib
  Using cached matplotlib-3.7.2-cp311-cp311-macosx_10_12_x86_64.whl (7.4 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl (10.1 MB)
Collecting absl-py>=1.0.0
  Using cached absl_py-1.4.0-py3-none-any.whl (126 kB)
Collecting astunparse>=1.6.0
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.1.21
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl (26 kB)
Collecting gast<=0.4.0,>=0.2.1
  Using cached gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting h5py>=2.9.0
  Using cached h5py-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl (3.2 MB)
Collecting libclang>=13.0.0
  Using cached libclang-16.0.6-py2.py3-none-

In [3]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

2023-08-20 17:09:30.389338: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the training CSV (raw_data/train.csv, relative to the notebook) into a DataFrame.
train_csv_path = os.path.join('raw_data', 'train.csv')
df = pd.read_csv(train_csv_path)

In [5]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [14]:
# Show the raw text of one example comment (row position 43).
df['comment_text'].iloc[43]

'FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!'

In [13]:
# Label values for that same row: every column from position 2 onward.
label_columns = df.columns[2:]
df[label_columns].iloc[43]

toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    0
Name: 43, dtype: int64

# 1. Preprocess

In [15]:
from tensorflow.keras.layers import TextVectorization

In [22]:
# List the column names; the columns from position 2 onward are used as labels below.
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [23]:
# Features: the raw comment strings.
# Targets: every column from position 2 onward, as a 2-D NumPy array
# (one row per comment, one column per label).
label_columns = df.columns[2:]
X = df['comment_text']
y = df[label_columns].values

In [26]:
# Upper bound on the vocabulary size (passed to the vectorizer as max_tokens).
MAX_FEATURES = 200_000

In [28]:
# Text -> integer token ids. The vocabulary is capped at MAX_FEATURES entries and
# every comment is padded/truncated to a fixed length of 1800 tokens.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [30]:
# Fit the vectorizer's vocabulary on the raw comment strings.
comment_strings = X.to_numpy()
vectorizer.adapt(comment_strings)

In [34]:
# First ten entries of the vocabulary the vectorizer just learned.
vocabulary = vectorizer.get_vocabulary()
vocabulary[:10]

['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'you', 'i', 'is']

In [37]:
# Sanity check: vectorize a single example sentence.
sample_comment = 'I love you bro!'
vectorizer(sample_comment)

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([  8, 457,   7, ...,   0,   0,   0])>

In [38]:
# Vectorize every comment in a single call. The result is one dense tensor with a row
# per comment and 1800 token ids per row, so it is large (on the order of 2 GB at
# int64 for ~160k comments) -- make sure it fits in memory.
all_comments = X.to_numpy()
vectorized_text = vectorizer(all_comments)

In [40]:
# Check the result's shape: (number of comments, tokens per comment).
vectorized_text.shape

TensorShape([159571, 1800])

In [41]:
# Build the tf.data input pipeline.
# tf.data rule of thumb (MCSHBAP) -> Map - Cache - SHuffle - BAtch - Prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE into a fixed order. By default shuffle() reshuffles every time the
# dataset is iterated, so any take()/skip() partition taken from it would contain
# different examples on each pass and train/val/test would leak into each other.
# reshuffle_each_iteration=False keeps the order stable; the seed makes it
# reproducible across kernel restarts.
dataset = dataset.shuffle(
    160000,  # buffer size; at least the number of rows, so this is a full shuffle
    seed=42,
    reshuffle_each_iteration=False,
)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # helps prevent bottlenecks

In [43]:
# Pull a single (features, labels) batch out of the pipeline as NumPy arrays.
batch_iterator = dataset.as_numpy_iterator()
batch_X, batch_y = next(batch_iterator)

In [46]:
# Feature batch shape: (batch size, tokens per comment).
batch_X.shape

(16, 1800)

In [47]:
# Label batch shape: (batch size, number of label columns).
batch_y.shape

(16, 6)

#### Partition the data

In [51]:
# Split the batched dataset ~70% / 20% / 10% into train / validation / test.
# Sizes are computed once (in batches) and the splits are laid out back to back:
# truncating 0.7, 0.9 and 0.1 of the length independently can leave a batch unused
# between validation and test and drop the final batch, so test simply takes
# everything that remains after train and validation.
# NOTE: take()/skip() only produce disjoint splits if `dataset` yields the same
# order on every iteration; a shuffle that reshuffles each iteration breaks that.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(n_train + n_val)

In [53]:
# Peek at the first batch of the training split.
next(train.as_numpy_iterator())

(array([[     2,    854,    308, ...,      0,      0,      0],
        [   490,   1188,   3229, ...,      0,      0,      0],
        [100078,   2015,   2965, ...,      0,      0,      0],
        ...,
        [ 71222,  22620,   3114, ...,      0,      0,      0],
        [ 36039,   7989,     70, ...,      0,      0,      0],
        [     8,    590,     12, ...,      0,      0,      0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))