In [None]:
!pip install transformers
!pip install datasets

In [None]:
# Colab-only: mount Google Drive at /content/drive; the dataset paths and the
# save path used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pickle 
import numpy as np
import pandas as pd
tf.get_logger().setLevel('ERROR')

# 1. Import Data

In [None]:
# All three dataset CSVs sit in the same Google Drive folder; build each path
# from that shared prefix.
DATA_DIR = '/content/drive/MyDrive/PROJECT/NLP/comment_toxic_data'
train_path, val_path, test_path = (
    f'{DATA_DIR}/{file_name}'
    for file_name in ('jigsaw-toxic-comment-train.csv', 'validation.csv', 'test.csv')
)

In [None]:
# Load each split and keep a seeded 10% random sample of its rows.
SAMPLE_FRAC = 0.1
SAMPLE_SEED = 42


def _read_sample(csv_path):
  """Read the CSV at `csv_path` and return a seeded random sample of its rows."""
  return pd.read_csv(csv_path).sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)


train, val, test = map(_read_sample, (train_path, val_path, test_path))

In [None]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
170259,2ada3066c863097d,:You might like to consider that I don't give ...,1,0,1,0,1,0
92211,f685d54247b735a8,"What the heck are you talking about? I ask, my...",0,0,0,0,0,0
102203,22ed5297921900fb,Uncle Tom House Niggers,1,0,1,0,0,1
153827,a364a9ef7c480da8,"Well, just because you hate the word doesn't m...",0,0,0,0,0,0
97484,09884c47e5720176,source: Places named Scotland,0,0,0,0,0,0


In [None]:
val.head()

Unnamed: 0,id,comment_text,lang,toxic
2215,2215,Grazie del ridicolo. Grazie di una amministr...,it,0
2582,2582,Selam Docbaba. 1978 tarihili Cadılar Bayramı f...,tr,0
1662,1662,ci sono troppi collegamenti inutili in questa ...,it,0
3027,3027,Disculpe usted yo no he borrado ciertas partes...,es,0
4343,4343,"! Scusami, ma ho protetto la pagina in seguito...",it,0


In [None]:
test.head()

Unnamed: 0,id,content,lang
9955,9955,Por quê ao invés de ficar enchendo meu saco co...,pt
6757,6757,"Mil excuses si tu l as mal prise, mon interven...",fr
5228,5228,Dá para pararem de mandar mensagem para mim? P...,pt
60681,60681,Ты идиот!!! Как ты мог удалить мой труд над эт...,ru
3085,3085,"Merhaba, mutlaka bazı gerekçelerle taşımışsın...",tr


In [None]:
train['toxic'].value_counts()

0    20221
1     2134
Name: toxic, dtype: int64

In [None]:
# Drop the identifier, language and fine-grained label columns; the text column
# and the binary `toxic` label are kept.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because the columns have already been removed.
train = train.drop(
    columns=['id', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)
val = val.drop(columns=['id', 'lang'], errors='ignore')

In [None]:
# Raw comment strings per split as plain Python lists
# (the test split stores its text in a column named `content`).
train_texts, val_texts, test_texts = (
    frame[column].to_list()
    for frame, column in ((train, 'comment_text'), (val, 'comment_text'), (test, 'content'))
)

# Binary toxicity labels for the two labelled splits.
train_labels, val_labels = (frame['toxic'].to_list() for frame in (train, val))

# 2. Preprocessing the data

In [None]:
# Upper bound on sequence length the model can embed
# (`max_position_embeddings` in the distilbert-base-uncased config).
MODEL_MAX_POSITIONS = 512

# NOTE(review): lengths are whitespace-separated word counts, used as a proxy for
# token counts; the tokenizer can split one word into several tokens, so the real
# token length of a comment may exceed its word count.
text_lens = [len(text.split()) for text in train_texts]

# 95th percentile of the training lengths, capped so that truncation can never
# produce sequences longer than the model supports.
MAX_LEN = min(int(np.percentile(text_lens, 95)), MODEL_MAX_POSITIONS)
MAX_LEN

223

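Sử dụng DistilBertTokenizer để chuyển đổi từng chuỗi comment thành một đối tượng dạng dict chứa input_ids và attention_mask. MAX_LEN là độ dài tối đa của chuỗi token: câu dài hơn MAX_LEN sẽ bị cắt bớt, còn câu ngắn hơn sẽ được đệm thêm 0 cho bằng câu dài nhất trong cùng một lần gọi tokenizer (padding=True), nên khi chỉ có một câu như ví dụ dưới đây thì không có phần đệm nào được thêm vào.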

In [None]:
# Pretrained checkpoint name, reused later for the tokenizer and the classifier.
MODEL_NAME = 'distilbert-base-uncased'
# First sampled training comment, used as a worked example.
example = train_texts[0]
tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Encode the single example, truncating at MAX_LEN tokens. padding=True pads to
# the longest sequence in the call, so a lone input receives no padding.
inputs = tkzr(example, max_length=MAX_LEN, truncation=True, padding=True)

# Show the raw text next to the token ids and attention mask it was mapped to.
print(f'Comment: \'{example}\'')
print(f'input ids: {inputs["input_ids"]}')
print(f'attention mask: {inputs["attention_mask"]}')

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_tok

Comment: ':You might like to consider that I don't give a shit what you do or think.'
input ids: [101, 1024, 2017, 2453, 2066, 2000, 5136, 2008, 1045, 2123, 1005, 1056, 2507, 1037, 4485, 2054, 2017, 2079, 2030, 2228, 1012, 102]
attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Áp dụng chuyển đổi cho comment trong train và val

In [None]:
def construct_encodings(x, tkzr, max_len, truncation=True, padding=True):
  """Tokenize `x` with `tkzr`, forwarding the length, truncation and padding options.

  Args:
    x: Text or list of texts, handed to the tokenizer unchanged.
    tkzr: Callable tokenizer accepting `max_length`, `truncation` and `padding` keywords.
    max_len: Value forwarded as `max_length`.
    truncation: Value forwarded as `truncation`.
    padding: Value forwarded as `padding`.

  Returns:
    Whatever `tkzr` returns for these arguments.
  """
  tokenizer_options = {
      'max_length': max_len,
      'truncation': truncation,
      'padding': padding,
  }
  return tkzr(x, **tokenizer_options)

# Encode the training and validation comments with the same tokenizer and length limit.
train_encodings, val_encodings = (
    construct_encodings(texts, tkzr, max_len=MAX_LEN)
    for texts in (train_texts, val_texts)
)

Chuyển đổi encodings và labels thành tensorflow Dataset

In [None]:
def construct_tfdataset(encodings, y=None):
  """Wrap tokenizer encodings (and optional labels) in a `tf.data.Dataset`.

  Args:
    encodings: Mapping-like tokenizer output; converted with `dict(...)`.
    y: Optional labels. When given, each element is a `(features, label)` pair;
      otherwise each element is just the features dict.

  Returns:
    An unbatched `tf.data.Dataset` sliced along the first axis of the inputs.
  """
  features = dict(encodings)
  tensors = features if y is None else (features, y)
  return tf.data.Dataset.from_tensor_slices(tensors)

BATCH_SIZE = 32
SHUFFLE_SEED = 42

# Shuffle the training examples before batching (re-shuffled on every epoch by
# default). Keras ignores `fit(shuffle=...)` for tf.data inputs, so without this
# every epoch would see the batches in exactly the same order.
train_tf_ds = (
    construct_tfdataset(train_encodings, train_labels)
    .shuffle(buffer_size=len(train_labels), seed=SHUFFLE_SEED)
    .batch(BATCH_SIZE)
)
# Validation order does not affect the metrics, so it is only batched.
val_tf_ds = construct_tfdataset(val_encodings, val_labels).batch(BATCH_SIZE)

# 3. Fine-tuning the model

Train model sử dụng pretrained distilbert-base-uncased và dùng Adam, SparseCategoricalCrossentropy với epochs=2

In [None]:
N_EPOCHS = 2

# Seed TensorFlow before the model is built so that the newly initialised
# classification head and dropout are more repeatable between runs.
tf.random.set_seed(42)

# Two output classes (0 = non-toxic, 1 = toxic), stated explicitly instead of
# relying on the library default.
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
optimizer = optimizers.Adam(learning_rate=3e-5)
# The model outputs raw logits and the labels are integer class ids.
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy'])
model.summary()

In [None]:
# Both datasets are already batched with BATCH_SIZE, so `batch_size` is not passed:
# Keras documents that it must not be specified for tf.data.Dataset inputs.
history = model.fit(train_tf_ds,
                    epochs=N_EPOCHS,
                    validation_data=val_tf_ds)

Epoch 1/2
Epoch 2/2


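Model có dấu hiệu overfitting (loss trên tập validation cao) do dữ liệu đào tạo ít. Có thể cải thiện bằng cách thêm dữ liệu đào tạo và tăng số epochs. Lưu ý: tập validation gồm các comment không phải tiếng Anh (it, tr, es, ...) trong khi distilbert-base-uncased là mô hình tiếng Anh, nên sự khác biệt ngôn ngữ cũng có thể là một nguyên nhân khiến kết quả trên tập validation kém.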

In [None]:
# val_tf_ds is already batched, so `batch_size` is not passed (Keras documents
# that it must not be specified for tf.data.Dataset inputs).
benchmarks = model.evaluate(val_tf_ds, return_dict=True)
print(benchmarks)

{'loss': 1.3199385404586792, 'accuracy': 0.8199999928474426}


# 4. Using the fine-tuned model to predict new samples

In [None]:
# Helper used to classify new comments
def create_prediction(model, model_name, max_len, non_toxic_threshold=0.6):
  """Build a function that classifies one comment as toxic or non-toxic.

  Args:
    model: Sequence-classification model whose `predict` output exposes `.logits`
      with two classes (index 0 = non-toxic, index 1 = toxic).
    model_name: Name or path given to `DistilBertTokenizer.from_pretrained`.
    max_len: Maximum token length used when encoding the text.
    non_toxic_threshold: Minimum softmax probability of class 0 required to
      label a comment 'None-toxic'; anything below is labelled 'Toxic'.

  Returns:
    `predict_prob(text)`, which prints the text, the predicted label and the
    probability the model assigns to that label. It returns None.
  """
  tkzr = DistilBertTokenizer.from_pretrained(model_name)
  def predict_prob(text):
    # Encode the single comment as a batch of one.
    encodings = construct_encodings([text], tkzr, max_len=max_len)
    tfdataset = construct_tfdataset(encodings).batch(1)
    logits = model.predict(tfdataset).logits
    probs = activations.softmax(tf.convert_to_tensor(logits)).numpy()
    non_toxic_prob, toxic_prob = probs[0][0], probs[0][1]
    is_toxic = non_toxic_prob < non_toxic_threshold
    print(f'Text: \'{text}\'')
    if is_toxic:
      print('Label: Toxic')
    else:
      print('Label: None-toxic')
    # Report the probability of the label that was printed; previously the
    # non-toxic probability was shown even when the label was 'Toxic'.
    print(f'Probability: {toxic_prob if is_toxic else non_toxic_prob}')
  return predict_prob

# Predictor bound to the model fine-tuned in this session and the tokenizer/length settings above.
clf = create_prediction(model, MODEL_NAME, MAX_LEN)

In [None]:
clf('I hate you')

Text: 'I hate you'
Label: Toxic
Probability: 0.007574939634650946


In [None]:
clf(test_texts[0])

Text: 'Por quê ao invés de ficar enchendo meu saco com divisão de referências, tu não faz algo útil, como corrigir as referências, como eu fiz ontem ao encontrar um artigo com mais de 50 referências com aviso de erro, passei mais de 2 horas corrigindo e ainda falta mais de 30. Quem sabe assim tu deixa de ser um inútil cyber chato. Get a life!!! Johnnyboytoy (discussão) '
Label: None-toxic
Probability: 0.999397873878479


# 5. Saving and loading the model for future use

In [None]:
# Google Drive folder that receives the fine-tuned model and its metadata.
save_path = '/content/drive/MyDrive/PROJECT/NLP/saved_model/comment_with_bert_transformers/'
# Save the model (config and weights) under the `clf` subfolder
model.save_pretrained(save_path+'clf')
# Save the model metadata: the pretrained checkpoint name and the max sequence length
with open(save_path+'info.pkl', 'wb') as f:
    pickle.dump((MODEL_NAME, MAX_LEN), f)

Configuration saved in /content/drive/MyDrive/PROJECT/NLP/saved_model/comment_with_bert_transformers/clf/config.json
Model weights saved in /content/drive/MyDrive/PROJECT/NLP/saved_model/comment_with_bert_transformers/clf/tf_model.h5


In [None]:
# Reload the fine-tuned weights saved under `clf`.
new_model = TFDistilBertForSequenceClassification.from_pretrained(save_path+'clf')
# Read back the (model name, max length) tuple. The `with` block closes the file
# handle, which the previous bare `open(...)` call left open.
# NOTE: pickle.load can execute arbitrary code; only load an info.pkl written by this notebook.
with open(save_path+'info.pkl', 'rb') as f:
    model_name, max_len = pickle.load(f)
clf = create_prediction(new_model, model_name, max_len)

In [None]:
clf('i love my dog')

Text: 'i love my dog'
Label: None-toxic
Probability: 0.9743092656135559
