In [2]:
# The great import
import pandas as pd
import numpy as np

In [3]:
df_train = pd.read_csv('../input/vihsd/train.csv')
df_test = pd.read_csv('../input/vihsd/test.csv')
df_valid = pd.read_csv('../input/vihsd/dev.csv')

In [4]:
df_train.head()

Unnamed: 0,free_text,label_id
0,Em được làm fan cứng luôn rồi nè ❤️ reaction q...,0
1,Đúng là bọn mắt híp lò xo thụt :))) bên việt n...,2
2,Đậu Văn Cường giờ giống thằng sida hơn à,0
3,CÔN ĐỒ CỤC SÚC VÔ NHÂN TÍNH ĐỀ NGHI VN. NHÀ NƯ...,2
4,Từ lý thuyết đến thực hành là cả 1 câu chuyện ...,0


# Data cleaning

### Remove na values

In [5]:
df_train.isna().sum()

free_text    2
label_id     0
dtype: int64

In [6]:
df_train[df_train['free_text'].isna()]

Unnamed: 0,free_text,label_id
10950,,1
20880,,0


In [7]:
# Just drop it 
df_train = df_train.dropna(subset=['free_text'])

### Clean out the emoji

In [8]:
import re

df_train['free_text'] = df_train['free_text'].apply(lambda x: re.sub(r'[^\w\s#@/:%.,_-]', '', x))

In [9]:
df_train['free_text'].head()

0    Em được làm fan cứng luôn rồi nè  reaction quá...
1    Đúng là bọn mắt híp lò xo thụt : bên việt nam ...
2             Đậu Văn Cường giờ giống thằng sida hơn à
3    CÔN ĐỒ CỤC SÚC VÔ NHÂN TÍNH ĐỀ NGHI VN. NHÀ NƯ...
4    Từ lý thuyết đến thực hành là cả 1 câu chuyện ...
Name: free_text, dtype: object

### Standardize the vietnamese text

In [10]:
# lowercase all 
df_train['free_text'] = df_train['free_text'].apply(lambda x: x.lower())

In [11]:
# separate punctuation from words
df_train['free_text'] = df_train['free_text'].apply(lambda x: re.sub(r'(?<=[^\s])\s*([^\w\s])', r' \1', x))

I choose not to remove punctuations in this case as it may represent a sentence structure that as a whole shapes an offensive or not sentence. Thus removing punctuation may disrupt the natural structure of the text and impact downstream classifying tasks

In [12]:
df_train.head()

Unnamed: 0,free_text,label_id
0,em được làm fan cứng luôn rồi nè reaction quá...,0
1,đúng là bọn mắt híp lò xo thụt : bên việt nam ...,2
2,đậu văn cường giờ giống thằng sida hơn à,0
3,côn đồ cục súc vô nhân tính đề nghi vn . nhà n...,2
4,từ lý thuyết đến thực hành là cả 1 câu chuyện ...,0


In [13]:
# optional, turn bad worlds into its original form
# form the bad words dictionaries
bad_words_txt = '../vn_offensive_words.txt'
bad_words_dict = {}
with open(bad_words_txt, 'r') as f:
    bad_words = f.read().splitlines()
    origin = ""

    for sent in bad_words:
        temp = sent.split(' ')

        if (len(temp) > 1 and temp[0] == '#'):
            origin = ' '.join(temp[1:])
            continue
        
        if (origin != ""):
            bad_words_dict[sent] = origin

In [14]:
# sorry for the bad words :(
bad_words_dict

{'buồi': 'buồi',
 'buoi': 'buồi',
 'dau buoi': 'buồi',
 'daubuoi': 'buồi',
 'caidaubuoi': 'buồi',
 'nhucaidaubuoi': 'buồi',
 'dau boi': 'buồi',
 'bòi': 'buồi',
 'dauboi': 'buồi',
 'caidauboi': 'buồi',
 'đầu bòy': 'buồi',
 'đầu bùi': 'buồi',
 'dau boy': 'buồi',
 'dauboy': 'buồi',
 'caidauboy': 'buồi',
 'b`': 'buồi',
 'cặc': 'cặc',
 'cak': 'cặc',
 'kak': 'cặc',
 'kac': 'cặc',
 'cac': 'cặc',
 'concak': 'cặc',
 'nungcak': 'cặc',
 'bucak': 'cặc',
 'caiconcac': 'cặc',
 'caiconcak': 'cặc',
 'cu': 'cặc',
 'cặk': 'cặc',
 'dái': 'cặc',
 'giái': 'cặc',
 'zái': 'cặc',
 'kiu': 'cặc',
 'cứt': 'cứt',
 'cuccut': 'cứt',
 'cutcut': 'cứt',
 'cứk': 'cứt',
 'cuk': 'cứt',
 'cười ỉa': 'cứt',
 'cười ẻ': 'cứt',
 'đéo': 'đéo',
 'đếch': 'đéo',
 'đếk': 'đéo',
 'dek': 'đéo',
 'đết': 'đéo',
 'đệt': 'địt',
 'đách': 'đéo',
 'dech': 'đéo',
 "đ'": 'đéo',
 'deo': 'đéo',
 "d'": 'đéo',
 'đel': 'đéo',
 'đél': 'đéo',
 'del': 'đéo',
 'dell ngửi': 'đéo',
 'dell ngui': 'đéo',
 'dell chịu': 'đéo',
 'dell chiu': 'đéo',
 'dell hi

In [15]:
# replace all bad words variants with its original form
def replace_bad_words(text):
    for bad, origin in bad_words_dict.items():
        text = text.replace(bad, origin)
    return text

The function might be useful later

### Check output distribution

In [16]:
df_train['label_id'].value_counts()

label_id
0    19885
2     2556
1     1605
Name: count, dtype: int64

We have the following label:
*   0: non-offensive
*   1: Offensive
*   2: Hate 

We see here the data is imbalance.
1 and 2 are similar, differ only at its level of hate. Thus as 0s outnumber the other 2 labels, we shall merge 1 and 2

In [17]:
df_train['label_id'] = df_train['label_id'].apply(lambda x: 1 if x in [1, 2] else x)
df_train['label_id'].value_counts()

label_id
0    19885
1     4161
Name: count, dtype: int64

yet the data is still imbalance. We shall counter it with the choice of metrics later

### Apply the same processing step for test and valid

In [18]:
# drop na
df_test = df_test.dropna(subset=['free_text'])
df_valid = df_valid.dropna(subset=['free_text'])

# clean the emoji 
df_valid['free_text'] = df_valid['free_text'].apply(lambda x: re.sub(r'[^\w\s#@/:%.,_-]', '', x))
df_test['free_text'] = df_test['free_text'].apply(lambda x: re.sub(r'[^\w\s#@/:%.,_-]', '', x))

# standardize the text
# lowercase all
df_valid['free_text'] = df_valid['free_text'].apply(lambda x: x.lower())
df_test['free_text'] = df_test['free_text'].apply(lambda x: x.lower())

# separate punctuation from words
df_valid['free_text'] = df_valid['free_text'].apply(lambda x: re.sub(r'(?<=[^\s])\s*([^\w\s])', r' \1', x))
df_test['free_text'] = df_test['free_text'].apply(lambda x: re.sub(r'(?<=[^\s])\s*([^\w\s])', r' \1', x))

In [19]:
# Merge the 1 and 2 labels
df_valid['label_id'] = df_valid['label_id'].apply(lambda x: 1 if x in [1, 2] else x)
df_test['label_id'] = df_test['label_id'].apply(lambda x: 1 if x in [1, 2] else x)

In [20]:
df_valid['label_id'].value_counts()

label_id
0    2190
1     482
Name: count, dtype: int64

# Model

### Benchmark : Bag-of-words with logistic regression

We use this basic model as a simple benchmark for out task

In [21]:
# Apply k fold
from sklearn.model_selection import StratifiedKFold

df_train['kfold'] = -1

df_train = df_train.sample(frac=1).reset_index(drop=True)

y = df_train['label_id'].values

kf = StratifiedKFold(n_splits=5)
for f, (t_, v_) in enumerate(kf.split(X=df_train, y=y)):
    df_train.loc[v_, 'kfold'] = f

df_train['kfold'].value_counts()

kfold
0    4810
1    4809
2    4809
3    4809
4    4809
Name: count, dtype: int64

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from pyvi import ViTokenizer

for fold_ in range(5):
    train_df = df_train[df_train.kfold != fold_]
    valid_df = df_train[df_train.kfold == fold_]
    
    vectorizer = CountVectorizer(tokenizer=ViTokenizer.tokenize)
    vectorizer.fit(train_df['free_text'])
    
    x_train = vectorizer.transform(train_df['free_text'])
    x_valid = vectorizer.transform(valid_df['free_text'])

    y_train = train_df['label_id']
    y_valid = valid_df['label_id']
    
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # threshold currently 0.5
    preds = model.predict(x_valid)
    print(f'Fold {fold_}')
    print(accuracy_score(y_valid, preds))
    print(classification_report(y_valid, preds))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 0
0.8401247401247401
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      3977
           1       0.66      0.16      0.25       833

    accuracy                           0.84      4810
   macro avg       0.76      0.57      0.58      4810
weighted avg       0.82      0.84      0.80      4810



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1
0.8380120607194843
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      3977
           1       0.63      0.15      0.25       832

    accuracy                           0.84      4809
   macro avg       0.74      0.57      0.58      4809
weighted avg       0.81      0.84      0.79      4809



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2
0.8342690788105636
              precision    recall  f1-score   support

           0       0.85      0.97      0.91      3977
           1       0.57      0.17      0.26       832

    accuracy                           0.83      4809
   macro avg       0.71      0.57      0.58      4809
weighted avg       0.80      0.83      0.79      4809



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3
0.834477022249948
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      3977
           1       0.58      0.15      0.24       832

    accuracy                           0.83      4809
   macro avg       0.71      0.56      0.57      4809
weighted avg       0.80      0.83      0.79      4809

Fold 4
0.8348929091287169
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      3977
           1       0.58      0.16      0.25       832

    accuracy                           0.83      4809
   macro avg       0.72      0.57      0.58      4809
weighted avg       0.80      0.83      0.79      4809



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Say in this problem, I think we focus on recall better, as the cost of missing some hateful comments is high (may affect children). Our recall is already very high.

Average accuracy : ~0.84

Average F1 : ~0.91

Average precision : ~0.85

Average Recall : ~0.97

#### Use n-gram


Usually bad word phrases in Vietnamese goes in pair or group of 3, I have an intuition that using n-gram with bag-of-words can be useful. 

It is also easy to implement

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from pyvi import ViTokenizer

for fold_ in range(5):
    train_df = df_train[df_train.kfold != fold_]
    valid_df = df_train[df_train.kfold == fold_]

    vectorizer = CountVectorizer(tokenizer=ViTokenizer.tokenize, ngram_range=(1, 3))
    vectorizer.fit(train_df['free_text'])

    x_train = vectorizer.transform(train_df['free_text'])
    x_valid = vectorizer.transform(valid_df['free_text'])

    y_train = train_df['label_id']
    y_valid = valid_df['label_id']

    model = LogisticRegression()
    model.fit(x_train, y_train)

    # threshold currently 0.5
    preds = model.predict(x_valid)
    print(f'Fold {fold_}')
    print(accuracy_score(y_valid, preds))
    print(classification_report(y_valid, preds))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 0
0.8866943866943867
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      3977
           1       0.73      0.55      0.63       833

    accuracy                           0.89      4810
   macro avg       0.82      0.75      0.78      4810
weighted avg       0.88      0.89      0.88      4810



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1
0.8956123934289874
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      3977
           1       0.75      0.59      0.66       832

    accuracy                           0.90      4809
   macro avg       0.83      0.78      0.80      4809
weighted avg       0.89      0.90      0.89      4809



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2
0.8837596173840715
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      3977
           1       0.72      0.54      0.62       832

    accuracy                           0.88      4809
   macro avg       0.81      0.75      0.77      4809
weighted avg       0.88      0.88      0.88      4809



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3
0.8808484092326887
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3977
           1       0.70      0.55      0.61       832

    accuracy                           0.88      4809
   macro avg       0.80      0.75      0.77      4809
weighted avg       0.87      0.88      0.88      4809

Fold 4
0.8862549386566854
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3977
           1       0.71      0.57      0.63       832

    accuracy                           0.89      4809
   macro avg       0.81      0.76      0.78      4809
weighted avg       0.88      0.89      0.88      4809



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Quite an improvement for only a little change.  


Average accuracy : ~0.88

Average F1 : ~0.93

Average precision : ~0.91

Average Recall : ~0.95

### Stack-LSTM

#### init model

In [27]:
import torch


class LSTM(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = torch.nn.LSTM(
            input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(
            0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(
            0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return

#### Utils for training

In [None]:
def save_checkpoint(model, optimizer, epoch, loss, filename="checkpoint.pth"):
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss
    }
    torch.save(checkpoint, filename)

def load_checkpoint(model, optimizer, filename="checkpoint.pth"):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    return model, optimizer, epoch, loss

In [29]:
from tqdm import tqdm

def train(model, optimizer, loss_fn, train_loader, valid_loader, epochs=5, file_name="checkpoint.pth"):
    train_losses = []
    valid_losses = [] 
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        pbar = tqdm(train_loader, desc=f'Epoch {epoch}')
        for data in pbar:
            x, y = data
            
            optimizer.zero_grad()
            y_pred = model(x)
            
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()


            train_loss += loss.item()
            pbar.set_postfix({'Train Loss': train_loss / len(train_loader)})
            train_losses.append(train_loss / len(train_loader))
        
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for data in valid_loader:
                x, y = data
                y_pred = model(x)
                loss = loss_fn(y_pred, y)
                valid_loss += loss.item()
            valid_losses.append(valid_loss / len(valid_loader))
        
        print(f'Epoch {epoch}, Train Loss: {train_loss / len(train_loader)}, Valid Loss: {valid_loss / len(valid_loader)}')

        save_checkpoint(model, optimizer, epoch, loss, filename=file_name)

        return train_losses, valid_losses

    

#### Preprocess 

In [None]:
MAX_LENGTH = 150
pad_token = "<pad>"
unk_token = "<unk>"
def pad_tokens(tokens):
    if (len(tokens) >= MAX_LENGTH):
        return tokens[:MAX_LENGTH]
    else:
        return tokens + [pad_token] * (MAX_LENGTH - len(tokens))

In [38]:
from pyvi import ViTokenizer
from torchtext.vocab import build_vocab_from_iterator


def yield_tokens(df_series):
    for text in df_series:
        yield ViTokenizer.tokenize(text)

vocab = build_vocab_from_iterator(yield_tokens(df_train['free_text']), specials=[pad_token, unk_token])




OSError: dlopen(/Users/trungpham/Public/Real-time-Hate-speech-detection/.venv/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so, 0x0006): Symbol not found: __ZN3c105ErrorC1ENSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEES7_PKv
  Referenced from: <5436ECC1-6F45-386E-B542-D5F76A22B52C> /Users/trungpham/Public/Real-time-Hate-speech-detection/.venv/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so
  Expected in:     <69A84A04-EB16-3227-9FED-383D2FE98E93> /Users/trungpham/Public/Real-time-Hate-speech-detection/.venv/lib/python3.11/site-packages/torch/lib/libc10.dylib

#### Create train vs valid loader

In [39]:
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row['free_text']
        padded_tokens = pad_tokens(ViTokenizer.tokenize(text))
        ids = torch.tensor(vocab.lookup_indices(padded_tokens))
        y = row['label_id']
        return ids, y

In [40]:
train_ds = TextDataset(df_train, vocab)
valid_ds = TextDataset(df_valid, vocab)

NameError: name 'vocab' is not defined

In [None]:
BATCH_SIZE = 64
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE,
                      shuffle=True, num_workers=8, pin_memory=True)
val_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE*2,
                    num_workers=8, pin_memory=True)

#### The training process

In [30]:
model = LSTM(input_size=1, hidden_size=128, num_layers=2, num_classes=1)
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train(model, optimizer, loss_fn, train_dl, val_dl, epochs=5, file_name="checkpoint.pth")