In [36]:
import os
import pandas as pd

In [37]:
def load_data_from_path(folder_path):
    examples = []
    for label in os.listdir(folder_path):
        full_path = os.path.join(folder_path, label)
        for file_name in os.listdir(full_path):
            file_path = os.path.join(full_path, file_name)
            with open(file_path, "r", encoding='utf-8') as f:
                lines = f.readlines()
            sentence = " ".join(lines)
            if label == "neg":
                label = 0
            if label == "pos":
                label = 1
            data = {
                "sentence": sentence,
                "label": label
            }
            examples.append(data)
    return pd.DataFrame(examples)

In [38]:
folder_paths = {
    'train': './data/ntc-scv/data_train/train',
    'val': './data/ntc-scv/data_train/test',
    'test': './data/ntc-scv/data_test/test'
}

train_df = load_data_from_path(folder_paths['train'])
val_df = load_data_from_path(folder_paths['val'])
test_df = load_data_from_path(folder_paths['test'])

In [39]:
from langid.langid import LanguageIdentifier, model
def identify_vn(df):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    not_vi_idx = set()
    THRESHOLD = 0.9
    for idx, row in df.iterrows():
        score = identifier.classify(row['sentence'])
        if score[0] != "vi" or (score[0] == "vi" and score[1] <= THRESHOLD):
            not_vi_idx.add(idx)
    vi_df = df[~df.index.isin(not_vi_idx)]
    not_vi_df = df[df.index.isin(not_vi_idx)]
    return vi_df, not_vi_df

train_df_vi, train_df_other = identify_vn(train_df)

In [40]:
import re
import string

def preprocess_text(text):
    # remove URLs https://www.
    url_pattern = re.compile(r'https?://\s+\wwww\.\s+')
    text = url_pattern.sub(r" ", text)

    # remove HTML Tags: <>
    html_pattern = re.compile(r'<[^<>]+>')
    text = html_pattern.sub(" ", text)

    # remove puncs and digits
    replace_chars = list(string.punctuation + string.digits)
    for char in replace_chars:
        text = text.replace(char, " ")

    # remove emoji
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F1F2-\U0001F1F4"  # Macau flag
        u"\U0001F1E6-\U0001F1FF"  # flags
        u"\U0001F600-\U0001F64F"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U0001F1F2"
        u"\U0001F1F4"
        u"\U0001F620"
        u"\u200d"
        u"\u2640-\u2642"
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r" ", text)

    # normalize whitespace
    text = " ".join(text.split())     

    return text.lower()

train_df_vi['preprocess_sentence'] = [
    preprocess_text(row['sentence']) for index, row in train_df_vi.iterrows()
]
val_df['preprocess_sentence'] = [
    preprocess_text(row['sentence']) for index, row in val_df.iterrows()
]
test_df         ['preprocess_sentence'] = [
    preprocess_text(row['sentence']) for index, row in test_df.iterrows()
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_vi['preprocess_sentence'] = [


In [41]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

def yield_tokens(sentences, tokenizer):
    for sentence in sentences:
        yield tokenizer(sentence)

from torchtext.vocab import build_vocab_from_iterator

vocab_size = 10000
vocabulary = build_vocab_from_iterator(
    yield_tokens(train_df_vi['preprocess_sentence'], tokenizer),
    max_tokens = vocab_size,
    specials = ["<pad>", "<unk>"]
)

vocabulary.set_default_index(vocabulary["<unk>"])

from torchtext.data.functional import to_map_style_dataset
def prepare_dataset(df):
    for index, row in df.iterrows():
        sentence = row['preprocess_sentence']
        encoded_sentence = vocabulary(tokenizer(sentence))
        label = row['label']
        yield encoded_sentence, label

train_dataset = to_map_style_dataset(prepare_dataset(train_df_vi))
val_dataset = to_map_style_dataset(prepare_dataset(val_df))


In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    encoded_sentences, labels = [], []
    for encoded_sentence, label in batch:
        labels.append(label)
        encoded_sentence = torch.tensor(encoded_sentence, dtype = torch.int64)
        encoded_sentences.append(encoded_sentence)

    labels = torch.tensor(labels, dtype = torch.int64)
    encoded_sentences = pad_sequence(
        encoded_sentences,
        padding_value=vocabulary["<pad>"]
    )

    return encoded_sentences, labels

batch_size = 128
train_loader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn = collate_batch
)

val_loader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    shuffle = False,
    collate_fn = collate_batch
)

In [43]:
class TextCNN(nn.Module):
    def __init__(
        self,
        vocab_size, embedding_dim, kernel_sizes, num_filters, num_classes):
        super(TextCNN, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.kernel_sizes = kernel_sizes
        self.num_filters = num_filters
        self.num_classes = num_classes
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv = nn.ModuleList([
            nn.Conv1d(
                in_channels = embedding_dim,
                out_channels = num_filters,
                kernel_size = k,
                stride = 1
            ) for k in kernel_sizes])
        self.fc = nn.Linear(len(kernel_sizes)*num_filters, num_classes)

    def forward(self, x):
        batch_size, sequence_length = x.shape
        x = self.embedding(x.T).transpose(1, 2)
        x = [F.relu(conv(x)) for conv in self.conv]
        x = [F.max_pool1d(c, c.size(-1)).squeeze(dim = -1) for c in x]
        x = torch.cat(x, dim=1)
        x = self.fc(x)
        return x

In [44]:
import time

def train(model, optimizer, criterion, train_dataloader, device, epoch=0, log_interval=50):
    model.train()
    total_acc, total_count = 0, 0
    losses = []
    start_time = time.time()

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        predictions = model(inputs)

        loss = criterion(predictions, labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        total_acc += (predictions.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print (
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc, epoch_loss

In [45]:
def evaluate(model, criterion, valid_dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    losses = []

    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(valid_dataloader):
            inputs, labels = inputs.to(device), labels.to(device)

            predictions = model(inputs)

            loss = criterion(predictions, labels)
            losses.append(loss.item())

            total_acc += (predictions.argmax(1) == labels).sum().item()
            total_count += labels.size(0)

    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc, epoch_loss
        

In [46]:
num_classes = 2
vocab_size = len(vocabulary)
embedding_dim = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TextCNN(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    kernel_sizes = [3, 4, 5],
    num_filters = 100,
    num_classes = 2
)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

num_epochs = 10
save_model = './model'

train_accs, train_losses = [], []
eval_accs, eval_losses = [], []
best_loss_eval = 100

for epoch in range(1, num_epochs+1):
    epoch_start_time = time.time()

    train_acc, train_loss = train(model, optimizer, criterion, train_loader, device, epoch)
    train_accs.append(train_acc)
    train_losses.append(train_loss)

    eval_acc, eval_loss = evaluate(model, criterion, val_loader)
    eval_accs.append(eval_acc)
    eval_losses.append(eval_loss)

    if eval_loss < best_loss_eval:
        torch.save(model.state_dict(), save_model + '/text_cnn_model.pt')

    print("-" * 59)
    print(
        "| End of epoch {:3d} | Time : {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f} "
        "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, train_acc, train_loss, eval_acc, eval_loss
        )
    )
    print("-" * 59)

    model.load_state_dict(torch.load(save_model + '/text_cnn_model.pt', weights_only=True))
    model.eval()

| epoch   1 |    50/  233 batches | accuracy    0.750
| epoch   1 |   100/  233 batches | accuracy    0.846
| epoch   1 |   150/  233 batches | accuracy    0.863
| epoch   1 |   200/  233 batches | accuracy    0.873
-----------------------------------------------------------
| End of epoch   1 | Time : 114.61s | Train Accuracy    0.873 | Train Loss    0.378 | Valid Accuracy    0.879 | Valid Loss    0.302 
-----------------------------------------------------------
| epoch   2 |    50/  233 batches | accuracy    0.920
| epoch   2 |   100/  233 batches | accuracy    0.913
| epoch   2 |   150/  233 batches | accuracy    0.918
| epoch   2 |   200/  233 batches | accuracy    0.916
-----------------------------------------------------------
| End of epoch   2 | Time : 110.65s | Train Accuracy    0.919 | Train Loss    0.226 | Valid Accuracy    0.872 | Valid Loss    0.303 
-----------------------------------------------------------
| epoch   3 |    50/  233 batches | accuracy    0.965
| epoch 

In [None]:
test_dataset = to_map_style_dataset(prepare_dataset(test_df))

test_loader = DataLoader(
    test_dataset,
    batch_size = batch_size,
    shuffle = False,
    collate_fn = collate_batch
)

test_acc, test_loss = evaluate(model, criterion, test_loader)
test_acc, test_loss


(0.8901, 0.37715408536075035)

: 