In [4]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
import warnings
warnings.filterwarnings('ignore')

#Local libraries
from data_processor_pipeline.cleaner import *
from data_processor_pipeline.custom_dataset import *
from metrics.plot_confusion_matrix import plot_cfs_matrix

# Data_preprocess

## Read data and cleaning

In [5]:
data=pd.read_csv('vietnamese_student_feedbacks.csv', encoding='utf-8-sig')
data_processor=data_clean()
data['sentence'] = data['sentence'].map(lambda x:data_processor.clean_text(x))

In [9]:
from sklearn.model_selection import train_test_split

data['combined_label']=list(zip(data['sentiment'], data['topic']))
X_train, X_test, y_train, y_test = train_test_split(data['sentence'].reset_index(drop=True).to_list(), 
                                                    data['combined_label'].reset_index(drop=True).to_list(), 
                                                    test_size=0.1, 
                                                    stratify=data['topic'],
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [10]:
_, y_train_topic = zip(*y_train)
_, y_val_topic = zip(*y_val)
_, y_test_topic = zip(*y_test)

In [11]:
np.unique(y_train_topic, return_counts=True)

(array([0, 1, 2, 3]), array([9392, 2483,  578,  648], dtype=int64))

## Create dataLoader

In [None]:
train_dataset_topic = Custom_Dataset(X_train, torch.Tensor(y_train_topics, dtype=torch.long), file_path='vocab.pkl', is_save_vocab=False)
val_dataset_topic =Custom_Dataset(X_val, y_val_topic, file_path='vocab.pkl', is_save_vocab=False, max_length=train_dateset_topic.max_length)
test_dataset_topic = Custom_Dataset(X_test, y_test_topic, file_path='vocab.pkl', is_save_vocab=False, max_length=train_dataset_topic.max_length)

In [None]:
train_dataloader_topic=DataLoader(train_dataset_topic, batch_size=16, shuffle=True)
val_dataloader_topic=DataLoader(val_dataset_topic, batch_size=16, shuffle=True)
test_dataloader_topic=DataLoader(test_dataset_topic, batch_size=16, shuffle=True)

# DL_model for topic classification

## RNN

### Training

In [10]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from build_model.build_rnn import *
from torch.utils.tensorboard import SummaryWriter
from training.trainer import *

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_rnn = RNN(vocab_size=5000,
                embedding_dim=128,
                num_layers=1,
                activation=None,
                batch_normalization = True,
                bidirectional=False,
                output_dim=4
               )
criterion = nn.CrossEntropyLoss()
optimizer_RNN=optim.Adam(model_rnn.parameters(), lr=0.01)
epochs=5
trainer=trainer()
log_dir='logs/RNN_topic'
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)
writer=SummaryWriter(log_dir='log_dir')

In [12]:
best_Acc_RNN=0
for epoch in epochs:
    trainer.train(model_rnn, 
                  train_dataloader_topic,
                  epoch,
                  epochs,
                  writer,
                  criterion,
                  optimizer_RNN,
                  device
                 )
    val_loss, val_acc = trainer.validation(model_rnn, val_dataloader_topic, criterion, device)
    print(f'VALIDATION | Epoch: {epoch+1}/{epochs} | Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f}')
    writer.add_scalar('Val/Loss', val_loss, epoch+1)
    writer.add_scalar('Val/Acc', val_acc, epoch+1)
    checkpoint = {
        'model_state_dict': model_rnn.state_dict(),
        'epoch': epoch+1,
        'optimizer_RNN': optimizer_RNN.state_dict()
    }
    os.makedir('model/RNN_topic', exists_ok=True)
    torch.save(checkpoint, os.path.join('model/RNN_topic','last.pt'))
    if val_acc>best_Acc_RNN:
        torch.save(checkpoint, os.path.join('model/RNN_topic','best.pt'))
        best_Acc_RNN=val_acc

RNN(
  (embedding): Embedding(5000, 128)
  (rnn): RNN(128, 64, dropout=0.2)
  (dense1): Linear(in_features=64, out_features=64, bias=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (dense2): Linear(in_features=64, out_features=4, bias=True)
)


In [None]:
from training.trainner import *
from build_model.build_rnn import * 
trainer=trainer()
deveice=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_state_dict=torch.load('model/RNN_topic/best.pt', map_location=device)
model_rnn.load_state_dict(model_state_dict['model_state_dict'])
result_RNN_on_test=trainer.evualuate(model_rnn, test_loader_topic, device=device)