In [1]:
! pip install transformers

Defaulting to user installation because normal site-packages is not writeable

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
You should consider upgrading via the 'c:\program files\python38\python.exe -m pip install --upgrade pip' command.





In [38]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from torch import nn
from tqdm.notebook import tqdm
from torch.autograd import Variable 
import matplotlib.pyplot as plt 

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
import time
import transformers

import yaml
from yaml.loader import SafeLoader


In [8]:
train_df = pd.read_csv("./Data/processed/train.csv", sep = "|")
val_df = pd.read_csv("./Data/processed/validation.csv", sep = "|")
test_df = pd.read_csv("./Data/processed/test.csv", sep = "|")

In [9]:
train_df

Unnamed: 0,sentences,sentiments
0,slide giáo_trình đầy_đủ,2
1,nhiệt_tình giảng_dạy gần_gũi với sinh_viên,2
2,đi học đầy_đủ full điểm chuyên cần,0
3,chưa áp_dụng công_nghệ_thông_tin và các thiết_...,0
4,thầy giảng bài hay có nhiều bài_tập ví_dụ ngay...,2
...,...,...
11421,chỉ vì môn game mà em học hai lần mà không qua...,0
11422,em cảm_ơn cô nhiều,2
11423,giao bài_tập quá nhiều,0
11424,giáo_viên dạy dễ hiểu nhiệt_tình,2


In [10]:
import spacy
import collections
from tqdm.notebook import tqdm
import youtokentome as yttm
import os
import json
import datetime
spacy_eng = spacy.load("en_core_web_sm")

class Vocabulary():
  '''Vocabulary class: extract all words from corpus, save all words that appears more than freq_threshold times'''
  def __init__(self, freq_threshold=3):
    self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
    self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    self.freq_threshold = freq_threshold

  def __len__(self):
    return len(self.itos)

  def build_vocab(self, sentence_list):
    # Build the vocab
    idx = 4
    word_counter = collections.Counter([word for sentence in sentence_list for word in sentence])

    for word, count in word_counter.items():
      if count >= self.freq_threshold:
        self.stoi[word] = idx
        self.itos[idx] = word
        idx += 1
    # print(self.stoi)

  def numericalize(self, sent):
    # Turn a sentence into list of word ID
    return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in sent
        ]
  def save_vocab(self, vocab_path):
#     with open(itos_path, 'w') as itos_f:
#         json.dump(self.itos, itos_f)
#     with open(stoi_path, 'w') as stoi_f:
#         json.dump(self.stoi, stoi_f)
    with open(vocab_path, 'w') as f:
        vocab = {'itos': self.itos, 'stoi': self.stoi}
        json.dump(vocab, f)

  def load_vocab(self, vocab_path):
    with open(vocab_path, 'r') as f:
#         print(type(itos_f.read()))
        vocab = json.load(f)
        self.itos = vocab['itos']
        self.stoi = vocab['stoi']
        
    
  

In [11]:
type(train_df['sentiments'][0])

numpy.int64

In [12]:
train_sents = train_df['sentences'].values
train_sents = [sent.split(" ") for sent in train_sents]
train_sents[0:5]

[['slide', 'giáo_trình', 'đầy_đủ'],
 ['nhiệt_tình', 'giảng_dạy', 'gần_gũi', 'với', 'sinh_viên'],
 ['đi', 'học', 'đầy_đủ', 'full', 'điểm', 'chuyên', 'cần'],
 ['chưa',
  'áp_dụng',
  'công_nghệ_thông_tin',
  'và',
  'các',
  'thiết_bị',
  'hỗ_trợ',
  'cho',
  'việc',
  'giảng_dạy'],
 ['thầy',
  'giảng',
  'bài',
  'hay',
  'có',
  'nhiều',
  'bài_tập',
  'ví_dụ',
  'ngay',
  'trên',
  'lớp']]

In [13]:
vocab = Vocabulary(3)
vocab.build_vocab(train_sents)

In [14]:
vocab.stoi

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '<UNK>': 3,
 'slide': 4,
 'giáo_trình': 5,
 'đầy_đủ': 6,
 'nhiệt_tình': 7,
 'giảng_dạy': 8,
 'gần_gũi': 9,
 'với': 10,
 'sinh_viên': 11,
 'đi': 12,
 'học': 13,
 'điểm': 14,
 'chuyên': 15,
 'cần': 16,
 'chưa': 17,
 'áp_dụng': 18,
 'công_nghệ_thông_tin': 19,
 'và': 20,
 'các': 21,
 'thiết_bị': 22,
 'hỗ_trợ': 23,
 'cho': 24,
 'việc': 25,
 'thầy': 26,
 'giảng': 27,
 'bài': 28,
 'hay': 29,
 'có': 30,
 'nhiều': 31,
 'bài_tập': 32,
 'ví_dụ': 33,
 'ngay': 34,
 'trên': 35,
 'lớp': 36,
 'giảng_viên': 37,
 'đảm_bảo': 38,
 'thời_gian': 39,
 'lên': 40,
 'tích_cực': 41,
 'trả_lời': 42,
 'câu_hỏi': 43,
 'của': 44,
 'thường_xuyên': 45,
 'đặt': 46,
 'câu': 47,
 'hỏi': 48,
 'em': 49,
 'sẽ': 50,
 'môn': 51,
 'này': 52,
 'nhưng': 53,
 'học_lại': 54,
 'ở': 55,
 'học_kỳ': 56,
 'thời_lượng': 57,
 'quá': 58,
 'dài': 59,
 'không': 60,
 'tiếp_thu': 61,
 'hiệu_quả': 62,
 'nội_dung': 63,
 'môn_học': 64,
 'phần': 65,
 'thiếu': 66,
 'trọng_tâm': 67,
 'hầu_như': 68,
 'là': 69,

In [137]:
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
import random
import torch.nn.functional as F
seed  = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
  def __init__(self,df, vocab = None, max_sent_len = 120):
    
    self.train_sents = [sent.split(" ") for sent in df['sentences'].values]
    self.labels = df['sentiments'].values
    self.max_sent_len = max_sent_len
    if vocab == None:
        self.vocab = Vocabulary(3)
        self.vocab.build_vocab(self.train_sents)
    else:
        self.vocab = vocab
  def __len__(self):
    return len(self.train_sents)
      
  def __getitem__(self, index):
    # Override __getitem__ method of parent class (torch.utils.data.Dataset class) 
    sent_numericalized = [self.vocab.stoi["<SOS>"]]
    sent_numericalized += self.vocab.numericalize(self.train_sents[index])
    
    if len(sent_numericalized) > self.max_sent_len:
        sent_numericalized = sent_numericalized[:self.max_sent_len]

    sent_tensor = np.full(self.max_sent_len, self.vocab.stoi["<PAD>"])
    sent_tensor[:len(sent_numericalized)] = sent_numericalized
    x = torch.Tensor(sent_tensor).long()
    
    y = self.labels[index]
#     y = torch.Tensor([y]).long()
    return x,y
  def get_dataset(self):
    src = []
    target = []
    for index in range(len(self)):
        x,y = self[index]
        src.append(x)
        
#         print(x.shape, y)
    src = torch.stack(sent, dim = 0)
    return src, tgt

In [138]:
def get_loader(df, vocab = None, max_sent_len = 120,batch_size = 64, num_workers = 0, shuffle = True, pin_memory = True):
    '''
    Function to create DataLoader: group sentences into batches of size batch_size
    
    Params:
      src_data_file, tgt_data_file: path to source and target data file
      max_seq_len: max number of tokens in a sentence
      en_vocab, vi_vocab: object of Vocabulary class. If we already have en_vocab and vi_vocab, we avoid creating them again
      back_translation: if we're using this for back translation, set this to True
      batch_size: number of sentences in a batch
      num_workers, shuffle, pin_memory: not very important
    Returns:
      loader: object of DataLoader class
      dataset: object of StanfordEnViDataset class
      en_vocab, vi_vocab: object of Vocabulary class. This is vocab of English and Vietnamese
    '''
#     dataset = StanfordEnViDataset(src_data_file, tgt_data_file, en_vocab_path, vi_vocab_path, input_type,bpe_en_file, bpe_vi_file, max_seq_len, back_translation)  
    dataset = SentimentDataset(df, vocab = vocab, max_sent_len = max_sent_len)
    vocab = dataset.vocab
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory
    )
    return loader, dataset, vocab

In [139]:
train_loader,_,vocab = get_loader(train_df, vocab = None, max_sent_len = 100)
val_loader,dev_dataset,_ = get_loader(val_df, vocab = vocab, max_sent_len = 100)

In [140]:
dev_dataset.get_dataset()

torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 1
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 1
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 1
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 1
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2


torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 1
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 1
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0


torch.Size([100]) 2
torch.Size([100]) 1
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 1
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 0
torch.Size([100]) 2
torch.Size([100]) 0
torch.Size([100]) 1
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2
torch.Size([100]) 2


TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [18]:
for batch_idx, (src, tgt) in enumerate(train_loader):
    print(src, tgt)
    print(src.shape,tgt.shape)
    break


tensor([[  1, 300,  13,  ...,   0,   0,   0],
        [  1,   3, 149,  ...,   0,   0,   0],
        [  1,  37,  88,  ...,   0,   0,   0],
        ...,
        [  1,  37,  30,  ...,   0,   0,   0],
        [  1, 426, 643,  ...,   0,   0,   0],
        [  1, 206,  88,  ...,   0,   0,   0]]) tensor([0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0,
        0, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2,
        2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2])
torch.Size([64, 100]) torch.Size([64])


In [234]:

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx, device):
        
        super().__init__()
        
        self.device = device
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx).to(self.device)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ]).to(self.device)
#         self.conv_1 = nn.ModuleList( [nn.Conv2d(in_channels = 1, out_channels = 128, kernel_size = 4, padding = 'same'),
#                                       nn.ReLU()]).to(self.device)
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim).to(self.device)
        
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
#         self.log_softmax = F.log_softmax(dim = 1)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
#         print(embedded.shape)
    
#         embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
#         #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
#         pooled = self.conv_1(embedded)
#         print(pooled.shape)
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
        fc = self.relu(self.fc(self.dropout(cat)))
        
        
        proba = F.softmax(fc, dim = 1)
        
        
        return proba

In [235]:
train_loader,_,vocab = get_loader(train_df, vocab = None, max_sent_len = 100)
val_loader,_,_ = get_loader(val_df, vocab = vocab, max_sent_len = 100)

PAD_IDX = vocab.stoi["<PAD>"]
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 256
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 3
DROPOUT = 0.5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX, device = device)
for batch_idx, (src, tgt) in enumerate(train_loader):
    src = src.to(device)
    tgt = tgt.to(device)
    y = model(src)
    print(y.shape)
    break


torch.Size([64, 3])


In [236]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        nn.init.zeros_(m.bias)


In [245]:
def training_step(model, optimizer,src, tgt, criterion):
  """
  Is called every step to train the model
  """

  model.train()

  inp_data = src.to(device)
  target = tgt.to(device)
#   print(inp_data.shape, target.shape)

  # forward prop
  output = model(inp_data)
#   print(output.shape)
#   output = output.reshape(-1, output.shape[2])
#   target = target[:, 1:].reshape(-1)

  optimizer.zero_grad()

  loss = criterion(output, target)
  loss.backward()
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
  optimizer.step()
  lr = optimizer.param_groups[0]["lr"]
#   scheduler.step()
  
  
  
  return loss.item() , lr


In [246]:
def validate(model,dev_loader, criterion, scheduler):
  eval_losses = []
  model.eval()
  with torch.no_grad():
    for idx, (src, tgt) in enumerate(dev_loader):
        inp_data = src.to(device)
        target = tgt.to(device)
          # forward prop
        output = model(inp_data)
        eval_loss = criterion(output, target)
        eval_losses.append(eval_loss.item())
  mean_eval_loss = sum(eval_losses) / len(eval_losses)
  scheduler.step(mean_eval_loss)
  return mean_eval_loss

In [247]:
def save_checkpoint(state, file_name):
    torch.save(state, file_name)

def load_checkpoint(checkpoint_path, model):
    print("=> Loading checkpoint")
    checkpoint = torch.load(checkpoint_path,map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint["state_dict"], strict = False)


In [248]:
class CNN_Classifier():
    def __init__(self, config, device):
      seed  = 42
      torch.manual_seed(seed)
      torch.cuda.manual_seed_all(seed)
      torch.cuda.manual_seed(seed)
      np.random.seed(seed)
      random.seed(seed)
      torch.backends.cudnn.deterministic = True
      torch.backends.cudnn.benchmark = False

      self.config = config
      self.device = device
      self.max_sent_len = self.config['model']['max_sent_len']
      self.train_loader,_,self.vocab = get_loader(train_df, vocab = None, max_sent_len = self.max_sent_len)
      self.dev_loader,self.dev_dataset,_ = get_loader(val_df, vocab = vocab, max_sent_len = self.max_sent_len)
      
      self.PAD_IDX = self.vocab.stoi["<PAD>"]
      self.INPUT_DIM = len(self.vocab)
      self.EMBEDDING_DIM = self.config['model']['embedding_dim']
      self.N_FILTERS = self.config['model']['n_filters']
      self.FILTER_SIZES = self.config['model']['filter_sizes']
      self.OUTPUT_DIM = self.config['model']['output_dim']
      self.DROPOUT = self.config['model']['dropout']
      self.checkpoint_path = self.config['model']['checkpoint_path']
      
      self.learning_rate = self.config['train']['learning_rate']
      self.num_epochs = self.config['train']['num_epochs']
      print(self.num_epochs)
      self.model = CNN(self.INPUT_DIM, self.EMBEDDING_DIM, self.N_FILTERS, self.FILTER_SIZES, self.OUTPUT_DIM, self.DROPOUT, self.PAD_IDX, device = self.device)
      self.model.apply(initialize_weights)

    def train(self):
      optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate, betas = (0.99,0.98))

    #   scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = int(self.num_epochs * len(self.train_loader) * 0.15), 
    #                                                      num_training_steps = self.num_epochs * len(self.train_loader) )
      scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.1, patience = 5)
      criterion = nn.CrossEntropyLoss(ignore_index=self.PAD_IDX)

      # Tensorboard for nice plot

      step = 0
      best_val_loss = 999
      lrs = []
      train_loss = []
      val_loss = []

      start_time = datetime.datetime.now()
      for epoch in range(self.num_epochs):
        print("**************************************************************************    Epoch number {}    **************************************************************************".format(epoch + 1))
        losses = []
        for batch_idx, (src, tgt) in enumerate(tqdm(self.train_loader, position=0, leave=True)):
          step += 1
          loss, lr = training_step(self.model,optimizer,src,tgt,criterion)
          lrs.append(lr)
          losses.append(loss)



          if (step == 1) or step % 50 == 0 or step == len(self.train_loader) * self.num_epochs:
            eval_loss = validate(self.model,self.dev_loader, criterion, scheduler)

            train_loss.append(sum(losses) / len(losses))
            val_loss.append(eval_loss)
            current_time =  datetime.datetime.now()
            time_taken = current_time - start_time
            print("Current step: {0}, epoch: {1}".format(step,epoch + 1), end = "    ")
            print("Training loss: ", sum(losses) / len(losses), end = "    ")
            print("Evaluation loss: ", eval_loss, end = "    " )
            print("Time elapsed: ", time_taken, end = "    ")
#             writer_train.add_scalar('Loss', sum(losses) / len(losses) , step)
#             writer_dev.add_scalar('Loss', eval_loss, step)

            if eval_loss < best_val_loss:
              print("(Best weights saved!) ")
              best_val_loss = eval_loss
              checkpoint = {
                "state_dict" : self.model.state_dict(),
                "optimizer" : optimizer.state_dict(),
                "scheduler" : scheduler.state_dict()}

              # Save the checkpoint with the lowest loss
              save_checkpoint(checkpoint , self.checkpoint_path) 


            else:
              print()
      
    def predict(self,sentences):
        load_checkpoint(self.checkpoint_path, self.model)
        x = []
        for sent in sentences:
#             print(sent)
            sent_numericalized = [self.vocab.stoi["<SOS>"]]
            sent_numericalized += self.vocab.numericalize(sent)

            if len(sent_numericalized) > self.max_sent_len:
                sent_numericalized = sent_numericalized[:self.max_sent_len]

            sent_tensor = np.full(self.max_sent_len, self.vocab.stoi["<PAD>"])
            sent_tensor[:len(sent_numericalized)] = sent_numericalized
            sent_tensor = torch.Tensor(sent_tensor).long()
            x.append(sent_tensor)
#             print(sent_tensor.shape)
        x = torch.stack(x, dim = 0)
        x = x.to(self.device)
        output  = self.model(x)
        classification = torch.argmax(output, dim = 1)
        return classification


In [249]:
with open('config/cnn.yml') as f:
    config = yaml.load(f, Loader=SafeLoader)

print(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cnnClassifier = CNN_Classifier(config, device)
cnnClassifier.train()
# tokenizeTranslator = TransformerTranslator(tk_config, device, input_type = 'tokenization')
# print(len(tokenizeTranslator.vi_vocab), len(tokenizeTranslator.en_vocab))
# # bpeTranslator.inference("My family was not poor , and myself , I had never experienced hunger since 2002.", tensor_input = False)

{'model': {'embedding_dim': 128, 'n_filters': 50, 'filter_sizes': [3, 4], 'output_dim': 3, 'dropout': 0.5, 'max_sent_len': 120, 'checkpoint_path': 'model_checkpoint/cnn.pth.tar'}, 'train': {'learning_rate': 0.0005, 'num_epochs': 10}}
10
**************************************************************************    Epoch number 1    **************************************************************************


  0%|          | 0/179 [00:00<?, ?it/s]

Current step: 1, epoch: 1    Training loss:  1.0683135986328125    Evaluation loss:  0.7915439772605896    Time elapsed:  0:00:00.187512    (Best weights saved!) 
Current step: 50, epoch: 1    Training loss:  0.7739304971694946    Evaluation loss:  0.6346079897880554    Time elapsed:  0:00:00.793894    (Best weights saved!) 
Current step: 100, epoch: 1    Training loss:  0.7019010418653489    Evaluation loss:  0.6341163039207458    Time elapsed:  0:00:01.386289    (Best weights saved!) 
Current step: 150, epoch: 1    Training loss:  0.6782874488830566    Evaluation loss:  0.6334522080421447    Time elapsed:  0:00:01.980704    (Best weights saved!) 
**************************************************************************    Epoch number 2    **************************************************************************


  0%|          | 0/179 [00:00<?, ?it/s]

Current step: 200, epoch: 2    Training loss:  0.6411525862557548    Evaluation loss:  0.6334294843673706    Time elapsed:  0:00:02.611987    (Best weights saved!) 
Current step: 250, epoch: 2    Training loss:  0.6228831653863611    Evaluation loss:  0.6317338633537293    Time elapsed:  0:00:03.236311    (Best weights saved!) 


KeyboardInterrupt: 

In [244]:
classification = cnnClassifier.predict(test_df['sentences'])

classification[100:1000]

=> Loading checkpoint


tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [217]:
test_df

Unnamed: 0,sentences,sentiments
0,giáo_trình chưa cụ_thể,0
1,giảng buồn_ngủ,0
2,giáo_viên vui_tính tận_tâm,2
3,giảng_viên nên giao bài_tập nhiều hơn chia nhó...,0
4,giảng_viên cần giảng bài chi_tiết hơn đi sâu h...,0
...,...,...
1578,hướng_dẫn lab mơ_hồ,0
1579,thầy cho chúng em những bài_tập mang tính thực...,2
1580,thầy không dạy nhiều chủ_yếu cho sinh_viên tự ...,0
1581,em muốn đổi tên môn_học vì tên môn là lập_trìn...,0


In [None]:
test_losses = [] # track loss
num_correct = 0
model = cnnClassifier.model
load_checkpoint(model, config['model']['checkpoint_path'])
model.eval()
# iterate over test data
pred_total = torch.tensor([])
labels_total = torch.tensor([])
for inputs, labels in test_loader:

    inputs = inputs.to(device)
    labels = labels.to(device)
    
    # get predicted outputs
    output = model(inputs)
    
    # calculate loss
    test_loss = criterion(output, labels)
    test_losses.append(test_loss.item())
    
    #convert model output to predicted labels
    out = output.tolist()
    out = np.array(out)
    pred = torch.tensor(out.argmax(axis=1))
    pred_total = torch.cat((pred_total, pred))
    labels_total = torch.cat((labels_total, labels))

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not torch.cuda.is_available() else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

pred = pred.tolist()
labels = labels.tolist()


# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(labels_total, pred_total, target_names=target_names))