In [1]:
! pip install transformers

Defaulting to user installation because normal site-packages is not writeable

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
You should consider upgrading via the 'c:\program files\python38\python.exe -m pip install --upgrade pip' command.





In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from torch import nn
from tqdm.notebook import tqdm
from torch.autograd import Variable 
import matplotlib.pyplot as plt 

# Seed every RNG used in this notebook (Python, NumPy, PyTorch CPU and CUDA)
# with the same value and ask cuDNN for deterministic kernels so that runs
# are repeatable.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
import time
import transformers




In [2]:
# Load the train / validation / test splits; the files are pipe-delimited.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the CSVs were written with their index - index_col=0 would drop it
# (confirm nothing downstream relies on that column).
train_df = pd.read_csv("./Data/processed/train.csv", sep = "|")
val_df = pd.read_csv("./Data/processed/validation.csv", sep = "|")
test_df = pd.read_csv("./Data/processed/test.csv", sep = "|")

In [3]:
# Display the training frame (columns: sentences, sentiments).
train_df

Unnamed: 0,sentences,sentiments
0,slide giáo_trình đầy_đủ,2
1,nhiệt_tình giảng_dạy gần_gũi với sinh_viên,2
2,đi học đầy_đủ full điểm chuyên cần,0
3,chưa áp_dụng công_nghệ_thông_tin và các thiết_...,0
4,thầy giảng bài hay có nhiều bài_tập ví_dụ ngay...,2
...,...,...
11421,chỉ vì môn game mà em học hai lần mà không qua...,0
11422,em cảm_ơn cô nhiều,2
11423,giao bài_tập quá nhiều,0
11424,giáo_viên dạy dễ hiểu nhiệt_tình,2


In [4]:
import spacy
import collections
from tqdm.notebook import tqdm
import youtokentome as yttm
import os
import json
import datetime
# Load spaCy's small English pipeline.
# NOTE(review): spacy_eng is not referenced by any cell visible here and the
# sentences are split on spaces instead - confirm it is still needed.
spacy_eng = spacy.load("en_core_web_sm")

class Vocabulary():
    """Word-level vocabulary mapping tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Every other word receives an id only if it
    occurs at least ``freq_threshold`` times in the corpus passed to
    :meth:`build_vocab`.

    Attributes:
        itos: dict mapping int id -> token string.
        stoi: dict mapping token string -> int id.
        freq_threshold: minimum number of occurrences for a word to be kept.
    """

    def __init__(self, freq_threshold=3):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of ids in the vocabulary (special tokens included)."""
        return len(self.itos)

    def build_vocab(self, sentence_list):
        """Add every sufficiently frequent word of the corpus to the vocabulary.

        Args:
            sentence_list: iterable of tokenised sentences (each an iterable
                of token strings).
        """
        word_counter = collections.Counter(
            word for sentence in sentence_list for word in sentence
        )

        # Continue after the highest id already assigned (4 for a fresh
        # vocabulary) so that a repeated call never overwrites existing ids.
        idx = len(self.itos)
        for word, count in word_counter.items():
            if count < self.freq_threshold:
                continue
            # Tokens that already have an id - in particular the reserved
            # specials appearing literally in the corpus - keep that id.
            if word in self.stoi:
                continue
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

    def numericalize(self, sent):
        """Map a tokenised sentence to ids; unknown tokens become ``<UNK>``.

        Args:
            sent: iterable of token strings.

        Returns:
            list of int ids, one per token.
        """
        unk_id = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_id) for token in sent]

    def save_vocab(self, vocab_path):
        """Write both mappings to ``vocab_path`` as a single JSON object.

        JSON object keys are always strings, so the integer keys of ``itos``
        are stored as strings; :meth:`load_vocab` converts them back.
        """
        with open(vocab_path, 'w', encoding='utf-8') as f:
            vocab = {'itos': self.itos, 'stoi': self.stoi}
            json.dump(vocab, f)

    def load_vocab(self, vocab_path):
        """Replace both mappings with the ones stored at ``vocab_path``.

        Args:
            vocab_path: path of a file written by :meth:`save_vocab`.
        """
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        # Restore integer keys: without this, itos[4] raises KeyError after a
        # save/load round trip because the JSON keys come back as "4".
        self.itos = {int(idx): token for idx, token in vocab['itos'].items()}
        self.stoi = vocab['stoi']
        
    
  

In [6]:
# Check the Python type of a single label value.
type(train_df['sentiments'][0])

numpy.int64

In [7]:
# Tokenise the training sentences by splitting on single spaces, then show
# the first five token lists.
train_sents = train_df['sentences'].values
train_sents = [sent.split(" ") for sent in train_sents]
train_sents[0:5]

[['slide', 'giáo_trình', 'đầy_đủ'],
 ['nhiệt_tình', 'giảng_dạy', 'gần_gũi', 'với', 'sinh_viên'],
 ['đi', 'học', 'đầy_đủ', 'full', 'điểm', 'chuyên', 'cần'],
 ['chưa',
  'áp_dụng',
  'công_nghệ_thông_tin',
  'và',
  'các',
  'thiết_bị',
  'hỗ_trợ',
  'cho',
  'việc',
  'giảng_dạy'],
 ['thầy',
  'giảng',
  'bài',
  'hay',
  'có',
  'nhiều',
  'bài_tập',
  'ví_dụ',
  'ngay',
  'trên',
  'lớp']]

In [8]:
# Build the vocabulary from the training tokens, keeping only words that
# occur at least 3 times.
vocab = Vocabulary(3)
vocab.build_vocab(train_sents)

In [9]:
# Inspect the token -> id mapping (this prints the whole dictionary).
vocab.stoi

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '<UNK>': 3,
 'slide': 4,
 'giáo_trình': 5,
 'đầy_đủ': 6,
 'nhiệt_tình': 7,
 'giảng_dạy': 8,
 'gần_gũi': 9,
 'với': 10,
 'sinh_viên': 11,
 'đi': 12,
 'học': 13,
 'điểm': 14,
 'chuyên': 15,
 'cần': 16,
 'chưa': 17,
 'áp_dụng': 18,
 'công_nghệ_thông_tin': 19,
 'và': 20,
 'các': 21,
 'thiết_bị': 22,
 'hỗ_trợ': 23,
 'cho': 24,
 'việc': 25,
 'thầy': 26,
 'giảng': 27,
 'bài': 28,
 'hay': 29,
 'có': 30,
 'nhiều': 31,
 'bài_tập': 32,
 'ví_dụ': 33,
 'ngay': 34,
 'trên': 35,
 'lớp': 36,
 'giảng_viên': 37,
 'đảm_bảo': 38,
 'thời_gian': 39,
 'lên': 40,
 'tích_cực': 41,
 'trả_lời': 42,
 'câu_hỏi': 43,
 'của': 44,
 'thường_xuyên': 45,
 'đặt': 46,
 'câu': 47,
 'hỏi': 48,
 'em': 49,
 'sẽ': 50,
 'môn': 51,
 'này': 52,
 'nhưng': 53,
 'học_lại': 54,
 'ở': 55,
 'học_kỳ': 56,
 'thời_lượng': 57,
 'quá': 58,
 'dài': 59,
 'không': 60,
 'tiếp_thu': 61,
 'hiệu_quả': 62,
 'nội_dung': 63,
 'môn_học': 64,
 'phần': 65,
 'thiếu': 66,
 'trọng_tâm': 67,
 'hầu_như': 68,
 'là': 69,

In [16]:
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
import random
import torch.nn.functional as F
# Re-seed all RNGs (PyTorch CPU, every CUDA device, NumPy, Python) and force
# deterministic cuDNN behaviour; benchmark mode is disabled so cuDNN does not
# auto-select kernels.
seed  = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
    """Dataset yielding ``(token_ids, label)`` pairs from a sentiment DataFrame.

    Each sentence is split on single spaces, prefixed with ``<SOS>``, mapped
    to ids through the vocabulary, then truncated / right-padded with
    ``<PAD>`` to exactly ``max_sent_len`` ids. No ``<EOS>`` token is appended.

    Args:
        df: DataFrame with a ``sentences`` column (space-separated tokens)
            and a ``sentiments`` column (labels).
        vocab: vocabulary object exposing ``stoi`` and ``numericalize``.
            When ``None`` a new ``Vocabulary(3)`` is built from ``df``.
        max_sent_len: fixed length of every returned id sequence.
    """

    def __init__(self, df, vocab = None, max_sent_len = 120):
        self.train_sents = [sent.split(" ") for sent in df['sentences'].values]
        self.labels = df['sentiments'].values
        self.max_sent_len = max_sent_len
        # Identity check: `== None` would dispatch to a custom __eq__ if the
        # vocabulary object ever defined one.
        if vocab is None:
            self.vocab = Vocabulary(3)
            self.vocab.build_vocab(self.train_sents)
        else:
            self.vocab = vocab

    def __len__(self):
        """Return the number of sentences."""
        return len(self.train_sents)

    def __getitem__(self, index):
        """Return ``(x, y)`` for the sentence at ``index``.

        Returns:
            x: int64 tensor of shape ``(max_sent_len,)`` holding the token ids.
            y: the label stored at ``index`` in the ``sentiments`` column.
        """
        token_ids = [self.vocab.stoi["<SOS>"]]
        token_ids += self.vocab.numericalize(self.train_sents[index])
        # Slicing is a no-op for sentences that already fit.
        token_ids = token_ids[:self.max_sent_len]

        # Build the padded buffer directly as int64. The previous
        # torch.Tensor(...).long() route went through float32, which cannot
        # represent every integer above 2**24 exactly, and np.full without a
        # dtype is platform dependent.
        padded = np.full(self.max_sent_len, self.vocab.stoi["<PAD>"], dtype=np.int64)
        padded[:len(token_ids)] = token_ids
        x = torch.from_numpy(padded)

        y = self.labels[index]
        return x, y

In [17]:
def get_loader(df, vocab = None, max_sent_len = 120,batch_size = 64, num_workers = 0, shuffle = True, pin_memory = True):
    '''
    Wrap a sentiment DataFrame in a SentimentDataset and a DataLoader.

    Params:
      df: DataFrame passed to SentimentDataset (it reads the 'sentences' and
          'sentiments' columns)
      vocab: existing vocabulary to reuse; when None, SentimentDataset builds
          a new one from df
      max_sent_len: fixed number of token ids per sentence (longer sentences
          are truncated, shorter ones padded)
      batch_size: number of sentences in a batch
      num_workers, shuffle, pin_memory: forwarded unchanged to DataLoader
    Returns:
      loader: object of DataLoader class
      dataset: object of SentimentDataset class
      vocab: the vocabulary used by the dataset (the one passed in, or the
          newly built one)
    '''
    dataset = SentimentDataset(df, vocab = vocab, max_sent_len = max_sent_len)
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory
    )
    return loader, dataset, dataset.vocab

In [18]:
# Build the training loader (which also builds the vocabulary from the
# training sentences), then a validation loader that reuses that vocabulary.
# Every sentence is truncated / padded to 100 token ids.
# NOTE(review): val_loader inherits get_loader's default shuffle=True.
train_loader,_,vocab = get_loader(train_df, vocab = None, max_sent_len = 100)
val_loader,_,_ = get_loader(val_df, vocab = vocab, max_sent_len = 100)

In [13]:
# Peek at the first batch only: print the id tensor, the labels and their shapes.
for batch_idx, (src, tgt) in enumerate(train_loader):
    print(src, tgt)
    print(src.shape,tgt.shape)
    break


tensor([[  1, 300,  13,  ...,   0,   0,   0],
        [  1,   3, 149,  ...,   0,   0,   0],
        [  1,  37,  88,  ...,   0,   0,   0],
        ...,
        [  1,  37,  30,  ...,   0,   0,   0],
        [  1, 426, 643,  ...,   0,   0,   0],
        [  1, 206,  88,  ...,   0,   0,   0]]) tensor([0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0,
        0, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2,
        2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2])
torch.Size([64, 100]) torch.Size([64])


In [19]:

class CNN(nn.Module):
    """Convolutional text classifier with parallel filter widths.

    Token ids are embedded, convolved with one ``Conv2d`` per entry of
    ``filter_sizes`` (each kernel spans the full embedding dimension),
    max-pooled over the sequence axis, concatenated, passed through dropout
    and projected to ``output_dim`` classes.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: number of output channels of every convolution.
        filter_sizes: iterable of kernel heights (tokens covered per filter).
        output_dim: number of classes.
        dropout: dropout probability applied to the pooled features.
        pad_idx: id of the padding token (passed as ``padding_idx``).
        device: device the parameterised layers are moved to.

    Note:
        ``forward`` returns log-probabilities (``log_softmax``), so it pairs
        with ``nn.NLLLoss``; ``nn.CrossEntropyLoss`` would apply
        ``log_softmax`` a second time. NOTE(review): the training loop is not
        visible here - confirm which loss is used.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx, device):

        super().__init__()

        self.device = device
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx).to(self.device)

        # One convolution per filter size; the kernel width equals the
        # embedding size, so each filter slides along the sentence only.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels = 1,
                      out_channels = n_filters,
                      kernel_size = (fs, embedding_dim))
            for fs in filter_sizes
        ]).to(self.device)

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim).to(self.device)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute class log-probabilities for a batch of id sequences.

        Args:
            text: integer tensor of shape [batch size, sent len]; sent len
                must be at least ``max(filter_sizes)``.

        Returns:
            Tensor of shape [batch size, output_dim] with log-probabilities.
        """
        # [batch size, sent len] -> [batch size, 1, sent len, emb dim]
        embedded = self.embedding(text).unsqueeze(1)

        # each: [batch size, n_filters, sent len - filter_sizes[n] + 1]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]

        # each: [batch size, n_filters]
        pooled = [F.max_pool1d(feat, feat.shape[2]).squeeze(2) for feat in conved]

        # Dropout is applied exactly once. It used to be applied here and
        # again on the input of self.fc, which compounds the drop rate.
        # cat: [batch size, n_filters * len(filter_sizes)]
        cat = self.dropout(torch.cat(pooled, dim = 1))

        logits = self.fc(cat)

        return F.log_softmax(logits, dim = 1)

In [21]:
# Rebuild the loaders so this cell does not depend on the earlier loader cell.
train_loader,_,vocab = get_loader(train_df, vocab = None, max_sent_len = 100)
val_loader,_,_ = get_loader(val_df, vocab = vocab, max_sent_len = 100)

# Model hyperparameters.
PAD_IDX = vocab.stoi["<PAD>"]
INPUT_DIM = len(vocab)  # embedding rows = vocabulary size (special tokens included)
EMBEDDING_DIM = 256
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 3
DROPOUT = 0.5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX, device = device)
# Smoke test: push a single batch through the model and print the output shape.
for batch_idx, (src, tgt) in enumerate(train_loader):
    src = src.to(device)
    tgt = tgt.to(device)
    y = model(src)
    print(y.shape)
    break


torch.Size([64, 3])
