In [6]:
# Import library
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

In [2]:
# Absolute location of the CSV corpus on the author's machine.
# NOTE(review): no visible cell reads `path_data`; the Dataset class below
# builds its own path from a directory plus '/dataset.csv'.
path_data = "/home/tiendat/AI-LAB/NLP/pytorch/nn/dataset.csv"

In [4]:
class Dataset:
    """Corpus helper backed by ``<path>/dataset.csv``.

    The CSV must provide a ``text_final`` column (sentences whose tokens are
    separated by single spaces) and a ``label`` column. Everything derived
    from the file (sentences, vocabulary, frequencies, sampling tables) is
    computed lazily and cached on the instance.
    """

    def __init__(self, path=None, tablesize=1000000):
        """Store the dataset directory and the size of the sampling table.

        Args:
            path: directory containing ``dataset.csv``; a falsy value selects
                the hard-coded default directory.
            tablesize: number of slots in the table built by ``sampleTable``.
        """
        if not path:
            path = "/home/tiendat/AI-LAB/NLP/pytorch/nn"

        self.path = path
        self.tablesize = tablesize

    def _read_frame(self):
        """Read ``dataset.csv`` once and reuse the DataFrame afterwards."""
        if getattr(self, "_frame", None) is None:
            self._frame = pd.read_csv(self.path + '/dataset.csv')
        return self._frame

    def _build_vocab(self):
        """Build and cache the token index, reverse index and frequencies.

        Sets ``_tokens`` (word -> id), ``_revtokens`` (id -> word, sorted
        alphabetically with "UNK" appended last), ``_tokenfreq`` (word ->
        occurrence count) and ``_wordcount`` (total tokens, plus one for
        "UNK"). Prints the vocabulary size without "UNK".
        """
        tokenfreq = dict()
        wordcount = 0
        for sentence in self.sentences():
            for w in sentence:
                wordcount += 1
                # Count every occurrence; the sampling and rejection
                # probabilities below depend on real frequencies.
                tokenfreq[w] = tokenfreq.get(w, 0) + 1

        revtokens = sorted(tokenfreq)
        print(len(revtokens))
        tokens = {word: i for i, word in enumerate(revtokens)}

        # "UNK" always gets the last id and a nominal frequency of one.
        tokens["UNK"] = len(revtokens)
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1
        wordcount += 1

        self._tokens = tokens
        self._tokenfreq = tokenfreq
        self._wordcount = wordcount
        self._revtokens = revtokens

    def tokens(self):
        """Return the word -> id mapping, building the vocabulary if needed."""
        if not getattr(self, "_tokens", None):
            self._build_vocab()
        return self._tokens

    def revtokens(self):
        """Return the id -> word list, building the vocabulary if needed."""
        if not getattr(self, "_revtokens", None):
            self._build_vocab()
        return self._revtokens

    def label(self):
        """Return the ``label`` column of the CSV as a list (cached)."""
        if hasattr(self, "_label") and self._label:
            return self._label

        self._label = self._read_frame()["label"].to_list()
        return self._label

    def sentences(self):
        """Return every sentence as a list of tokens (cached).

        Also caches per-sentence lengths (``_sentlengths``) and their
        cumulative sum (``_cumsentlen``).
        """
        if hasattr(self, "_sentences") and self._sentences:
            return self._sentences

        all_text = self._read_frame()['text_final'].to_list()
        sentences = [sent.split(' ') for sent in all_text]

        self._sentences = sentences
        self._sentlengths = np.array([len(s) for s in sentences])
        self._cumsentlen = np.cumsum(self._sentlengths)

        return self._sentences

    def numSentences(self):
        """Return the number of sentences in the corpus (cached)."""
        if hasattr(self, "_numSentences") and self._numSentences:
            return self._numSentences
        else:
            self._numSentences = len(self.sentences())
            return self._numSentences

    def getRandomContext(self, C=5):
        """Pick a random center word and up to ``C`` neighbours on each side.

        Neighbours equal to the center word are dropped; if nothing is left
        the draw is repeated (recursively) until a non-empty context is found.

        Returns:
            (centerword, context) where ``context`` is a non-empty list.
        """
        allsent = self.sentences()
        sentID = random.randint(0, len(allsent) - 1)
        sent = allsent[sentID]
        wordID = random.randint(0, len(sent) - 1)

        context = sent[max(0, wordID - C):wordID]
        if wordID+1 < len(sent):
            context += sent[wordID+1:min(len(sent), wordID + C + 1)]

        centerword = sent[wordID]
        context = [w for w in context if w != centerword]

        if len(context) > 0:
            return centerword, context
        else:
            return self.getRandomContext(C)

    def getDevSentences(self):
        """Return the split with index 2 (see ``getSplitSentences``)."""
        return self.getSplitSentences(2)

    def getTestSentences(self):
        """Return the split with index 1 (see ``getSplitSentences``)."""
        return self.getSplitSentences(1)

    def getTrainSentences(self):
        """Return the split with index 0 (see ``getSplitSentences``)."""
        return self.getSplitSentences(0)

    def getSplitSentences(self, split=0):
        """Return ``(sentence, category)`` pairs for the requested split.

        NOTE(review): ``dataset_split``, ``categorify`` and ``sent_labels``
        are not defined on this class in the visible source, so this raises
        AttributeError unless they are supplied elsewhere (e.g. a subclass).
        """
        ds_split = self.dataset_split()
        return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]

    def sampleTable(self):
        """Return a table of ``tablesize`` token ids for frequency-based sampling.

        Each token occupies a share of the table proportional to its
        frequency raised to the power 0.75, so drawing a uniform random slot
        samples tokens from that smoothed distribution.
        """
        if hasattr(self, '_sampleTable') and self._sampleTable is not None:
            return self._sampleTable

        # tokens() guarantees _revtokens and _tokenfreq are populated.
        nTokens = len(self.tokens())
        samplingFreq = np.array(
            [1.0 * self._tokenfreq.get(w, 0.0) for w in self._revtokens[:nTokens]]
        )
        # Reweigh
        samplingFreq = samplingFreq ** 0.75

        samplingFreq /= np.sum(samplingFreq)
        samplingFreq = np.cumsum(samplingFreq) * self.tablesize

        # Slot i belongs to the first token whose cumulative boundary is >= i.
        slots = np.searchsorted(samplingFreq, np.arange(self.tablesize), side="left")
        # Guard against floating-point rounding pushing the last slots past the end.
        slots = np.minimum(slots, nTokens - 1)
        self._sampleTable = slots.tolist()

        return self._sampleTable

    def rejectProb(self):
        """Return per-token sub-sampling (rejection) probabilities.

        For a token with frequency ``f`` the probability is
        ``max(0, 1 - sqrt(threshold / f))`` with
        ``threshold = 1e-5 * total word count``.
        """
        if hasattr(self, '_rejectProb') and self._rejectProb is not None:
            return self._rejectProb

        # Build the vocabulary first so _wordcount/_tokenfreq exist.
        nTokens = len(self.tokens())
        threshold = 1e-5 * self._wordcount

        rejectProb = np.zeros((nTokens,))
        for i in range(nTokens):
            w = self._revtokens[i]
            freq = 1.0 * self._tokenfreq[w]
            # Reweigh
            rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))

        self._rejectProb = rejectProb
        return self._rejectProb

    def sampleTokenIdx(self):
        """Draw one token id from the sampling table."""
        return self.sampleTable()[random.randint(0, self.tablesize-1)]

In [7]:
# Dataset() with no argument falls back to the hard-coded default directory.
dataset = Dataset()

# word -> id mapping; building it prints the vocabulary size (without "UNK").
tokenizer = dataset.tokens()
# list of sentences, each a list of space-separated tokens
all_sentences = dataset.sentences()
# id -> word list (alphabetical, with "UNK" appended last)
vocab_word = dataset.revtokens()

3744


In [8]:
# Load the pre-trained embedding text file into memory, one string per line.
# The encoding is given explicitly so the read does not depend on the platform
# default. NOTE(review): UTF-8 is assumed for this Vietnamese corpus — confirm.
with open('/home/tiendat/Downloads/word2vec_vi_syllables_100dims.txt', encoding="utf-8") as f:
    lines = f.readlines()
len(lines)

979461

In [9]:
def word_vectors(lines, vocab):
    """Extract embedding vectors for the words in ``vocab``.

    Args:
        lines: sequence of text lines in word2vec text format. The first line
            is treated as a header and skipped; every other line is
            ``word v1 v2 ... vn`` separated by single spaces.
        vocab: iterable of words to look up. It is not modified.

    Returns:
        (vocab_ebedded, vocab_copy): a dict mapping each found word to a
        float32 NumPy vector, and the list of vocabulary words (in their
        original order) for which no vector was found. When a word occurs on
        several lines, the first occurrence wins.
    """
    vocab_words = list(vocab)
    # A set makes the per-line membership test O(1) instead of scanning a list.
    pending = set(vocab_words)
    vocab_ebedded = {}
    for line in lines[1:]:
        # Drop the line terminator so it cannot leak into the last value.
        fields = line.rstrip("\r\n").split(' ')
        word = fields[0]
        if word in pending:
            # Skip empty fields produced by a trailing space.
            values = [value for value in fields[1:] if value]
            vocab_ebedded[word] = np.array(values, dtype='float32')
            pending.discard(word)

    vocab_copy = [word for word in vocab_words if word not in vocab_ebedded]
    return vocab_ebedded, vocab_copy

In [10]:
vocab_ebedded, vocab_copy = word_vectors(lines,vocab_word)

In [15]:
# Bring every sentence to exactly `max_length` tokens: longer sentences are
# truncated, shorter ones are right-padded with `pad_token`. A sentence that is
# already exactly `max_length` long is kept unchanged (previously it fell
# through both branches and became an empty list).
max_length = 64
pad_token = "UNK"
all_sentences_padded = [
    sent[:max_length] + [pad_token] * max(0, max_length - len(sent))
    for sent in all_sentences
]

# all_sentences_padded

In [23]:
# Replace every token of every padded sentence by its embedding vector.
# Words without a pre-trained vector fall back to the "UNK" vector.
unk_vector = vocab_ebedded.get("UNK")

embd_sent = []
for sent in all_sentences_padded:
    word_embd = []
    for word in sent:
        vector = vocab_ebedded.get(word, unk_vector)
        if vector is None:
            # Neither the word nor the fallback has a vector: fail loudly
            # instead of hiding the problem behind a bare `except`.
            raise KeyError(f"no embedding for {word!r} and no 'UNK' fallback vector")
        word_embd.append(vector)

    # One (sentence_length, embedding_dim) matrix per sentence.
    embd_sent.append(np.stack(word_embd))

embd_sent = np.array(embd_sent)
print(embd_sent.shape)

(9934, 64, 100)


In [27]:
# Convert the stacked sentence embeddings into a torch tensor (rebinding the name).
# NOTE(review): because the name is reused, re-running this cell passes a
# tensor back into torch.tensor() — presumably harmless apart from a warning; confirm.
embd_sent = torch.tensor(embd_sent)

In [43]:
# Labels from the CSV's "label" column, cast to a LongTensor; they are later
# passed as targets to nn.CrossEntropyLoss.
label = torch.tensor(dataset.label()).type(torch.LongTensor)

In [44]:
import torch
from torch.utils.data import DataLoader
# Aliased so that it does not shadow the notebook's own `Dataset` corpus class.
from torch.utils.data import Dataset as TorchDataset


class CustomDataset(TorchDataset):
    """Pairs of (features, label) served by index for use with a DataLoader.

    Args:
        data: array-like, NumPy array or tensor of features; stored as float32.
        labels: array-like, NumPy array or tensor of class ids; stored as int64.

    Raises:
        ValueError: if ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        # as_tensor accepts lists, NumPy arrays and tensors alike and avoids a
        # copy when the input already has the requested dtype.
        self.data = torch.as_tensor(data, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.long)
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"data has {len(self.data)} samples but labels has {len(self.labels)}"
            )

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.data[idx], self.labels[idx]


# Sequential (non-random) split: the first `train_size` samples are used for
# training, the remainder for testing.
train_size = 8000
train_dataset = CustomDataset(embd_sent[:train_size], label[:train_size])
test_dataset = CustomDataset(embd_sent[train_size:], label[train_size:])

# Define batch size
batch_size = 64 # You can change this to your desired batch size

# Shuffle only the training data; evaluation does not depend on sample order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [30]:
# Hyperparameters
input_size = 100  # feature size per time step passed to the recurrent layer
hidden_size = 256  # hidden units per recurrent layer
num_layers = 2  # stacked recurrent layers
num_classes = 10  # output size of the final linear layer
sequence_length = 64  # the models size their linear layer as hidden_size * sequence_length
learning_rate = 0.005  # passed to the Adam optimizer
batch_size = 64  # NOTE(review): also assigned in the DataLoader cell — keep both in sync
num_epochs = 3  # passes over the training loader

In [31]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [33]:
from tqdm import tqdm
class RNN(nn.Module):
    """Vanilla RNN classifier over fixed-length sequences.

    The hidden states of *all* time steps are flattened and fed to a single
    linear layer, so inputs must have exactly ``seq_len`` time steps.

    Args:
        input_size: number of features per time step.
        hidden_size: hidden units per recurrent layer.
        num_layers: number of stacked recurrent layers.
        num_classes: size of the output (one score per class).
        seq_len: number of time steps per input; when omitted, the
            notebook-level ``sequence_length`` is used.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super(RNN, self).__init__()
        if seq_len is None:
            # Fall back to the notebook-level hyperparameter.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, num_classes) scores."""
        # Zero initial hidden state, created on the same device/dtype as the
        # input so the module does not depend on a global `device`.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate the RNN; out: (batch, seq_len, hidden_size)
        out, _ = self.rnn(x, h0)
        # Flatten the hidden states of every time step into one vector.
        out = out.reshape(out.shape[0], -1)

        # Classify from the flattened hidden states.
        out = self.fc(out)
        return out


# Recurrent neural network with GRU (many-to-one)
class RNN_GRU(nn.Module):
    """GRU classifier over fixed-length sequences.

    The hidden states of *all* time steps are flattened and fed to a single
    linear layer, so inputs must have exactly ``seq_len`` time steps.

    Args:
        input_size: number of features per time step.
        hidden_size: hidden units per recurrent layer.
        num_layers: number of stacked recurrent layers.
        num_classes: size of the output (one score per class).
        seq_len: number of time steps per input; when omitted, the
            notebook-level ``sequence_length`` is used.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super(RNN_GRU, self).__init__()
        if seq_len is None:
            # Fall back to the notebook-level hyperparameter.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, num_classes) scores."""
        # Zero initial hidden state, created on the same device/dtype as the
        # input so the module does not depend on a global `device`.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate the GRU; out: (batch, seq_len, hidden_size)
        out, _ = self.gru(x, h0)
        # Flatten the hidden states of every time step into one vector.
        out = out.reshape(out.shape[0], -1)

        # Classify from the flattened hidden states.
        out = self.fc(out)
        return out


# Recurrent neural network with LSTM (many-to-one)
class RNN_LSTM(nn.Module):
    """LSTM classifier over fixed-length sequences.

    The hidden states of *all* time steps are flattened and fed to a single
    linear layer, so inputs must have exactly ``seq_len`` time steps.

    Args:
        input_size: number of features per time step.
        hidden_size: hidden units per recurrent layer.
        num_layers: number of stacked recurrent layers.
        num_classes: size of the output (one score per class).
        seq_len: number of time steps per input; when omitted, the
            notebook-level ``sequence_length`` is used.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super(RNN_LSTM, self).__init__()
        if seq_len is None:
            # Fall back to the notebook-level hyperparameter.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, num_classes) scores."""
        # Zero initial hidden and cell states, created on the same device/dtype
        # as the input so the module does not depend on a global `device`.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate the LSTM; out: (batch, seq_len, hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        # Flatten the hidden states of every time step into one vector.
        out = out.reshape(out.shape[0], -1)

        # Classify from the flattened hidden states.
        out = self.fc(out)
        return out


# Initialize network (try out just using simple RNN, or GRU, and then compare with LSTM)
# The model is moved to `device`; batches are moved to the same device in the
# training and evaluation loops.
model = RNN_LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
# CrossEntropyLoss takes the raw class scores produced by the model together
# with integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [45]:
# Train Network
# Set train mode explicitly so a re-run of this cell does not silently train a
# model that an earlier cell left in eval mode.
model.train()
for epoch in range(num_epochs):
    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for batch_idx, (data, targets) in enumerate(progress):
        # Get data to cuda if possible.
        # squeeze(1) only has an effect when dimension 1 has size 1.
        data = data.to(device=device).squeeze(1)
        targets = targets.to(device=device)

        # forward
        scores = model(data)
        loss = criterion(scores, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent update step/adam step
        optimizer.step()

        # Show the current batch loss on the progress bar.
        progress.set_postfix(loss=f"{loss.item():.4f}")

100%|██████████| 125/125 [00:45<00:00,  2.72it/s]
100%|██████████| 125/125 [00:58<00:00,  2.14it/s]
100%|██████████| 125/125 [00:45<00:00,  2.72it/s]


In [46]:
# Check accuracy on training & test to see how good our model
def check_accuracy(loader, model):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Args:
        loader: iterable of ``(inputs, targets)`` batches.
        model: module returning one score per class for each input row.

    Returns:
        A 0-dim tensor holding ``correct / total``.

    Raises:
        ZeroDivisionError: if the loader yields no batches.

    The model is evaluated in eval mode with gradients disabled; its previous
    train/eval mode is restored afterwards, even if evaluation fails.
    """
    # Evaluate on whatever device the model lives on instead of relying on a
    # global `device`; a parameter-less model leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    num_correct = 0
    num_samples = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                if target_device is not None:
                    x = x.to(device=target_device)
                    y = y.to(device=target_device)
                # squeeze(1) only has an effect when dimension 1 has size 1.
                x = x.squeeze(1)

                scores = model(x)
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        # Put the model back into the mode the caller had it in.
        model.train(was_training)

    return num_correct / num_samples


# Report both accuracies as percentages with two decimals
# (the training line previously used the format spec `2f`, a typo for `.2f`).
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

Accuracy on training set: 81.787506
Accuracy on test set: 71.30
