In [1]:
import os
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import torch.optim as optim

In [9]:
train_path = '../data/20news-bydate-train'
test_path = '../data/20news-bydate-test'
saved_path = '../data/'
MAX_SENTENCE_LENGTH = 500
padding_ID = 0
unknown_ID = 1
NUM_OF_CLASSES = 20
EMBEDDINGS_SIZE = 300
LSTM_HIDDEN_SIZE = 50
batch_size = 50
learning_rate = 0.01

In [None]:
#generate data and vocabulary
def get_data_and_vocab():
    def collect_data_from(parent_path, newsgroup_list, word_count=None):
        data = []
        for id, newsgroup in enumerate(newsgroup_list):
            path = parent_path+'/'+newsgroup+'/'
            files = [(file_name, path+file_name)
                     for file_name in os.listdir(path)]
            print('Processing: {}-{}'.format(id, newsgroup))
            for file_name, file_path in files:
                with open(file_path) as f:
                    text = f.read().lower()
                    words = re.split('\W+', text)
                    if word_count is not None:
                        for word in words:
                            if word in word_count:
                                word_count[word] += 1
                            else:
                                word_count[word] = 0
                    content = ' '.join(words)
                    assert len(content.splitlines()) == 1
                    data.append(str(id)+'<fff>'+file_name+'<fff>'+content)
        return data
    word_count = dict()
    newsgroup_list = [newsgroup for newsgroup in os.listdir(train_path)]
    newsgroup_list.sort()
    train_data = collect_data_from(train_path, newsgroup_list, word_count)
    vocab = [word for word, freq in word_count.items() if freq > 10]
    vocab.sort()
    with open(saved_path+'vocab-raw.txt', 'w') as f:
        f.write('\n'.join(vocab))
    newsgroup_list = [newsgroup for newsgroup in os.listdir(test_path)]
    test_data = collect_data_from(test_path, newsgroup_list)
    with open(saved_path+'20news-train-raw.txt', 'w') as f:
        f.write('\n'.join(train_data))
    with open(saved_path+'20news-test-raw.txt', 'w') as f:
        f.write('\n'.join(test_data))

In [None]:
def encode_data(data_path, vocab_path):
    with open(vocab_path) as f:
        vocab = dict([(word, word_ID+2)
                     for word_ID, word in enumerate(f.read().splitlines())])
    with open(data_path) as f:
        documents = [(line.split('<fff>')) for line in f.read().splitlines()]
    encoded_data = []
    for document in documents:
        label, doc_id, text = document
        words = text.split()[:MAX_SENTENCE_LENGTH]
        sentence_length = len(words)
        encoded_text = []
        for word in words:
            if word in vocab:
                encoded_text.append(str(vocab[word]))
            else:
                encoded_text.append(str(unknown_ID))
        for i in range(sentence_length, MAX_SENTENCE_LENGTH):
            encoded_text.append(str(padding_ID))
        encoded_data.append(str(label)+'<fff>'+str(doc_id)+'<fff>' +
                            str(sentence_length)+'<fff>'+' '.join(encoded_text))
    dir_name = '/'.join(data_path.split('/')[:-1])
    file_name = '-'.join(data_path.split('/')
                         [-1].split('-')[:-1])+'-encoded.txt'
    with open(dir_name+'/'+file_name, 'w') as f:
        f.write('\n'.join(encoded_data))

In [None]:
get_data_and_vocab()

Processing: 0-alt.atheism
Processing: 1-comp.graphics
Processing: 2-comp.os.ms-windows.misc
Processing: 3-comp.sys.ibm.pc.hardware
Processing: 4-comp.sys.mac.hardware
Processing: 5-comp.windows.x
Processing: 6-misc.forsale
Processing: 7-rec.autos
Processing: 8-rec.motorcycles
Processing: 9-rec.sport.baseball
Processing: 10-rec.sport.hockey
Processing: 11-sci.crypt
Processing: 12-sci.electronics
Processing: 13-sci.med
Processing: 14-sci.space
Processing: 15-soc.religion.christian
Processing: 16-talk.politics.guns
Processing: 17-talk.politics.mideast
Processing: 18-talk.politics.misc
Processing: 19-talk.religion.misc
Processing: 0-alt.atheism
Processing: 1-comp.graphics
Processing: 2-comp.os.ms-windows.misc
Processing: 3-comp.sys.ibm.pc.hardware
Processing: 4-comp.sys.mac.hardware
Processing: 5-comp.windows.x
Processing: 6-misc.forsale
Processing: 7-rec.autos
Processing: 8-rec.motorcycles
Processing: 9-rec.sport.baseball
Processing: 10-rec.sport.hockey
Processing: 11-sci.crypt
Processing

In [None]:
encode_data(saved_path+'20news-train-raw.txt',saved_path+'vocab-raw.txt')
encode_data(saved_path+'20news-test-raw.txt',saved_path+'vocab-raw.txt')

In [5]:
class LSTM_Model(nn.Module):
    def __init__(self, vocab_size, embedding_size, LSTM_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.LSTM_size = LSTM_size
        self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0) #Embeddings Layer
        self.LSTM = nn.LSTM(embedding_size, LSTM_size, batch_first=True) #LSTM layer
        self.linear = nn.Linear(LSTM_size, NUM_OF_CLASSES) #Linear layer
        self.softmax = nn.Softmax() #softmax layer

    def forward(self, x, mask, length):
        embeddings_text = self.embeddings(x) #(batch_size, seq_length, embeddings_size)
        lstm_output, (ht, ct) = self.LSTM(embeddings_text) #(batch_size, seq_length, lstm_hidden_size)
        lstm_output = mask*lstm_output #mul with mask (batch_size, seq_length, lstm_hidden_size)
        lstm_output = torch.sum(lstm_output, dim=1)/length # average (batch_size,lstm_hidden_size)
        output = self.linear(lstm_output) #(batch_size, NUM_OF_CLASSES)
        return self.softmax(output)

In [6]:
class Data(Dataset):
    def __init__(self, labels, encoded_data, data_length):
        super().__init__()
        self.labels = labels
        self.encoded_data = encoded_data
        self.data_length = data_length
    def __len__(self):
        return self.encoded_data.size(0)

    def __getitem__(self, index):
        return self.encoded_data[index], self.labels[index], self.data_length[index]

In [7]:
def load_data(data_path):
    encoded_data = []
    labels = []
    sentence_length = []
    with open(data_path, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        label, doc_id, sent_length, text = line.split('<fff>')
        labels.append(int(label))
        encoded_text = [int(u) for u in text.split(' ')]
        encoded_data.append(encoded_text)
        sentence_length.append(int(sent_length))
    return torch.tensor(encoded_data), torch.tensor(labels), torch.tensor(sentence_length)

In [11]:
#Load data
train_data, train_labels, train_length = load_data(saved_path+'20news-train-encoded.txt')
test_data, test_labels, test_length = load_data(saved_path+'20news-test-encoded.txt')
with open(saved_path+'vocab-raw.txt', 'r',encoding='iso 8859-15') as f:
    vocab_size = len(f.read().splitlines())+2

In [18]:
# Generate data, model and optimizer
train_set = Data(train_labels, train_data, train_length)
valid_set = Data(test_labels[:1000], test_data[:1000], test_length[:1000])
model = LSTM_Model(vocab_size, EMBEDDINGS_SIZE, LSTM_HIDDEN_SIZE)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
it = 0

In [14]:
def validation(model, optimizer, data, criterion):
  valid_loader = DataLoader(data, batch_size=batch_size)
  total_loss = 0.0
  num_sample = 0
  for data, labels, length in valid_loader:
    mask = (data!=0).unsqueeze(2) #generate sentence mask
    length = length.unsqueeze(1)
    predict = model.forward(data, mask, length)
    loss = criterion(predict, labels)
    total_loss+= loss
    num_sample+=1
  return total_loss/num_sample

In [25]:
total_loss = 0.0
for epoch in range(10):
    print('Epoch %d: ' % epoch)
    for data, labels, length in train_loader:
        it += 1
        mask = (data!=0).unsqueeze(2) #generate sentence mask
        length = length.unsqueeze(1)
        predict = model.forward(data, mask, length)
        loss = criterion(predict, labels)
        total_loss+= loss
        if it % 100 == 0:
            print('Iteration %d:\nTrain loss: %f\nVal loss: %f' % (it, total_loss/100, validation(model, optimizer, valid_set, criterion)))
            total_loss = 0
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch 0: 




Iteration 600:
Train loss: 2.276887
Val loss: 2.687359
Iteration 700:
Train loss: 2.306279
Val loss: 2.623698
Epoch 1: 
Iteration 800:
Train loss: 2.277609
Val loss: 2.675293
Iteration 900:
Train loss: 2.258175
Val loss: 2.644224
Epoch 2: 
Iteration 1000:
Train loss: 2.257921
Val loss: 2.641708
Iteration 1100:
Train loss: 2.228795
Val loss: 2.506616
Epoch 3: 
Iteration 1200:
Train loss: 2.218322
Val loss: 2.469356
Iteration 1300:
Train loss: 2.200508
Val loss: 2.462802
Iteration 1400:
Train loss: 2.197955
Val loss: 2.453601
Epoch 4: 
Iteration 1500:
Train loss: 2.182336
Val loss: 2.414129
Iteration 1600:
Train loss: 2.181095
Val loss: 2.439703
Epoch 5: 
Iteration 1700:
Train loss: 2.183646
Val loss: 2.419493
Iteration 1800:
Train loss: 2.168574
Val loss: 2.429601
Epoch 6: 
Iteration 1900:
Train loss: 2.168563
Val loss: 2.462502
Iteration 2000:
Train loss: 2.140737
Val loss: 2.475493
Epoch 7: 
Iteration 2100:
Train loss: 2.152933
Val loss: 2.478423
Iteration 2200:
Train loss: 2.137226
V

In [29]:
torch.save(
    {'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()},
    f'LSTM_model_batch_size_{batch_size}_LSTM_{LSTM_HIDDEN_SIZE}_Embeddings_{EMBEDDINGS_SIZE}_lr_{learning_rate}.pth'
)

In [30]:
#Load model for test
u = torch.load(f'LSTM_model_batch_size_{batch_size}_LSTM_{LSTM_HIDDEN_SIZE}_Embeddings_{EMBEDDINGS_SIZE}_lr_{learning_rate}.pth')
model = LSTM_Model(vocab_size, EMBEDDINGS_SIZE, LSTM_HIDDEN_SIZE)
model.load_state_dict(u['model_state_dict'])
optimizer.load_state_dict(u['optimizer_state_dict'])

In [28]:
test_set = Data(test_labels, test_data, test_length) #Load test data
test_loader = DataLoader(test_set, batch_size=50)
num_true_preds = 0
for data, labels, length in test_loader:
    mask = (data!=0).unsqueeze(2)
    length = length.unsqueeze(1)
    test_plabels_eval = model.forward(data, mask, length)
    s_labels = torch.argmax(test_plabels_eval, axis=1)
    num_true_preds += float(torch.sum(s_labels == labels))
print('Accuracy on test data: ', num_true_preds/test_set.__len__())



Accuracy on test data:  0.6988847583643123
