In [2]:
# This is code to download and install pytorch
import os
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if os.path.exists('/opt/bin/nvidia-smi') else 'cpu'
!pip install http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

import torch
print('Version', torch.__version__)
print('CUDA enabled:', torch.cuda.is_available())

Version 0.4.1
CUDA enabled: True


In [3]:
import os
BASE_PATH = '/gdrive/My Drive/colab_files/fake_review_generator/'
if not os.path.exists(BASE_PATH):
    os.makedirs(BASE_PATH)
DATA_PATH = BASE_PATH + 'fake_review_generator/'
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

!pwd
!ls
    
os.chdir(BASE_PATH)
if not os.path.exists(BASE_PATH + 'pt_util.py'):
  !wget https://vinitha910.github.io/pt_util.py
    
os.chdir(DATA_PATH)

if not os.path.exists(DATA_PATH + 'processed_data/Office_Products.csv'):
    !wget https://vinitha910.github.io/office_products_review.tar.gz
    !tar -xvf office_products_review.tar.gz
    !rm office_products_review.tar.gz
os.chdir('/content')

/content
sample_data


In [0]:
import pandas
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import sys
import pickle
import re
sys.path.append(BASE_PATH)
import pt_util
import string
from math import log
from math import exp

In [4]:
# Mount Google Drive at /gdrive; BASE_PATH points inside this mount.
from google.colab import drive
drive.mount('/gdrive')
# List the mount point to confirm the drive is visible.
!ls /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
'My Drive'


In [0]:
def prepare_data(data_path, name, data_fraction=0.3, train_fraction=0.8):
    """Build character-level train/test pickles from a raw text file.

    The text is cleaned (every character in ``string.whitespace`` becomes a
    single space; the control characters ``\\x1f``, ``\\x08`` and ``\\x1c`` are
    dropped), truncated to its leading ``data_fraction``, and encoded as a list
    of integer character ids. The ids are split into a train part (the first
    ``train_fraction``) and a test part (the rest).

    Two files are written under the module-level ``DATA_PATH``:
    ``<name>_chars_train.pkl`` and ``<name>_chars_test.pkl``. Each holds a dict
    with keys ``'tokens'`` (list of int ids), ``'ind2voc'`` (id -> character)
    and ``'voc2ind'`` (character -> id). Both files share the same vocabulary.

    Args:
        data_path: Path of the text file to read.
        name: Prefix used for the two output file names.
        data_fraction: Leading fraction of the cleaned text to keep.
        train_fraction: Fraction of the kept tokens that goes to the train file.
    """
    with open(data_path) as f:
        # This reads all the data from the file, but does not do any processing on it.
        data = f.read()

    # str.replace(string.whitespace, " ") would only match the six whitespace
    # characters appearing together as one substring, so a per-character
    # translation table is used instead.
    cleanup = str.maketrans(string.whitespace,
                            " " * len(string.whitespace),
                            "\x1f\x08\x1c")
    data = data.translate(cleanup)

    data = data[:int(data_fraction * len(data))]

    # Ids follow the sorted order of the distinct characters.
    vocabulary = sorted(set(data))
    voc2ind = {char: index for index, char in enumerate(vocabulary)}
    ind2voc = {index: char for char, index in voc2ind.items()}

    data_tokens = [voc2ind[char] for char in data]

    split = int(train_fraction * len(data_tokens))
    train_text = data_tokens[:split]
    test_text = data_tokens[split:]

    # Context managers make sure both pickle files are flushed and closed.
    for suffix, tokens in (('_chars_train.pkl', train_text),
                           ('_chars_test.pkl', test_text)):
        with open(DATA_PATH + name + suffix, 'wb') as out_file:
            pickle.dump({'tokens': tokens, 'ind2voc': ind2voc, 'voc2ind': voc2ind}, out_file)
    
# Build the character-level train/test pickles for the Office Products reviews.
prepare_data(DATA_PATH + 'processed_data/Office_Products.csv', 'office_products')

In [0]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer ids, loaded from a pickle.

    The pickle must contain a dict with the keys ``'ind2voc'`` (id -> token)
    and ``'voc2ind'`` (token -> id).
    """

    def __init__(self, data_file):
        with open(data_file, 'rb') as handle:
            payload = pickle.load(handle)
        self.ind2voc = payload['ind2voc']
        self.voc2ind = payload['voc2ind']

    def array_to_words(self, arr):
        """Decode a sequence of ids into the concatenated string of their tokens."""
        pieces = []
        for ind in arr:
            pieces.append(self.ind2voc[int(ind)])
        return ''.join(pieces)

    def words_to_array(self, words):
        """Encode each token in ``words`` and return the ids as a LongTensor."""
        indices = [self.voc2ind[word] for word in words]
        return torch.LongTensor(indices)

    def __len__(self):
        """Number of distinct tokens in the vocabulary."""
        return len(self.voc2ind)

In [0]:
class ReviewsDataset(torch.utils.data.Dataset):
    """Token dataset arranged as ``batch_size`` parallel contiguous streams.

    The token list is trimmed to a multiple of ``batch_size * sequence_length``
    and divided into ``batch_size`` contiguous rows. Each row is cut into chunks
    of up to ``sequence_length`` tokens; the label chunk is the data chunk
    shifted one token to the right. The last chunk of a row is clamped so that
    it stops one token before the row boundary, which makes it shorter than
    ``sequence_length``.

    After construction ``self.data[step][stream]`` (and ``self.labels``) hold the
    ``step``-th chunk of row ``stream``. Item ``idx`` maps to
    ``step = idx // batch_size`` and ``stream = idx % batch_size``, so a
    DataLoader with the same ``batch_size`` and ``shuffle=False`` yields, batch
    after batch, the next chunk of every stream.
    """

    def __init__(self, data_file, sequence_length, batch_size):
        """Load the tokens from the pickle at ``data_file`` and build the chunk grid.

        Args:
            data_file: Pickle with keys ``'tokens'``, ``'ind2voc'``, ``'voc2ind'``.
            sequence_length: Maximum number of tokens per chunk.
            batch_size: Number of parallel streams the tokens are divided into.
        """
        super(ReviewsDataset, self).__init__()

        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.vocab = Vocabulary(data_file)

        with open(data_file, 'rb') as data_pkl:
            dataset = pickle.load(data_pkl)

        # Drop trailing tokens so every stream has the same whole number of chunks.
        self.tokens = dataset['tokens']
        remainder = len(self.tokens) % (self.batch_size*self.sequence_length)
        num_tokens = len(self.tokens) - remainder
        self.tokens = self.tokens[:num_tokens]

        assert len(self.tokens) % batch_size == 0

        # Length of one stream; index_range is the (exclusive) end of the current stream.
        incr = len(self.tokens)/self.batch_size
        index_range = len(self.tokens)/self.batch_size
        data_start_idx = 0
        label_start_idx = 1
        data_end_idx = data_start_idx + self.sequence_length
        label_end_idx = label_start_idx + self.sequence_length
        batch = 0
        # data[stream] / labels[stream] collect the chunks of one stream in order.
        data = [[]]
        labels = [[]]

        while label_start_idx < len(self.tokens):
            data[batch].append(self.tokens[int(data_start_idx):int(data_end_idx)])
            labels[batch].append(self.tokens[int(label_start_idx):int(label_end_idx)])

            if label_end_idx == index_range:
                # The labels reached the end of this stream: start the next one.
                data.append([])
                labels.append([])
                data_start_idx = data_end_idx + 1
                label_start_idx = label_end_idx + 1
                data_end_idx = data_start_idx + self.sequence_length
                label_end_idx = label_start_idx + self.sequence_length
                index_range += incr
                batch += 1

            else:
                data_start_idx += self.sequence_length
                label_start_idx += self.sequence_length

                # Clamp so the label chunk never crosses into the next stream.
                data_end_idx += self.sequence_length
                if data_end_idx > index_range - 1:
                    data_end_idx = index_range - 1

                label_end_idx += self.sequence_length
                if label_end_idx > index_range:
                    label_end_idx = index_range

        # Transpose from [stream][step] to [step][stream]; streams with no chunk
        # at a given step (e.g. the empty list appended after the last stream)
        # are skipped.
        self.data = []
        self.labels = []
        for b in range(len(data[0])):
            self.data.append([])
            self.labels.append([])
            for d in range(len(data)):
                if b < len(data[d]):
                    self.data[-1].append(data[d][b])
                    self.labels[-1].append(labels[d][b])

    def __len__(self):
        """Total number of chunks across all steps.

        Counted from the per-step list lengths: the chunks do not all have the
        same length, so packing them into a single NumPy array is avoided.
        """
        return sum(len(step) for step in self.data)

    def __getitem__(self, idx):
        """Return ``(data, label)`` LongTensors for chunk ``idx``.

        Raises:
            IndexError: If ``idx`` does not address an existing chunk.
        """
        col = int(idx % self.batch_size)
        row = int(idx / self.batch_size)

        if row >= len(self.data) or col >= len(self.data[row]):
            raise IndexError("ReviewsDataset index out of bounds")

        item_data = torch.LongTensor(self.data[row][col])
        item_label = torch.LongTensor(self.labels[row][col])

        return item_data, item_label

    def vocab_size(self):
        """Number of distinct tokens in the vocabulary loaded from the pickle."""
        return len(self.vocab)

In [0]:
class Generator(nn.Module):
    """Embedding -> LSTM -> linear language model with tied input/output weights.

    The decoder's weight matrix is the same Parameter as the embedding matrix,
    which is only shape-compatible when ``embed_size == feature_size``.
    """

    def __init__(self, vocab_size, embed_size, feature_size, num_layers):
        """
        Args:
            vocab_size: Number of distinct tokens.
            embed_size: Dimension of the token embeddings.
            feature_size: Hidden size of the LSTM; must equal ``embed_size``.
            num_layers: Number of stacked LSTM layers.

        Raises:
            ValueError: If ``embed_size`` and ``feature_size`` differ.
        """
        super(Generator, self).__init__()
        if embed_size != feature_size:
            # Without this check the mismatch only surfaces as a shape error
            # inside forward(), far from its cause.
            raise ValueError(
                'Tied encoder/decoder weights require embed_size == feature_size '
                '(got {} and {})'.format(embed_size, feature_size))
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.encoder = nn.Embedding(self.vocab_size, embed_size,)
        self.lstm = nn.LSTM(embed_size, feature_size, num_layers, batch_first=True)
        self.decoder = nn.Linear(feature_size, self.vocab_size)

        # Weight tying: the output projection reuses the embedding matrix.
        self.decoder.weight = self.encoder.weight
        self.decoder.bias.data.zero_()

        # Best accuracy passed to save_best_model so far (-1 means none yet).
        self.best_accuracy = -1

    def forward(self, x, hidden=None):
        """Compute logits for every position of ``x``.

        Args:
            x: Token ids; the LSTM is batch-first, so ``x`` is indexed
                (batch, sequence).
            hidden: Optional ``(h, c)`` LSTM state to continue from.

        Returns:
            ``(logits, (h, c))`` where ``logits`` has shape
            (batch * sequence, vocab_size).
        """
        # Embed word ids to vectors
        x = self.encoder(x)

        # Forward propagate LSTM
        output, (hidden, c) = self.lstm(x, hidden)

        # Reshape output to (batch_size*sequence_length, feature_size)
        output = output.reshape(output.size(0)*output.size(1), output.size(2))

        # Decode hidden states of all time steps
        output = self.decoder(output)
        return output, (hidden, c)

    # This defines the function that gives a probability distribution and implements the temperature computation.
    def inference(self, x, hidden_state=None, temperature=1.5):
        """Return a temperature-scaled probability distribution for the next token.

        ``x`` is viewed as a column of length-1 sequences and all resulting
        logits are flattened into a single row before the softmax, so the
        output is a distribution over the vocabulary only when ``x`` holds a
        single token id.

        Returns:
            ``(probabilities, hidden_state)``; ``probabilities`` has one row.
        """
        x = x.view(-1, 1)
        x, hidden_state = self.forward(x, hidden_state)
        x = x.view(1, -1)
        # The lower bound keeps the division finite for zero or negative temperatures.
        x = x / max(temperature, 1e-20)
        x = F.softmax(x, dim=1)
        return x, hidden_state

    def loss(self, prediction, label, reduction='elementwise_mean'):
        """Cross-entropy between logits and integer labels, both flattened."""
        loss_val = F.cross_entropy(prediction.view(-1, self.vocab_size), label.view(-1), reduction=reduction)
        return loss_val

    # Saves the current model
    def save_model(self, file_path, num_to_keep=1):
        pt_util.save(self, file_path, num_to_keep)

    # Saves the best model so far
    def save_best_model(self, accuracy, file_path, num_to_keep=1):
        # Only saves when the accuracy strictly improves on the best seen so far.
        if accuracy > self.best_accuracy:
            self.save_model(file_path, num_to_keep)
            self.best_accuracy = accuracy

    # Restores the model from a specific checkpoint file
    def load_model(self, file_path):
        pt_util.restore(self, file_path)

    # Restores from a checkpoint directory and returns whatever pt_util.restore_latest returns
    def load_last_model(self, dir_path):
        return pt_util.restore_latest(self, dir_path)

In [0]:

BEAM_WIDTH = 10

def generate_language(model, device, seed_words, sequence_length, vocab, sampling_strategy='max', beam_width=BEAM_WIDTH, num_beams=5):
    """Generate ``sequence_length`` tokens that continue ``seed_words``.

    Args:
        model: Object with ``eval()`` and ``inference(token, hidden)`` returning
            ``(probabilities, hidden)`` where ``probabilities`` has one row.
        device: Device the seed token ids are moved to.
        seed_words: Non-empty prompt; each element must be in ``vocab``.
        sequence_length: Number of tokens to generate after the prompt.
        vocab: Object with ``words_to_array`` and ``array_to_words``.
        sampling_strategy: ``'max'`` (most likely token each step), ``'sample'``
            (draw from the distribution) or ``'beam'`` (sampled beam search).
        beam_width: For ``'beam'``, how many candidate tokens are sampled
            (without replacement) from each beam per step; capped at the
            number of entries in the distribution.
        num_beams: For ``'beam'``, how many of the best-scoring candidates are
            kept after each step.

    Returns:
        The prompt followed by the generated tokens, decoded by ``vocab``.

    Raises:
        ValueError: If ``seed_words`` is empty or ``sampling_strategy`` is unknown.
    """
    if sampling_strategy not in ('max', 'sample', 'beam'):
        raise ValueError('Unknown sampling_strategy: {!r}'.format(sampling_strategy))
    if len(seed_words) == 0:
        # At least one token is needed to obtain the first distribution.
        raise ValueError('seed_words must not be empty')

    model.eval()

    with torch.no_grad():
        seed_words_arr = vocab.words_to_array(seed_words)

        # Computes the initial hidden state from the prompt (seed words).
        hidden = None
        for ind in seed_words_arr:
            data = ind.to(device)
            output, hidden = model.inference(data, hidden)

        outputs = []
        # Each beam is (tokens so far, distribution the last token was drawn
        # from, hidden state before the last token was fed, summed log-probability).
        beams = [([], output, hidden, 0)]

        for _ in range(sequence_length):

            if sampling_strategy == 'max':
                choice = output.argmax(dim=1)[0]
                outputs.append(choice)
                output, hidden = model.inference(choice, hidden)

            elif sampling_strategy == 'sample':
                choice = torch.multinomial(output, 1)[0]
                outputs.append(choice)
                output, hidden = model.inference(choice, hidden)

            else:
                candidates = []
                for sequence, beam_output, beam_hidden, score in beams:
                    if len(sequence) > 0:
                        # Feed the beam's last token to get the next distribution.
                        beam_output, beam_hidden = model.inference(sequence[-1], beam_hidden)

                    # Sampling without replacement cannot draw more items than exist.
                    num_samples = min(beam_width, beam_output.size(1))
                    samples = torch.multinomial(beam_output, num_samples)

                    for sample in samples[0]:
                        candidates.append((sequence + [sample],
                                           beam_output,
                                           beam_hidden,
                                           score + log(beam_output[0][sample])))

                # Rank by summed log-probability and keep the best num_beams.
                ordered_beams = sorted(candidates, key=lambda beam: beam[3], reverse=True)
                beams = ordered_beams[:num_beams]

                # The result is the token sequence of the current top beam.
                outputs = beams[0][0]

        return vocab.array_to_words(seed_words_arr.tolist() + outputs)

In [0]:
def repackage_hidden(h):
    """Return ``h`` detached from its autograd history.

    A tensor is detached directly; anything else is treated as an iterable of
    hidden states and rebuilt, recursively, as a tuple of detached states.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

def train(model, device, train_loader, lr, epoch, log_interval):
    """Run one training epoch and return the mean batch loss.

    Uses the module-level ``optimizer``. The hidden state returned by the model
    is carried from one batch to the next, detached in between.

    Args:
        model: Callable as ``model(data, hidden)`` returning ``(output, hidden)``
            and providing ``loss(output, label)``.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, label)`` batches with ``len()`` and
            a ``dataset`` attribute (used for progress reporting only).
        lr: Learning rate to use for this epoch.
        epoch: Epoch number, used in the progress message.
        log_interval: Print progress every ``log_interval`` batches.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    # Apply the learning rate chosen by the caller's schedule to every parameter group.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    losses = []
    hidden = None
    for batch_idx, (data, label) in enumerate(train_loader):
        data, label = data.to(device), label.to(device)
        # Separates the hidden state across batches.
        # Otherwise the backward would try to go all the way to the beginning every time.
        if hidden is not None:
            hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = model.loss(output, label)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    return np.mean(losses)


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        hidden = None
        for batch_idx, (data, label) in enumerate(test_loader):
            data, label = data.to(device), label.to(device)
            output, hidden = model(data, hidden)
            test_loss += model.loss(output, label).item()
            pred = output.max(-1)[1]
            correct_mask = pred.eq(label.view_as(pred))
            num_correct = correct_mask.sum().item()
            correct += num_correct

    test_loss /= len(test_loader)
    test_accuracy = 100. * correct / (len(test_loader.dataset) * test_loader.dataset.sequence_length)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset) * test_loader.dataset.sequence_length,
        100. * correct / (len(test_loader.dataset) * test_loader.dataset.sequence_length)))
    return test_loss, test_accuracy

In [41]:
# Sanity check: every step of the dataset, and every batch the DataLoader
# produces from it, should contain exactly 256 sequences.
data_train = ReviewsDataset(DATA_PATH + 'office_products_chars_train.pkl', 100, 256)
print(len(data_train))
# print("batch size: " + str(len(data_train.data[0])))
# print("sequence length: " + str(len(data_train.data[0][0])))
# print("batch size: " + str(len(data_train.data[-1])))
# print("sequence length: " + str(len(data_train.data[-1][-1])))
# Print the size of any step that does not hold a full batch.
for data_list in data_train.data:
    if len(data_list) != 256:
        print(len(data_list))
for label_list in data_train.labels:
    if len(label_list) != 256:
        print(len(label_list))

# use_cuda has to be defined before the device is chosen, so that this cell
# also runs on a fresh kernel.
USE_CUDA = True
use_cuda = USE_CUDA and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print('Using device', device)
import multiprocessing
num_workers = multiprocessing.cpu_count()
kwargs = {'num_workers': num_workers,
          'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(data_train, batch_size=256,
                                           shuffle=False, **kwargs)
# Print the index and shape of any batch that is not full.
for batch_idx, (data, label) in enumerate(train_loader):
    if data.size()[0] != 256:
      print(batch_idx)
      print(data.size())

102400
Using device cuda


In [42]:
SEQUENCE_LENGTH = 100
BATCH_SIZE = 256
EMBED_SIZE = 512
FEATURE_SIZE = 512
TEST_BATCH_SIZE = 256
EPOCHS = 10
LEARNING_RATE = 0.002
WEIGHT_DECAY = 0.0005
USE_CUDA = True
PRINT_INTERVAL = 10
LOG_PATH = DATA_PATH + 'logs/log.pkl'
NUM_LAYERS = 1
!export CUDA_LAUNCH_BLOCKING=1; 

data_train = ReviewsDataset(DATA_PATH + 'office_products_chars_train.pkl', SEQUENCE_LENGTH, BATCH_SIZE)
data_test = ReviewsDataset(DATA_PATH + 'office_products_chars_test.pkl', SEQUENCE_LENGTH, TEST_BATCH_SIZE)
vocab = data_train.vocab

use_cuda = USE_CUDA and torch.cuda.is_available()

device = torch.device("cuda" if use_cuda else "cpu")
print('Using device', device)
import multiprocessing
num_workers = multiprocessing.cpu_count()
print('num workers:', num_workers)

kwargs = {'num_workers': num_workers,
          'pin_memory': True} if use_cuda else {}

train_loader = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE,
                                           shuffle=False, **kwargs)
test_loader = torch.utils.data.DataLoader(data_test, batch_size=TEST_BATCH_SIZE,
                                          shuffle=False, **kwargs)

model = Generator(data_train.vocab_size(), EMBED_SIZE, FEATURE_SIZE, NUM_LAYERS).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
start_epoch = model.load_last_model(DATA_PATH + 'checkpoints')

train_losses, test_losses, test_accuracies, train_p, test_p = pt_util.read_log(LOG_PATH, ([], [], [], [], []))
test_loss, test_accuracy = test(model, device, test_loader)

test_losses.append((start_epoch, test_loss))
test_accuracies.append((start_epoch, test_accuracy))

try:
    for epoch in range(start_epoch, EPOCHS + 1):
        lr = LEARNING_RATE * np.power(0.25, (int(epoch / 6)))
        train_loss = train(model, device, train_loader, lr, epoch, PRINT_INTERVAL)
        test_loss, test_accuracy = test(model, device, test_loader)
        train_losses.append((epoch, train_loss))
        test_losses.append((epoch, test_loss))
        test_accuracies.append((epoch, test_accuracy))
        test_p.append((epoch, exp(test_loss)))
        train_p.append((epoch, exp(train_loss)))
        pt_util.write_log(LOG_PATH, (train_losses, test_losses, test_accuracies, train_p, test_p))
        model.save_best_model(test_accuracy, DATA_PATH + 'checkpoints/%03d.pt' % epoch)
        seed_words = 'Office'
        for ii in range(10):
            generated_sentence = generate_language(model, device, seed_words, 200, vocab, 'sample')
            print('generated sample\t', generated_sentence)
        generated_sentence = generate_language(model, device, seed_words, 200, vocab, 'beam')
        print('generated beam\t\t', generated_sentence)
        print('')

except KeyboardInterrupt as ke:
    print('Interrupted')
except:
    import traceback
    traceback.print_exc()
finally:
    print('Saving final model')
    model.save_model(DATA_PATH + 'checkpoints/%03d.pt' % epoch, 0)

Using device cuda
num workers: 2
Restoring:
encoder.weight -> 	torch.Size([93, 512]) = 0MB
lstm.weight_ih_l0 -> 	torch.Size([2048, 512]) = 4MB
lstm.weight_hh_l0 -> 	torch.Size([2048, 512]) = 4MB
lstm.bias_ih_l0 -> 	torch.Size([2048]) = 0MB
lstm.bias_hh_l0 -> 	torch.Size([2048]) = 0MB
decoder.weight -> 	torch.Size([93, 512]) = 0MB
decoder.bias -> 	torch.Size([93]) = 0MB

Restored all variables
No new variables
Restored /gdrive/My Drive/colab_files/fake_review_generator/fake_review_generator/checkpoints/000.pt

Test set: Average loss: 1.4489, Accuracy: 1482769/2560000 (58%)


Test set: Average loss: 1.4320, Accuracy: 1502945/2560000 (59%)

Saved /gdrive/My Drive/colab_files/fake_review_generator/fake_review_generator/checkpoints/000.pt

generated sample	 Office.thour FVife masing,Thip 6P4Cnassortver w). Let farme so musturum iso.Gjoppons in what you wash, hut wamply, glo, scho-chols" for sIsconb Geal)., new printers, I ddidnsfeent 1p_4)r jorkscokior phons i
generated sample	 Office, cent

In [43]:
# Generate ten continuations of a fixed prompt with the 'beam' strategy.
seed_words = 'This printer is'
sequence_length = 200

# The beam strategy draws its candidates with torch.multinomial, so repeated
# calls can produce different text.
for ii in range(10):
    generated_sentence = generate_language(model, device, seed_words, sequence_length, vocab, 'beam')
    print('generated with beam\t', generated_sentence)


generated with beam	 This printer issues that they seem to last a long time. This is a great product for this product and this tape is nothing that there is nothing that this stapler is easy to replace this printer. This is a great pric
generated with beam	 This printer is that there is nothing that you need to print this product. This is great for this printer and this stapler is that there is nothing that they are still staples that there is nothing that there is not
generated with beam	 This printer is nothing that they would not have to use these. These are great folders and these are great.  These are great folders that these are great folders and they are great. They are great folders and these 
generated with beam	 This printer is that there is nothing that they are all the stapler. This is that there is not that this tape is that this stapler is very good.  This is a great quality printer and this product that there is nothin
generated with beam	 This printer issues with this p