### Utility Module

In [1]:
import random
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import numpy as np
import os
import pandas as pd
import gzip
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk import FreqDist,ngrams,word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re

def remove_stopwords(data):
    stop_words = Counter(stopwords.words('english'))
    ans = []
    for each in data:
        if(each not in stop_words.keys()):
            ans.append(each)
    return ans

def lemmatizer(data):
    lmtzr = WordNetLemmatizer()
    ans = []
    for each in data:
        ans.append(lmtzr.lemmatize(each))
    return ans

def stemmer(data):
    ps = PorterStemmer()
    ans = []
    for each in data:
        ans.append(ps.stem(each))
    return ans

def cleanData(data):
    data = word_tokenize(data)
    data = lemmatizer(remove_stopwords(data))
    string = ' '.join(data)
    return data, string


def folder_count(path):
    count = 0
    l = []
    for f in os.listdir(path):
        child = os.path.join(path,f)
        if os.path.isdir(child):
            l.append(child)
            count += 1
    return count, l


def create_vocab(dataset_path):
    vocab_file = os.path.join(dataset_path,"vocabulary.txt")
    with open(vocab_file, 'r') as myfile:
        data=myfile.read().replace('\n', ' ')
    return data.split(' ')[:-1]

def create_tfidf(dataset_path,vocab):
    list_docs = []
    vectorizer = TfidfVectorizer(stop_words='english',vocabulary=vocab,strip_accents='unicode')
    for f in os.listdir(dataset_path):
        child = os.path.join(dataset_path,f)
        with open(child, 'r', errors='ignore') as myfile:
            data=myfile.read().replace('\n', '')
        _ , final_data = cleanData(data)
        list_docs.append(final_data)
    response = vectorizer.fit_transform(list_docs)
    n_response = response.toarray()
    row_sum = n_response.sum(axis=1)
    length = len(row_sum)
    n_result = n_response/row_sum.reshape(length,1)
    where_are_NaNs = np.isnan(n_result)
    n_result[where_are_NaNs] = 0
    n_c_result = sparse.csr_matrix(n_result)
#     return response
    return n_c_result

def sample_document(tfidf_mat):
    tfidf_mat = tfidf_mat.transpose()
    _,num_docs = tfidf_mat.shape
    sampled_document = random.randint(0,num_docs-1)
    result = tfidf_mat.getcol(sampled_document).toarray().T
    return result

In [2]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)
use_cuda = False

In [3]:
'''
Important model parameters
'''
DATASET = "20newsgroups" # For now, we just test it on 20newsgroups dataset
NUM_TOPICS = 20
LAMBDA = 10 # Gradient penalty lambda hyperparameter
CRITIC_ITERS = 5 # For WGAN and WGAN-GP, number of critic iters per gen iter
ITERS = 200000 # How many generator iterations to train for
VOCAB_SIZE = 61188# Vocab length of the generator
GENERATOR_PARAM = 100 # Number of neurons in the middle layer of the generator
LEAK_FACTOR = 0.2 # leak parameter used in generator
BATCH_SIZE = 256
A_1 = 0.0001
B_1 = 0
B_2 = 0.9

In [4]:
alpha = [np.random.randint(1,11) for i in range(0,20)]

In [5]:
# Temporary change, needs to be changed later
dataset_path = "/home/ysahil/Academics/Sem_8/ATM_GANs/20news/20news-18828/all_docs"
dataset_path_1 = "/home/ysahil/Academics/Sem_8/ATM_GANs/20news/20news-18828"

#Create the TF-IDF matrix
def get_tfidf():
    vocab = create_vocab(dataset_path_1)
    result = create_tfidf(dataset_path,vocab)
    return result

## TODO: Incorporate normalization of tf-idf matrix over row-sum
def representation_map(result):
#    vocab = create_vocab(dataset_path_1)
#    result = create_tfidf(dataset_path,vocab)
    sam_doc = sample_document(result)
    #print(sam_doc)
    #print(sam_doc.shape)
    return sam_doc

In [6]:
test_result = get_tfidf()

In [8]:
representation_map(test_result)

array([[0., 0., 0., ..., 0., 0., 0.]])

## Definitions of GAN's and the training procedure

In [19]:
class generator(nn.Module):
    def __init__(self):
        super(generator,self).__init__()
        main = nn.Sequential(
               nn.Linear(NUM_TOPICS,GENERATOR_PARAM),
               nn.LeakyReLU(LEAK_FACTOR,True),
               nn.BatchNorm1d(GENERATOR_PARAM),
               nn.Linear(GENERATOR_PARAM,VOCAB_SIZE),
#                nn.Softmax(VOCAB_SIZE)
               nn.Softmax()
               )
        self.main = main

    def forward(self,noise):
        output = self.main(noise)
        return output

In [20]:
class discriminator(nn.Module):
    def __init__(self):
        super(discriminator,self).__init__()
        main = nn.Sequential(
               nn.Linear(VOCAB_SIZE,GENERATOR_PARAM),
               nn.LeakyReLU(LEAK_FACTOR,True),
               nn.Linear(GENERATOR_PARAM,1))
        self.main = main

    def forward(self,inputs):
        output = self.main(inputs)
        return output.view(-1)

In [21]:
def extract(v):
    return v.data.storage().tolist()

def stats(d):
    return [np.mean(d), np.std(d)]

In [22]:
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        m.weight.data.normal_(0.0, 0.02)
        m.bias.data.fill_(0)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

In [23]:
def inf_data_gen(alpha):
    if DATASET == "20newsgroups":
        while True:
            dataset = []
            for i in range(BATCH_SIZE):
                sample = np.random.dirichlet(alpha)
                dataset.append(sample)
            dataset = np.array(dataset, dtype='float32')
            np.random.shuffle(dataset)
            yield dataset

In [24]:
def real_data_sampler(test_result):
    while True:
        dataset = []
        for i in range(BATCH_SIZE):
            sample = np.array(representation_map(test_result))
            dataset.append(sample[0])
        dataset = np.array(dataset, dtype='float32')
        np.random.shuffle(dataset)
        yield dataset

In [25]:
def calc_gradient_penalty(netD, real_data, fake_data):
    alpha = torch.rand(BATCH_SIZE, 1)
    alpha = alpha.expand(real_data.size())
    alpha = alpha.cuda() if use_cuda else alpha

    interpolates = alpha * real_data + ((1 - alpha) * fake_data)

    if use_cuda:
        interpolates = interpolates.cuda()
    interpolates = autograd.Variable(interpolates, requires_grad=True)

    disc_interpolates = netD(interpolates)

    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones(disc_interpolates.size()).cuda() if use_cuda else torch.ones(
                                  disc_interpolates.size()),
                              create_graph=True, retain_graph=True, only_inputs=True)[0]

    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * LAMBDA
    return gradient_penalty

In [26]:
ATM_G = generator()
ATM_D = discriminator()
ATM_G.apply(weights_init)
ATM_D.apply(weights_init)
print(ATM_G)
print(ATM_D)

generator(
  (main): Sequential(
    (0): Linear(in_features=20, out_features=100, bias=True)
    (1): LeakyReLU(negative_slope=0.2, inplace)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=100, out_features=61188, bias=True)
    (4): Softmax()
  )
)
discriminator(
  (main): Sequential(
    (0): Linear(in_features=61188, out_features=100, bias=True)
    (1): LeakyReLU(negative_slope=0.2, inplace)
    (2): Linear(in_features=100, out_features=1, bias=True)
  )
)


In [27]:
if use_cuda:
    ATM_D = ATM_D.cuda()
    ATM_G = ATM_G.cuda()

criterion = nn.BCELoss()
optimizerD = optim.Adam(ATM_D.parameters(), lr=A_1, betas=(B_1, B_2))
optimizerG = optim.Adam(ATM_G.parameters(), lr=A_1, betas=(B_1, B_2))

one = torch.FloatTensor([1])
# mone = one * -1
mone = torch.FloatTensor([0])
if use_cuda:
    one = one.cuda()
    mone = mone.cuda()

data = inf_data_gen(alpha)
real_data = real_data_sampler(test_result)

In [28]:
for iteration in range(ITERS):
    ############################
    # (1) Update D network
    ###########################
    for p in ATM_D.parameters():  # reset requires_grad
        p.requires_grad = True  # they are set to False below in netG update

    for iter_d in range(CRITIC_ITERS):
        #_data = data.next()
        _data = next(data)
        sampled_data = torch.Tensor(_data)
        if use_cuda:
            sampled_data = sampled_data.cuda()
        sampled_data_v = autograd.Variable(sampled_data)

        #print(sampled_data_v.size())
         # train with sampled(fake data)
        fake = autograd.Variable(ATM_G(sampled_data_v).data)
        D_fake = ATM_D(fake)
        D_fake = D_fake.mean()
        D_fake.backward(one)
#         D_fake_error = criterion(D_fake,autograd.Variable(one))
#         D_fake_error.backward()

        #_realdata = real_data.next()
        _realdata = next(real_data)
        sampled_real_data = torch.Tensor(_realdata)
        if use_cuda:
            sampled_real_data = sampled_real_data.cuda()
        sampled_real_data_v = autograd.Variable(sampled_real_data)

        D_real = ATM_D(sampled_real_data_v)
        D_real = D_real.mean()
        D_real.backward(mone)
#         D_real_error = criterion(D_real,autograd.Variable(mone))
#         D_real_error.backward()

        #print(sampled_real_data_v.size())
        # train with gradient penalty
        gradient_penalty = calc_gradient_penalty(ATM_D, sampled_real_data_v.data, fake.data)
        gradient_penalty.backward()

        D_cost = D_fake - D_real + gradient_penalty
        Wasserstein_D = D_real - D_fake
        optimizerD.step()

#         dre, dfe = extract(D_real_error)[0], extract(D_fake_error)[0]

    for p in ATM_D.parameters():
        p.requires_grad = False  # to avoid computation
    ATM_G.zero_grad()

    #_data = data.next()
    _data = next(data)
    sampled_data = torch.Tensor(_data)
    if use_cuda:
        sampled_data = sampled_data.cuda()
    sampled_data_v = autograd.Variable(sampled_data)

    fake = ATM_G(sampled_data_v)
    G = ATM_D(fake)
    G = G.mean()
    G.backward(mone)
#     G_error = criterion(G,autograd.Variable(mone))
#     G_error.backward()
#     ge = extract(G_error)[0]
    G_cost = -G
    optimizerG.step()
    if iteration % 100 == 99:
        print(iteration)

  input = module(input)


99
199
299
399
499


KeyboardInterrupt: 