# Testing VAE

In [11]:
import spacy
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import functional as F
import pandas as pd

In [2]:
import vae

In [3]:
v = vae.TweetData(debug=True, max_df=0.1)
v.get_tweet_count_vecs()
x_tr_tensor, x_test_tensor = v.to_tensor_dataset()

In [4]:
# Create a DataLoader to handle the data
train_loader = DataLoader(torch.Tensor(v.X_train.todense()), batch_size=128, shuffle=True)
test_loader = DataLoader(torch.Tensor(v.X_test.todense()), batch_size=128, shuffle=False)

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = vae.VAE(vocab=v.get_vocab_size(), num_components=20)


In [47]:
class VAE(nn.Module):
	"""
	Should rename -- PFA for Poisson Factor Analysis
	"""

	def __init__(self, vocab, num_components=20, prior_mean=0, prior_var=1):
		"""
		Inputs
		--------
		vocab<int>: the size of the vocabulary

		This model only has the variational layer, then the output
		to the reconstruction. At this point, there are no hidden layers.
		"""
		super(VAE, self).__init__()
		self.num_components = num_components

		self.prior_mean = prior_mean
		self.prior_var = prior_var

		self.enc_mu = nn.Linear(vocab, num_components, bias=False)
		self.enc_logvar = nn.Linear(vocab, num_components, bias=False)
		self.W_tilde = torch.rand(num_components, vocab)
		self.pois_nll = nn.PoissonNLLLoss(log_input=False)
		self.softplus = nn.Softplus()

	def reparameterize(self, mu, logvar):
		std = torch.exp(0.5*logvar)
		eps = torch.randn_like(std)
		return mu + eps*std

	def forward(self, x):
		mu = self.enc_mu(x)
		logvar = self.enc_logvar(x)

		s_tilde = self.reparameterize(mu, logvar)

		s = self.softplus(s_tilde)
		W = self.softplus(self.W_tilde)

		return s, W, mu, logvar

	def get_topic_dist(self, x):
		"""
		When it comes to looking at the norm, we want to calculate the 
		probability that a certain sample belongs to each topic.
		"""
		s, _ = self.encode(x)
		W = self.parameters() # TODO - figure out which parameters to add.
		norm = torch.norm(s @ W, p = 1) # Return the L1 norm
		# TODO -- add in the multinomial distribution.

		# TODO - need to calculate elementwise product.
		return s @ W / norm

	def _kl_divergence(self, mean, logvar):
		# see Appendix B from VAE paper:
		# Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
		# https://arxiv.org/abs/1312.6114
		# 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
		# BUT...
		# Code extended to handle a more informative prior
		# Referencing this derivation found here:
		# https://stanford.edu/~jduchi/projects/general_notes.pdf
		# Assume diagonal matrices for variance
		KLD = -0.5 * torch.sum(1 + logvar - (mean).pow(2) - logvar.exp())#, axis=0)

		return KLD

	def loss_function(self, recon_x, x, mu, logvar):
		KLD = self._kl_divergence(mu, logvar)
		PNLL = self.pois_nll(x, recon_x)
		return torch.mean(PNLL + KLD)

	@torch.no_grad()
	def reconstruct(self, X):
		s, W, mu, logvar = self.forward(X)

		return s @ W

	def fit(self, X, n_epochs=20, lr=1e-3, print_rate=10):
		"""
		Fit the model to the data, X. Assume X is in count vector format as a tensor.
		"""
		# train_loader = DataLoader(X, batch_size=128)
		optimizer = optim.Adam(self.parameters(), lr=lr)
		for epoch in range(n_epochs):
			epoch_train_loss = 0
			epoch_test_loss = 0
			for batch_idx, data in enumerate(train_loader):
				self.train()
				optimizer.zero_grad()
				s, W, mu, logvar = self.forward(data)
				recon_batch = s @ W # Calculate the reconstructed matrix
				loss = self.loss_function(recon_batch, data, mu, logvar)
				# loss.backward()
				epoch_train_loss += loss.item()
				optimizer.step()
				if batch_idx % print_rate == 0:
					print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
						epoch, batch_idx * len(data), len(train_loader.dataset),
						100. * batch_idx / len(train_loader),
						loss.item() / len(data)))
			print('===> Epoch: {} Average Loss: {:.4f}'.format(
				epoch, epoch_train_loss / len(train_loader.dataset)
			))


In [48]:
model = VAE(vocab=v.get_vocab_size(), num_components=200)

In [49]:
model.fit(train_loader, n_epochs=20)

===> Epoch: 0 Average Loss: 23.0861
===> Epoch: 1 Average Loss: 23.1249
===> Epoch: 2 Average Loss: 23.1097
===> Epoch: 3 Average Loss: 23.0999
===> Epoch: 4 Average Loss: 23.1015
===> Epoch: 5 Average Loss: 23.0976
===> Epoch: 6 Average Loss: 23.0907
===> Epoch: 7 Average Loss: 23.0890
===> Epoch: 8 Average Loss: 23.0873
===> Epoch: 9 Average Loss: 23.1133
===> Epoch: 10 Average Loss: 23.1008
===> Epoch: 11 Average Loss: 23.1157
===> Epoch: 12 Average Loss: 23.1097
===> Epoch: 13 Average Loss: 23.1082
===> Epoch: 14 Average Loss: 23.1116
===> Epoch: 15 Average Loss: 23.1021
===> Epoch: 16 Average Loss: 23.0721
===> Epoch: 17 Average Loss: 23.0934
===> Epoch: 18 Average Loss: 23.0904
===> Epoch: 19 Average Loss: 23.1101


In [50]:
model.reconstruct(torch.Tensor(v.X_train.todense()))

tensor([[155.2921, 156.8593, 154.7162,  ..., 150.3946, 156.2766, 152.8408],
        [159.8372, 161.2811, 157.3672,  ..., 159.7819, 159.5841, 160.2900],
        [155.6090, 158.1974, 153.2449,  ..., 154.0242, 154.4203, 153.4816],
        ...,
        [148.8502, 147.7955, 147.4792,  ..., 146.8992, 147.4423, 147.2446],
        [155.6352, 157.2970, 154.6425,  ..., 157.7994, 155.2149, 156.3728],
        [149.6803, 150.9980, 148.7019,  ..., 149.6415, 149.3845, 147.7869]])

In [43]:
torch.Tensor(v.X_test.todense())

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [52]:
v.X_test.shape

(2000, 1841)

In [None]:
mu.shape

torch.Size([128, 20])

In [None]:
torch.sum(1+logvar - torch.log(torch.Tensor(1))- 1*((0-mu).pow(2)-logvar.exp()))

tensor(5121.4717, grad_fn=<SumBackward0>)

In [None]:
mu.shape

torch.Size([128, 20])

In [None]:
s.shape

torch.Size([128, 20])

In [None]:
W.shape

torch.Size([20, 176])

In [None]:
s(torch.Tensor(1))

tensor([0.6931])

In [None]:
for data in train_loader:
	model.forward(data)

In [53]:
import pandas as pd
path = '../data/san_francisco/2018-02.csv'
df = pd.read_csv(path)
df

Unnamed: 0,id,author_id,text,geo,created_at,lat,lon
0,968999128641323008,25624940,@Volker_E Am I hallucinating that you are walk...,{'place_id': '5a110d312052166f'},2018-02-28 23:59:30,37.708075,-122.514926
1,968999100757680128,8888,There’s just something weird about living in N...,{'place_id': '5ef5b7f391e30aff'},2018-02-28 23:59:23,37.845953,-122.324818
2,968999065273774080,1557223812,Tomorrow’s March sheeeesh I’m basically 24 😕 t...,{'place_id': '5ecbd073f39c00fa'},2018-02-28 23:59:14,37.592632,-122.160814
3,968999056537088000,1164993320,Wednesday really be draining 6-6 school day 😴,{'place_id': '5ecbd073f39c00fa'},2018-02-28 23:59:12,37.592632,-122.160814
4,968999038493245440,18650764,@remedy415 @Brycesavoy510 This hella dope,{'place_id': 'ab2f2fac83aa388d'},2018-02-28 23:59:08,37.699279,-122.342660
...,...,...,...,...,...,...,...
206690,958852576660738048,45073046,Which one of you flatfoots stole Red Panda’s u...,{'place_id': '1a5fd1b93128bb9e'},2018-02-01 00:00:43,37.678709,-122.130814
206691,958852515361046529,315133994,@1113JD @Nikkiii_88 @CoryBooker Him! Nah! But ...,{'place_id': '5ef5b7f391e30aff'},2018-02-01 00:00:28,37.845953,-122.324818
206692,958852485124317184,559198723,Trying not to cry out of pain during my 3 hour...,{'place_id': '5a110d312052166f'},2018-02-01 00:00:21,37.708075,-122.514926
206693,958852474856554496,33448971,@takkubun 😊 Thank you again ❤️,{'place_id': '99e789320196ef6a'},2018-02-01 00:00:19,37.827015,-122.315509


In [None]:
from torch.utils.data import Dataset
class TweetsDataset(Dataset):
	"""Tweet Dataset"""
	def __init__(self, path, agg_count=1000):
		"""
		Input:
			path: file name of preprocessed count vector JSON.
			agg_count: the number of tweets to aggregate by.
			sample_rate: the number of total samples that we want to get
		"""
		self.path = path
		self.agg_count = agg_count
		self.sample_rate = sample_rate
		self.data = pass # TODO - read JSON

	def __len__(self):
		return len(data.dates)*self.sample_rate
	
	def __getitem__(self, idx):

		# Randomly sample a date

		# Randomly sample agg_count tweets and agg
		


In [7]:
set_ex = list({1,2,3,4,5,6})

In [10]:
set_ex[20%5]

1

In [12]:
# Testing out functionality
df = pd.read_csv("../data/san_francisco/2018-02.csv")

In [27]:
from datetime import datetime
dates = [datetime.strptime(d,'%Y-%m-%d %H:%M:%S').date() for d in
            df['created_at']]

In [28]:
df['date'] = dates

In [22]:
dates = list(set(dates))

In [25]:
d = dates[0]

In [26]:
d

datetime.date(2018, 2, 7)

In [38]:
import numpy as np
arr = df[df['date'] == d]['text']

In [49]:
arr.sample(5).to_numpy()

array(['@Crypto__Honey @marcan42 @siavashg The ENTIRE POINT of blockchains is that adding a transaction is an arms race, that it can’t be hijacked by a single party amassing more computing power. If it were efficient to solve, the Russians or Mafia would hijack it and it’d collapse. Inefficiency is MANDATORY',
       'we really got a hustle player in the all star game again lmaooooooooo',
       '@callaghannz @SaaStrAnnual Great insight being shared here @saastr',
       'My eyelid keeps twitching🙄', 'Bet u said Wtf'], dtype=object)

In [74]:
df_t = pd.DataFrame({
    "col":[np.array([1,1]),np.array([2,2]),np.array([3,3]),np.array([4,4])]
})

In [79]:
df_t['col'].sum()

array([10, 10])