In [None]:
# Mount Google Drive inside the Colab VM; the data, output and code paths
# configured further down all live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import functools
import time
import os
import csv
import sys
import pandas as pd

%tensorflow_version 1.x #colab magic line to select Tensorflow 1.x
import numpy as np
import scipy.sparse as sparse
from sklearn.decomposition import NMF
import tensorflow as tf
import tensorflow_probability as tfp

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.x #colab magic line to select Tensorflow 1.x`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.


In [None]:
# Check which GPU device (if any) TensorFlow can see before starting training.
tf.test.gpu_device_name() # the recorded run returned '/device:GPU:0'

'/device:GPU:0'

In [None]:
# --- Model and reproducibility settings -------------------------------------
num_topics = 25
tf.set_random_seed(0)  # graph-level TensorFlow seed
random_state = np.random.RandomState(0)  # NumPy RNG reused by later cells
# True: initialise document/topic locations from an NMF fit of the counts.
# False: initialise them from exponentiated standard-normal draws.
pre_initialize_parameters = True

# --- Adam optimizer and training-loop settings ------------------------------
eps = 1  # handed to AdamOptimizer as `epsilon`
learningrate = 0.0001
batch_size = 512
max_steps = 300000
print_steps = 25000  # the ELBO is printed every `print_steps` steps

# --- Paths (all on the mounted Google Drive) --------------------------------
# Input folder (~data/97/input); counts.npz is read from here.
data_dir = '/content/gdrive/My Drive/tbip_BSP/data/97/input'
# Output folder; the number of iterations is appended directly to the folder
# name (e.g. ".../output300000") so runs of different length do not collide.
save_dir = '/content/gdrive/My Drive/tbip_BSP/data/97/output' + str(max_steps)
# Folder containing tbip.py; change as needed.
py_file = '/content/gdrive/My Drive/tbip_BSP/code/'

# Create the output folder, including any missing parents. exist_ok=True makes
# re-running this cell a no-op instead of needing a separate isdir() check.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Make the folder that holds tbip.py importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
tbip_dir = os.path.abspath(py_file)
if tbip_dir not in sys.path:
  sys.path.append(tbip_dir)
import tbip

In [None]:
# Load the sparse count matrix; its shape supplies the number of documents
# (rows) and vocabulary words (columns).
counts = sparse.load_npz(os.path.join(data_dir, 'counts.npz'))
num_documents, num_words = counts.shape

if not pre_initialize_parameters:
    # Random start: exponentiating standard-normal draws keeps every entry
    # strictly positive. Documents are drawn first, then topics, so the
    # shared RNG is consumed in a fixed order.
    document_draws = random_state.randn(num_documents, num_topics)
    initial_document_loc = np.float32(np.exp(document_draws))
    topic_draws = random_state.randn(num_topics, num_words)
    initial_objective_topic_loc = np.float32(np.exp(topic_draws))
else:
    # Warm start: factorise the counts with NMF and use the two factor
    # matrices as initial document intensities and objective topics.
    nmf_model = NMF(n_components=num_topics,
                    init='random',
                    random_state=0,
                    max_iter=500)
    document_factors = nmf_model.fit_transform(counts)
    topic_factors = nmf_model.components_
    # Small offset so that no entry is exactly zero.
    positivity_offset = 1e-3
    initial_document_loc = np.float32(document_factors + positivity_offset)
    initial_objective_topic_loc = np.float32(topic_factors + positivity_offset)

In [None]:
# Build the input pipeline from the files in `data_dir`; the helper returns
# the batch iterator together with author/vocabulary metadata and the corpus
# dimensions.
pipeline_outputs = tbip.build_input_pipeline(
    data_dir,
    batch_size,
    random_state,
    counts_transformation='nothing')
(iterator, author_weights, vocabulary, author_map,
 num_documents, num_words, num_authors) = pipeline_outputs

# NOTE: `counts` is rebound here. It previously named the sparse matrix loaded
# from counts.npz and from now on names the per-batch value produced by the
# iterator.
next_batch = iterator.get_next()
document_indices, counts, author_indices = next_batch

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


In [None]:
# Variational parameters and distributions for the four latent quantities.
# Every *_scale is obtained by passing an unconstrained "*_scale_logit"
# variable through softplus, which keeps the scale positive.
# NOTE(review): a graph-level seed is set in the config cell and several
# variables below use random initializers, so adding or reordering ops here
# may change the initial draws -- confirm before restructuring.

# Create Lognormal variational family for document intensities (theta).
# The location lives in log space, hence the log of the positive initial
# values computed earlier.
document_loc = tf.get_variable(
    "document_loc",
    initializer=tf.constant(np.log(initial_document_loc)))
document_scale_logit = tf.get_variable(
    "document_scale_logit",
    shape=[num_documents, num_topics],
    initializer=tf.initializers.random_normal(mean=-2, stddev=1.),
    dtype=tf.float32)
document_scale = tf.nn.softplus(document_scale_logit)
document_distribution = tfp.distributions.LogNormal(
    loc=document_loc,
    scale=document_scale)

# Create Lognormal variational family for objective topics (beta).
# Same construction as for theta, with shape [num_topics, num_words].
objective_topic_loc = tf.get_variable(
    "objective_topic_loc",
    initializer=tf.constant(np.log(initial_objective_topic_loc)))
objective_topic_scale_logit = tf.get_variable(
    "objective_topic_scale_logit",
    shape=[num_topics, num_words],
    initializer=tf.initializers.random_normal(mean=-2, stddev=1.),
    dtype=tf.float32)
objective_topic_scale = tf.nn.softplus(objective_topic_scale_logit)
objective_topic_distribution = tfp.distributions.LogNormal(
    loc=objective_topic_loc,
    scale=objective_topic_scale)

# Create Gaussian variational family for ideological topics (eta).
# No initializer is passed for either variable, so whatever default
# tf.get_variable applies is used.
ideological_topic_loc = tf.get_variable(
    "ideological_topic_loc",
    shape=[num_topics, num_words],
    dtype=tf.float32)
ideological_topic_scale_logit = tf.get_variable(
    "ideological_topic_scale_logit",
    shape=[num_topics, num_words],
    dtype=tf.float32)
ideological_topic_scale = tf.nn.softplus(ideological_topic_scale_logit)
ideological_topic_distribution = tfp.distributions.Normal(
    loc=ideological_topic_loc,
    scale=ideological_topic_scale)

# Create Gaussian variational family for ideal points (x).
# One location/scale pair per author; the location again relies on the
# default initializer, while the scale logit starts from a standard normal.
ideal_point_loc = tf.get_variable(
    "ideal_point_loc",
    shape=[num_authors],
    dtype=tf.float32)
ideal_point_scale_logit = tf.get_variable(
    "ideal_point_scale_logit",
    initializer=tf.initializers.random_normal(mean=0, stddev=1.),
    shape=[num_authors],
    dtype=tf.float32)
ideal_point_scale = tf.nn.softplus(ideal_point_scale_logit)
ideal_point_distribution = tfp.distributions.Normal(
    loc=ideal_point_loc,
    scale=ideal_point_scale)


In [None]:
# Approximate ELBO for the current batch. The arguments are collected in the
# positional order tbip.get_elbo is called with: batch tensors, author
# weights, the four variational distributions, then the corpus/batch sizes.
elbo_inputs = (
    counts,
    document_indices,
    author_indices,
    author_weights,
    document_distribution,
    objective_topic_distribution,
    ideological_topic_distribution,
    ideal_point_distribution,
    num_documents,
    batch_size,
)
elbo = tbip.get_elbo(*elbo_inputs)

# The optimizer minimises, so the training loss is the negated ELBO.
loss = -elbo

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [None]:
# Adam update step that minimises the negative ELBO.
optim = tf.train.AdamOptimizer(learning_rate=learningrate, epsilon=eps)
train_op = optim.minimize(loss)

# Summary tensors of the form loc + scale^2 / 2, evaluated and saved by the
# training loop.
document_mean = document_loc + document_scale ** 2 / 2
neutral_mean = objective_topic_loc + objective_topic_scale ** 2 / 2

# The two ideological variants share one variance term and differ only in the
# sign applied to the ideological location.
# NOTE(review): presumably these correspond to ideal points of +1 and -1 --
# confirm against tbip.py.
combined_half_variance = (objective_topic_scale ** 2 +
                          ideological_topic_scale ** 2) / 2
positive_mean = (objective_topic_loc +
                 ideological_topic_loc +
                 combined_half_variance)
negative_mean = (objective_topic_loc -
                 ideological_topic_loc +
                 combined_half_variance)

In [None]:
# Per-step ELBO values. Despite the name (and the "loss" CSV column below)
# these are ELBO values, i.e. the negative of the optimised loss.
loss_vals = []
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# Tensors evaluated at every checkpoint, in the order they are unpacked below.
checkpoint_fetches = [
    document_mean, neutral_mean, negative_mean, positive_mean,
    ideal_point_loc, objective_topic_scale, objective_topic_loc,
    ideological_topic_loc, ideological_topic_scale,
]
checkpoint_every = 50000  # steps between on-disk snapshots

start_time = time.time()
for step in range(max_steps):
  _, elbo_val = sess.run([train_op, elbo])
  # Running average of wall-clock seconds per step since the loop started.
  duration = (time.time() - start_time) / (step + 1)
  loss_vals.append(elbo_val)
  is_last_step = step == max_steps - 1

  if is_last_step or step % print_steps == 0:
    print("Step: {:>3d} ELBO: {:.3f} ({:.3f} sec/step)".format(
        step, elbo_val, duration))

  if is_last_step or (step + 1) % checkpoint_every == 0:
    # Snapshot the current parameter summaries; each checkpoint overwrites
    # the files written by the previous one.
    (document_topic_mean, neutral_topic_mean, negative_topic_mean,
     positive_topic_mean, ideal_point_mean,
     ots, otl, itl, its) = sess.run(checkpoint_fetches)
    arrays_to_save = [
        ("document_topic_mean.npy", document_topic_mean),
        ("neutral_topic_mean.npy", neutral_topic_mean),
        ("negative_topic_mean.npy", negative_topic_mean),
        ("positive_topic_mean.npy", positive_topic_mean),
        ("ideal_point_mean.npy", ideal_point_mean),
        ("objective_topic_scale.npy", ots),
        ("objective_topic_loc.npy", otl),
        ("ideological_topic_loc.npy", itl),
        ("ideological_topic_scale.npy", its),
    ]
    for file_name, array in arrays_to_save:
      np.save(os.path.join(save_dir, file_name), array)
    # Persist the full ELBO trace collected so far as a one-column CSV.
    loss_df = pd.DataFrame(data={"loss": loss_vals})
    loss_df.to_csv(os.path.join(save_dir, 'loss_values.csv'), sep=',', index=False)


Step:   0 ELBO: -12745670.000 (3.057 sec/step)
Step: 25000 ELBO: -7278692.000 (0.122 sec/step)
Step: 50000 ELBO: -5975661.000 (0.122 sec/step)
Step: 75000 ELBO: -5733517.500 (0.122 sec/step)
Step: 100000 ELBO: -5202743.000 (0.122 sec/step)
Step: 125000 ELBO: -5153035.000 (0.122 sec/step)
Step: 150000 ELBO: -4787533.000 (0.122 sec/step)
Step: 175000 ELBO: -4827905.000 (0.122 sec/step)
Step: 200000 ELBO: -4772831.000 (0.122 sec/step)
Step: 225000 ELBO: -4744893.000 (0.122 sec/step)
Step: 250000 ELBO: -4737532.500 (0.122 sec/step)
Step: 275000 ELBO: -5018617.000 (0.122 sec/step)
Step: 299999 ELBO: -4685226.500 (0.122 sec/step)
