Text classification with attention and synthetic gradients.



Imports and set-up:

In [None]:
%tensorflow_version 2.x
import numpy as np
import tensorflow as tf
import pandas as pd
import subprocess
from sklearn.model_selection import train_test_split
import gensim
import re
import sys
import time

# TODO: actually implement distribution in the training loop
# Fallback: the strategy that is active right now. It is replaced further down
# when a TPU or GPU is detected.
strategy = tf.distribute.get_strategy()

use_mixed_precision = False
tf.config.run_functions_eagerly(False)
tf.get_logger().setLevel('ERROR')

# Probe for a TPU; a ValueError from the resolver is treated as "no TPU".
is_tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    is_tpu = True
except ValueError:
    is_tpu = False

if is_tpu:
    print('TPU available.')
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    if use_mixed_precision:
        policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)
else:
    print('No TPU available.')
    # `nvidia-smi -L` lists the visible GPUs. On a machine without the NVIDIA
    # driver the executable does not exist, and subprocess.run raises
    # FileNotFoundError; that case must fall back to the CPU instead of
    # aborting the whole set-up cell.
    try:
        completed = subprocess.run(
            ['nvidia-smi', '-L'],
            stdout=subprocess.PIPE)
        result = completed.stdout.decode("utf-8").strip()
        # A non-zero exit code, an empty listing or the driver's
        # "has failed" message all mean that no usable GPU was reported.
        gpu_found = (completed.returncode == 0
                     and result != ""
                     and "has failed" not in result)
    except FileNotFoundError:
        result = ""
        gpu_found = False

    if not gpu_found:
        print("No GPU available.")
    else:
        print(result)
        strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
            tf.distribute.experimental.CollectiveCommunication.NCCL)
        if use_mixed_precision:
            policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
            tf.keras.mixed_precision.experimental.set_policy(policy)

Downloading the data

In [None]:
# Download the Sentiment140 dataset
# The archive is fetched into ./data and unpacked there. `wget -nc` and
# `unzip -n` both skip files that already exist, so re-running this cell does
# not download or extract the data a second time.
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

Loading and splitting the data

In [None]:
# Fixed seed so that the shuffle and the train/test split are the same on
# every run; change the value to draw a different split.
SPLIT_SEED = 42

# The CSV has no header row, so the column names are supplied here.
sen140 = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv", encoding='latin-1',
    names=["target", "ids", "date", "flag", "user", "text"])
sen140 = sen140.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
sen140 = sen140[['text', 'target']]
features, targets = sen140.iloc[:, 0].values, sen140.iloc[:, 1].values

print("A random tweet\t:", features[0])

# split between train and test sets
x_train, x_test, y_train, y_test = train_test_split(features,
                                                    targets,
                                                    test_size=0.33,
                                                    random_state=SPLIT_SEED)
# Rescale the labels by 1/4 (presumably the raw targets are 0 and 4, which
# would map to 0.0 and 1.0 -- verify against the dataset).
y_train = y_train.astype("float32") / 4.0
y_test = y_test.astype("float32") / 4.0
# Add a trailing axis: every sample becomes a length-1 array holding the text.
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

Preprocessing data

In [4]:
def process_tweet(x):
    '''
    Normalizes a raw tweet and splits it into a list of tokens.

    Steps: strip and lower-case the text, replace every run of characters
    outside the allowed set with a space, put spaces around punctuation so
    each punctuation mark becomes its own token, collapse repeated
    whitespace and split on whitespace. The end marker "[&END&]" is appended
    as the last token.

    Args:
        x: the tweet as a string.

    Returns:
        A list of string tokens, always ending in "[&END&]".
    '''
    x = x.strip()
    x = x.lower()
    # The apostrophe used to sit *after* the character class, so the pattern
    # only matched disallowed characters that were directly followed by a
    # "'" and the filter was effectively a no-op. It now belongs to the
    # allowed set.
    # NOTE(review): this keeps contractions such as "don't" intact, as the
    # old effective behavior did; drop the "'" from the class if apostrophes
    # should be stripped as well.
    x = re.sub(r"[^a-zA-Z0-9üöäÜÖÄß\.,!\?\-%\$€\/' ]+", ' ', x)
    # Surround punctuation with spaces so it is tokenized separately.
    x = re.sub(r'([\.,!\?\-%\$€\/])', r' \1 ', x)
    x = re.sub(r'\s{2,}', ' ', x)
    x = x.split()
    x.append("[&END&]")
    return x


# Each sample is a length-1 array (see the expand_dims above), so the raw
# text is element 0 of every row.
tweets_train = [process_tweet(sample[0]) for sample in x_train]
tweets_test = [process_tweet(sample[0]) for sample in x_test]


# Building the initial vocab with all words from the training set
def add_or_update_word(_vocab, word):
    '''
    Registers one occurrence of `word` in `_vocab` (modified in place).

    `_vocab` maps a word to an `(index, count)` tuple. A new word gets the
    current size of the dict as its index and a count of 1; a known word keeps
    its index and has its count increased by one.
    '''
    if word not in _vocab:
        _vocab[word] = (len(_vocab), 1)
        return
    index, count = _vocab[word]
    _vocab[word] = (index, count + 1)


# Full vocabulary of the training set: word -> (index, occurrence count).
vocab_pre = {}
# "[&END&]" is for padding, "[&UNK&]" for unknown words; registering them
# first gives them the indices 0 and 1.
for special_token in ("[&END&]", "[&UNK&]"):
    add_or_update_word(vocab_pre, special_token)
for tweet in tweets_train:
    for word in tweet:
        add_or_update_word(vocab_pre, word)

# Limit the vocabulary to words that appear at least 3 times in the training
# data set, which reduces the vocab size to about 1/6th.
# This makes it harder for the model to overfit on words that may only appear
# in the training data, and generally teaches it to handle unknown words
# (more robust).
keys = vocab_pre.keys()
vocab = {"[&END&]": 0, "[&UNK&]": 1}
for key in keys:
    index, freq = vocab_pre[key]
    # index > 1 skips the two special tokens, which are already present.
    if index > 1 and freq >= 3:
        vocab[key] = len(vocab)


# Replace words that have been removed from the vocabulary with "[&UNK&]" in
# both the training and testing data
def filter_unknown(_in, _vocab):
    '''
    Replaces, in place, every token that is not a key of `_vocab` with
    "[&UNK&]".

    `_in` is a sequence of token lists; the inner lists are mutated and
    nothing is returned.
    '''
    for tweet in _in:
        for position, word in enumerate(tweet):
            if word not in _vocab:
                tweet[position] = "[&UNK&]"


# Both calls rewrite the token lists in place; filter_unknown returns nothing.
filter_unknown(tweets_train, vocab)
filter_unknown(tweets_test, vocab)

Using gensim word2vec to get a good word embedding.

In [5]:
# train the embedding
# Word2Vec is fitted on the filtered training tweets only. min_count=0 is
# presumably there so that no token (including "[&UNK&]" and "[&END&]") is
# dropped from the embedding vocabulary.
# NOTE(review): `size=` looks like the gensim 3.x keyword -- confirm the
# installed gensim version before upgrading.
embedding_dims = 128
embedding = gensim.models.Word2Vec(tweets_train,
                                   size=embedding_dims, min_count=0)

In [6]:
def tokenize(_in, _vocab):
    '''
    Converts token lists into lists of integer ids.

    Every word is looked up in `_vocab` and replaced by the `.index`
    attribute of the entry found there. Returns a new list of lists; the
    input is left untouched.
    '''
    return [[_vocab[word].index for word in tweet] for tweet in _in]


# The ids come from the trained Word2Vec vocabulary (not from `vocab`),
# presumably so that they match the rows of `embedding.wv.vectors`.
tokens_train = tokenize(tweets_train, embedding.wv.vocab)
tokens_test = tokenize(tweets_test, embedding.wv.vocab)

Creating modules and defining the model.

In [7]:
class SequenceCollapseAttention(tf.Module):
    '''
    Reduces a variable-length sequence to `num_out_entries` entries.

    A learned affine map turns every query vector into `num_out_entries`
    scores. After a softmax over the sequence axis, each output entry is the
    score-weighted sum of the `keys` sequence, so the result has a fixed,
    known length.
    '''

    def __init__(self,
                 num_out_entries,
                 initializer=tf.keras.initializers.HeNormal,
                 name=None):
        super().__init__(name=name)
        self.num_out_entries = num_out_entries
        # `initializer` is a class (or factory); it is instantiated here.
        self.initializer = initializer()
        self.is_built = False

    def _build(self, query_depth):
        '''Creates the scoring weights for queries of depth `query_depth`.'''
        self.weights = tf.Variable(
            self.initializer([query_depth, self.num_out_entries]),
            trainable=True)
        self.biases = tf.Variable(tf.zeros([self.num_out_entries]),
                                  trainable=True)
        self.is_built = True

    def __call__(self, keys, query):
        # The variables are created lazily, once the query depth is known.
        if not self.is_built:
            self._build(query.shape[-1])

        logits = tf.linalg.matmul(query, self.weights) + self.biases
        # Swap the last two axes so that the softmax (over the last axis)
        # normalizes across the sequence positions.
        attention = tf.nn.softmax(tf.transpose(logits, perm=(0, 2, 1)))
        return tf.linalg.matmul(attention, keys)


class WordEmbedding(tf.Module):
    '''
    Embedding lookup backed by a given embedding matrix.

    The matrix is stored in a variable that is frozen unless
    `trainable=True` is passed.
    '''

    def __init__(self, embedding_matrix, trainable=False, name=None):
        super().__init__(name=name)
        self.embedding = tf.Variable(embedding_matrix, trainable=trainable)

    def __call__(self, x):
        # x holds integer ids that select rows of the embedding matrix.
        vectors = tf.nn.embedding_lookup(self.embedding, x)
        return vectors


# Module-level placeholder. Nothing in this notebook rebinds it (no function
# declares `global testvar`), so it stays None.
testvar = None


class PositionalEncoding1D(tf.Module):
    '''
    Positional encoding as in the Attention Is All You Need paper. I hope.

    For experimentation, the weight by which the positional information is mixed
    into the input vectors is learned.

    Args:
        axis: index of the sequence axis of the input (default: second to last).
        base: base of the geometric frequency progression.
            NOTE(review): the paper uses 10000; the default of 1000 is kept
            unchanged here so existing results are not altered.
        name: optional module name.

    The size of the last input dimension must be even, otherwise the final
    reshape to [T, d] fails.
    '''

    def __init__(self, axis=-2, base=1000, name=None):
        super().__init__(name=name)
        self.axis = axis
        self.base = base
        # Learned scalar that scales the encoding before it is added.
        self.encoding_weight = tf.Variable([2.0], trainable=True)

    def __call__(self, x):
        d = tf.shape(x)[-1]          # feature depth
        T = tf.shape(x)[self.axis]   # sequence length
        # Frequencies base^(-2i/d) for i = 0 .. d/2 - 1.
        pos_enc = tf.range(0, d / 2, delta=1, dtype=tf.float32)
        pos_enc = (-2.0 / tf.cast(d, dtype=tf.float32)) * pos_enc
        base = tf.cast(tf.fill(tf.shape(pos_enc), self.base), dtype=tf.float32)
        pos_enc = tf.math.pow(base, pos_enc)
        # Repeat the frequencies for every position and scale them by the
        # (1-based) position index t.
        pos_enc = tf.expand_dims(pos_enc, axis=0)
        pos_enc = tf.tile(pos_enc, [T, 1])
        t = tf.expand_dims(tf.range(1, T+1, delta=1, dtype=tf.float32), axis=-1)
        pos_enc = tf.math.multiply(pos_enc, t)
        # Interleave sin and cos: stacking on a new last axis and reshaping
        # yields [sin_0, cos_0, sin_1, cos_1, ...] per position.
        pos_enc_sin = tf.expand_dims(tf.math.sin(pos_enc), axis=-1)
        pos_enc_cos = tf.expand_dims(tf.math.cos(pos_enc), axis=-1)
        pos_enc = tf.concat((pos_enc_sin, pos_enc_cos), axis=-1)
        pos_enc = tf.reshape(pos_enc, [T, d])
        return x + (pos_enc * self.encoding_weight)


class MLP_Block(tf.Module):
    '''
    A plain multilayer perceptron with batch normalization applied before
    every activation.

    `shapes` lists the output width of each layer. The weights are created
    lazily on the first call, once the input depth is known.
    '''

    def __init__(self,
                 shapes,
                 initializer=tf.keras.initializers.HeNormal,
                 name=None,
                 activation=tf.nn.swish,
                 trainable_batch_norms=False):
        super().__init__(name=name)
        layer_count = len(shapes)
        self.is_built = False
        self.shapes = shapes
        # `initializer` is a class (or factory); it is instantiated here.
        self.initializer = initializer()
        # Per-layer slots, filled in by _build().
        self.weights = [None] * layer_count
        self.biases = [None] * layer_count
        self.bnorms = [None] * layer_count
        self.activation = activation
        self.trainable_batch_norms = trainable_batch_norms

    def _build(self, x):
        '''Creates weights, biases and batch norms for an input like `x`.'''
        uses_crelu = self.activation == tf.nn.crelu
        fan_in = x.shape[-1]
        for layer, fan_out in enumerate(self.shapes):
            # crelu doubles the width of the previous layer's output; the
            # first layer sees the raw input and is therefore not affected.
            if uses_crelu and layer > 0:
                width = fan_in * 2
            else:
                width = fan_in
            self.weights[layer] = tf.Variable(
                self.initializer([width, fan_out]),
                trainable=True)
            self.biases[layer] = tf.Variable(tf.zeros([fan_out]),
                                             trainable=True)
            self.bnorms[layer] = tf.keras.layers.BatchNormalization(
                trainable=self.trainable_batch_norms)
            fan_in = fan_out
        self.is_built = True

    def __call__(self, x, training=False):
        if not self.is_built:
            self._build(x)

        h = x
        for weight, bias, bnorm in zip(self.weights, self.biases, self.bnorms):
            pre_activation = tf.linalg.matmul(h, weight) + bias
            h = self.activation(bnorm(pre_activation, training=training))

        return h


class SyntheticGradient(tf.Module):
    '''
    An implementation of synthetic gradients. When added to a model, this
    module will intercept incoming gradients and replace them by learned,
    synthetic ones.

    If you encounter NANs, try setting the sg_output_scale parameter to a lower
    value, or increase the number of initial_epochs or epochs.

    When the model using this module does not learn, the generator might be too
    simple, the sg_output_scale might be too low, the learning rate of the
    generator might be too large or too low, or the number of epochs might be
    too large or too low.

    If the number of initial epochs is too large, the generator can get stuck
    in a local minimum and fail to learn.

    The relative_generator_hidden_shapes list defines the shapes of the hidden
    layers of the generator as a multiple of its input dimension. For an affine
    transformation, pass an empty list.

    Constructor arguments:
        initializer: initializer class handed on to the generator's MLP blocks.
        activation: activation of the generator's hidden layers.
        relative_generator_hidden_shapes: see above. The default is a mutable
            list; it is only read in this class, never modified.
        learning_rate: learning rate of the generator's own Adam optimizer.
        epochs: generator training steps per incoming gradient.
        initial_epochs: extra generator training steps on the first gradient.
        sg_output_scale: constant factor applied to the generated gradients.
        name: optional module name.
    '''

    def __init__(self,
                 initializer=tf.keras.initializers.GlorotUniform,
                 activation=tf.nn.tanh,
                 relative_generator_hidden_shapes=[6, ],
                 learning_rate=0.01,
                 epochs=1,
                 initial_epochs=16,
                 sg_output_scale=1,
                 name=None):
        super().__init__(name=name)
        self.is_built = False
        self.initializer = initializer
        self.activation = activation
        self.relative_generator_hidden_shapes = relative_generator_hidden_shapes
        self.initial_epochs = initial_epochs
        self.epochs = epochs
        self.sg_output_scale = sg_output_scale
        # The generator is trained by its own optimizer, independently of the
        # optimizer of the surrounding model.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    def build(self, xy, dy):
        '''
        Builds the gradient generator on its first run, and trains on the first
        incoming batch of gradients for a number of epochs to avoid bad results
        (including NANs) in the first few batches where the generator still
        outputs bad approximations. To further reduce NANs due to bad gradients,
        a fixed scaler for the outputs of the generator is computed based on the
        first batch.

        Args:
            xy: generator input (activations concatenated with the labels).
            dy: the true incoming gradient for the same batch.
        '''
        if self.is_built:
            return

        if len(self.relative_generator_hidden_shapes) > 0:
            # Hidden widths are multiples of the generator's input depth.
            generator_shape = [
                               xy.shape[-1] * mult
                               for mult in
                               self.relative_generator_hidden_shapes]
            self.generator_hidden = MLP_Block(
                generator_shape,
                activation=self.activation,
                initializer=self.initializer,
                trainable_batch_norms=False)
        else:
            # No hidden layers: the generator is a single affine layer.
            self.generator_hidden = tf.identity

        # Linear output layer with the same depth as the incoming gradient.
        self.generator_out = MLP_Block(
            [dy.shape[-1]],
            activation=tf.identity,
            initializer=self.initializer,
            trainable_batch_norms=False)

        # calculate a static scaler for the generated gradients to avoid
        # overflows due to too large gradients
        # The scale is temporarily 1.0 so that generate_gradient() can be used
        # to measure the magnitude of the untrained generator's output.
        self.generator_out_scale = 1.0
        x = self.generate_gradient(xy) / self.sg_output_scale
        mag_y = tf.math.sqrt(tf.math.reduce_sum(tf.math.square(dy), axis=-1))
        mag_x = tf.math.sqrt(tf.math.reduce_sum(tf.math.square(x), axis=-1))
        # Mean ratio of true to generated gradient magnitude over all leading
        # axes. NOTE(review): nothing guards against mag_x being zero here.
        mag_scale = tf.math.reduce_mean(mag_y / mag_x,
                                        axis=tf.range(0, tf.rank(dy) - 1))
        self.generator_out_scale = tf.Variable(mag_scale, trainable=False)

        # train for a number of epochs on the first run, by default 16, to avoid
        # bad results in the beginning of training.
        for i in range(self.initial_epochs):
            self.train_generator(xy, dy)

        self.is_built = True

    def generate_gradient(self, x):
        '''
        Just an MLP, or an affine transformation if the hidden shape in the
        constructor is set to be empty.

        The raw output is multiplied by the scale computed in build() and by
        the constant sg_output_scale.
        '''
        x = self.generator_hidden(x)
        out = self.generator_out(x)
        out = out * self.generator_out_scale
        return out * self.sg_output_scale

    def train_generator(self, x, target):
        '''
        Gradient descent for the gradient generator. This is called every time a
        gradient comes in, although in theory (especially with deeper gradient
        generators) once the gradients are modeled sufficiently, it could be OK
        to stop training on incoming gradients, thus fully decoupling the lower
        parts of the network from the upper parts relative to this SG module.

        Args:
            x: generator input.
            target: the true gradient that the generator should reproduce.
        '''
        with tf.GradientTape() as tape:
            # Squared error between the true and the generated gradient,
            # summed over the last axis.
            l2_loss = target - self.generate_gradient(x)
            l2_loss = tf.math.reduce_sum(tf.math.square(l2_loss), axis=-1)
            # l2_loss = tf.math.sqrt(l2_dist)
            # NOTE(review): the gradient is requested while the tape context
            # is still open.
            grads = tape.gradient(l2_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    @tf.custom_gradient
    def sg(self, x, y):
        '''
        In the forward pass it is essentially a no-op (identity). In the
        backwards pass it replaces the incoming gradient by a synthetic one.
        '''
        x = tf.identity(x)

        def grad(dy):
            # concat x and the label to be inputs for the generator:
            xy = self.concat_x_and_y(x, y)

            # The generator is created lazily, on the first backward pass.
            if not self.is_built:
                self.build(xy, dy)

            # train the generator on the incoming gradient:
            for i in range(self.epochs):
                self.train_generator(xy, dy)

            # return the gradient. The second return value is the gradient for y
            # which should be zero since we only need y (labels) to generate the
            # synthetic gradients
            dy = self.generate_gradient(xy)
            return dy, tf.zeros(tf.shape(y))

        return x, grad

    def __call__(self, x, y):
        # Identity on x in the forward pass; y is only used in the backward pass.
        return self.sg(x, y)

    def concat_x_and_y(self, x, y):
        '''
        Probably an overly complex yet incomplete solution to a rather small
        inconvenience.
        Inconvenience: The gradient generators take the output of the last
        module AND the target/labels of the network as inputs. But those two
        tensors can be of different shapes. The obvious solution would be to
        manually reshape the targets so they can be concatenated with the
        outputs of the past state. But because I wanted this SG module to be as
        "plug-and-play" as possible, I tried to attempt automatic reshaping.

        Should work for 1d->1d, and 1d-sequence -> 1d, possibly 1d seq->seq,
        unsure about the rest.

        Returns x and the reshaped/tiled y concatenated along the last axis.
        '''
        # insert as many dims before the last dim of y to give it the same rank
        # as x
        amount = tf.math.maximum(tf.rank(x) - tf.rank(y), 0)
        new_shape = tf.concat((tf.shape(y)[:-1],
                               tf.tile([1], [amount]),
                               [tf.shape(y)[-1]]), axis=-1)
        y = tf.reshape(y, new_shape)

        # tile the added dims such that x and y can be concatenated
        # In order to tile only the added dims, I need to set the dimensions
        # with a length of 1 (except the last) to the length of the
        # corresponding dimensions in x, while setting the rest to 1.
        # This is waiting to break.
        mask = tf.cast(tf.math.less_equal(tf.shape(y),
                                          tf.constant([1])), dtype=tf.int32)
        # ignore the last dim
        mask = tf.concat([mask[:-1], tf.constant([0])], axis=-1)

        zeros_to_ones = tf.math.subtract(
            tf.ones(tf.shape(mask), dtype=tf.int32),
            mask)
        # has ones where there is a one in the shape, now the 1s are set to the
        # length in x
        mask = tf.math.multiply(mask, tf.shape(x))
        # add ones to all other dimensions to preserve their shape
        mask = tf.math.add(zeros_to_ones, mask)
        # tile
        y = tf.tile(y, mask)
        return tf.concat((x, y), axis=-1)


class FlattenL2D(tf.Module):
    '''
    Merges the last two dimensions of a tensor into one; all leading
    dimensions are kept as they are.
    '''

    def __init__(self, name=None):
        super().__init__(name=name)

    def __call__(self, x):
        shape = tf.shape(x)
        merged = shape[-1] * shape[-2]
        new_shape = tf.concat((shape[:-2], [merged]), axis=-1)
        return tf.reshape(x, new_shape)


# Initializer class (not an instance) handed to the model's blocks below; each
# block instantiates it itself.
initializer = tf.keras.initializers.HeNormal


class SentimentAnalysisWithAttention(tf.Module):
    '''
    Sentiment classifier: embedding -> positional encoding -> attention that
    collapses the sequence to a fixed length -> dense layers -> sigmoid.

    Synthetic gradient modules sit after the attention stage and after the
    dense stack. Uses the module-level `embedding` (trained Word2Vec model)
    and `initializer`.
    '''

    def __init__(self, name=None):
        super().__init__(name=name)

        # Structure and the idea behind it:
        # 1: The input sequence is embedded and is positionally encoded.
        # 2.1: An MLP block ('query') computes scores for the following
        #      attention layer for each entry in the sequence. I.e., it
        #      decides which words are worth a closer look.
        # 2.2: An attention layer selects n positionally encoded word
        #      embeddings from the input sequence based on the learned queries.
        # 3: The result is flattened into a tensor of known shape and a number
        #    of dense layers compute the final classification.

        self.embedding = WordEmbedding(embedding.wv.vectors)
        self.batch_norm = tf.keras.layers.BatchNormalization(trainable=True)
        self.pos_enc = PositionalEncoding1D()
        self.query = MLP_Block([256, 128], initializer=initializer)
        self.attention = SequenceCollapseAttention(num_out_entries=9,
                                                   initializer=initializer)
        self.flatten = FlattenL2D()
        self.dense = MLP_Block([512, 256, 128, 64],
                               initializer=initializer,
                               trainable_batch_norms=True)
        # Output block: a single unit with a sigmoid activation.
        self.denseout = MLP_Block([1],
                                  initializer=initializer,
                                  activation=tf.nn.sigmoid,
                                  trainable_batch_norms=True)

        # Synthetic gradient modules for the various layers.
        # sg_query is created but currently not applied in __call__.
        self.sg_query = SyntheticGradient(relative_generator_hidden_shapes=[9])
        self.sg_attention = SyntheticGradient()
        self.sg_dense = SyntheticGradient()

    def __call__(self, x, y=tf.constant([]), training=False):
        '''
        Runs the model on a batch of token ids `x`. The labels `y` are only
        handed to the synthetic gradient modules.
        '''
        embedded = self.embedding(x)
        encoded = self.batch_norm(self.pos_enc(embedded), training=training)
        scores = self.query(encoded, training=training)
        # scores = self.sg_query(scores, y)           # SG (disabled)
        selected = self.flatten(self.attention(encoded, scores))
        selected = self.sg_attention(selected, y)     # SG
        hidden = self.dense(selected, training=training)
        hidden = self.sg_dense(hidden, y)             # SG
        return self.denseout(hidden, training=training)


# Instantiating the model does not create all weights yet: the attention and
# MLP blocks build their variables lazily on the first call.
model = SentimentAnalysisWithAttention()

In [8]:
class BatchGenerator(tf.keras.utils.Sequence):
    '''
    Serves (inputs, labels) batches of a fixed size. Within a batch, every
    sequence is padded with `padding` up to the length of the longest
    sequence of that batch only, so batches differ in width.

    Samples beyond the last complete batch are never served.
    '''

    def __init__(self, inputs, labels, padding, batch_size):
        self.inputs = inputs
        self.labels = labels
        self.padding = padding
        self.batch_size = batch_size
        # self.on_epoch_end()

    def __len__(self):
        # Number of complete batches.
        return len(self.inputs) // self.batch_size

    def __getitem__(self, index):
        start_index = index * self.batch_size
        end_index = start_index + self.batch_size
        rows = [self.inputs[i] for i in range(start_index, end_index)]
        max_length = max((len(row) for row in rows), default=0)

        out_x = np.empty([self.batch_size, max_length], dtype='int32')
        out_y = np.empty([self.batch_size, 1], dtype='float32')
        for offset, tweet in enumerate(rows):
            out_y[offset] = self.labels[start_index + offset]
            length = len(tweet)
            # Tokens first, then padding up to the batch width.
            out_x[offset, :length] = tweet
            out_x[offset, length:] = self.padding
        return out_x, out_y

Training the model

In [None]:
def train_validation_loop(model_caller, data_generator, epochs, metrics=[]):
    '''
    Runs `model_caller` over every batch of `data_generator` for `epochs`
    epochs while printing a single-line progress report.

    Args:
        model_caller: callable invoked as
            `model_caller(x_batch, y_batch, metrics=metrics)`; its return
            value is stored as the predictions of that batch.
        data_generator: indexable collection of `(x_batch, y_batch)` pairs
            that supports `len()`.
        epochs: number of passes over the data.
        metrics: objects with `name`, `result()` and `reset_states()`. They
            are reported in the progress line and reset after every epoch.
            The default list is only read, never modified.

    Returns:
        The predictions of the last epoch, concatenated along the first axis.
        An empty float32 array if no batch was processed (zero epochs or an
        empty generator).
    '''
    batch_time = -1
    # Defined up front so that zero epochs do not leave the name unbound.
    predictions = []
    for epoch in range(epochs):
        start_e = time.time()
        start_p = time.time()
        num_batches = len(data_generator)
        predictions = [None] * num_batches
        for b in range(num_batches):
            start_b = time.time()

            x_batch, y_batch = data_generator[b]
            predictions[b] = model_caller(x_batch, y_batch, metrics=metrics)

            # progress output
            elapsed_t = time.time() - start_b
            # Exponential moving average of the time per batch.
            if batch_time != -1:
                batch_time = 0.05 * elapsed_t + 0.95 * batch_time
            else:
                batch_time = elapsed_t
            # Refresh the line about once per second and after the last batch.
            if int(time.time() - start_p) >= 1 or b == (num_batches - 1):
                start_p = time.time()
                # Batch b has just finished, so num_batches - b - 1 remain.
                eta = int((num_batches - b - 1) * batch_time)
                ela = int(time.time() - start_e)
                out_string = "\rEpoch %d/%d,\tbatch %d/%d,\telapsed: %d/%ds" % (
                    (epoch + 1), epochs, b + 1, num_batches, ela, ela + eta)
                for metric in metrics:
                    out_string += "\t %s: %f" % (metric.name,
                                                 float(metric.result()))
                sys.stdout.write(out_string)
                sys.stdout.flush()
        for metric in metrics:
            metric.reset_states()
        sys.stdout.write("\n")
    if len(predictions) == 0:
        # np.concatenate rejects an empty list.
        return np.empty((0,), dtype="float32")
    return np.concatenate(predictions)


def trainer(model, loss, optimizer):
    '''
    Creates the training step for `model`.

    Args:
        model: callable invoked as `model(x_batch, y_batch, training=True)`;
            must expose `trainable_variables`.
        loss: callable invoked as `loss(y_batch, predictions)`.
        optimizer: optimizer providing `apply_gradients`.

    Returns:
        A `tf.function` with the signature
        `training_step(x_batch, y_batch, ..., metrics=[])` that performs one
        optimization step, updates the given metrics and returns the
        predictions of the batch.
    '''
    @tf.function(experimental_relax_shapes=True)
    def training_step(x_batch,
                      y_batch,
                      model=model,
                      loss=loss,
                      optimizer=optimizer,
                      metrics=[]):
        # model/loss/optimizer are bound as default arguments of the step.
        with tf.GradientTape() as tape:
            predictions = model(x_batch, y_batch, training=True)
            losses = loss(y_batch, predictions)
            # NOTE(review): the gradient is requested while the tape context
            # is still open.
            grads = tape.gradient(losses, model.trainable_variables)

        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        for metric in metrics:
            metric.update_state(y_batch, predictions)
        return predictions

    return training_step


# The model's output block (`denseout`) already applies a sigmoid, so the
# predictions are probabilities in (0, 1), not logits. With from_logits=True
# the loss and the cross-entropy metric would apply a second sigmoid on top
# of them, hence from_logits=False.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
metrics = (tf.keras.metrics.BinaryCrossentropy(from_logits=False),
           tf.keras.metrics.BinaryAccuracy())
batch_size = 512
epochs = 4

# Id of the end marker in the Word2Vec vocabulary; it doubles as the padding
# token of the batches.
padding = embedding.wv.vocab["[&END&]"].index
training_generator = BatchGenerator(tokens_train,
                                    y_train,
                                    padding,
                                    batch_size=batch_size)

train_validation_loop(trainer(model, loss, optimizer),
                      training_generator,
                      epochs,
                      metrics)

Testing it on validation data

In [None]:
def validator(model):
    '''
    Creates the evaluation step for `model`.

    Returns a `tf.function` that runs the model with `training=False`,
    updates the given metrics and returns the predictions of the batch. No
    weights are updated by this step itself.
    '''
    @tf.function(experimental_relax_shapes=True)
    def validation_step(x_batch, y_batch, model=model, metrics=[]):
        # The labels are passed on to the model as well; they are used for
        # the metrics below.
        predictions = model(x_batch, y_batch, training=False)
        for metric in metrics:
            metric.update_state(y_batch, predictions)
        return predictions

    return validation_step


# Reuses `padding`, `batch_size` and the `metrics` tuple defined in the
# training cell.
testing_generator = BatchGenerator(tokens_test,
                                   y_test,
                                   padding,
                                   batch_size=batch_size)

# A single pass over the test batches; `predictions` receives the
# concatenated per-batch outputs of that pass.
predictions = train_validation_loop(validator(model),
                                    testing_generator,
                                    1,
                                    metrics)

Get some example results from the test data.

In [None]:
# Running extremes: lowest score ("evil"), highest score ("angelic") and the
# score closest to 0.5 ("cool").
most_evil_tweet = None
most_evil_evilness = 1
most_cool_tweet = None
most_cool_coolness = 1
most_angelic_tweet = None
most_angelic_angelicness = 0

# `predictions` is already one concatenated array with one row per sample;
# flatten it explicitly to one score per sample instead of relying on
# np.concatenate to do so as a side effect.
y_pred = np.asarray(predictions).reshape(-1)
for i, judgement in enumerate(y_pred):
    # 0 for a score of exactly 0.5, 1 for a score of 0 or 1.
    polarity = abs(judgement - 0.5) * 2

    # On ties the later tweet wins (non-strict comparisons).
    if judgement >= most_angelic_angelicness:
        most_angelic_angelicness = judgement
        most_angelic_tweet = x_test[i]
    if judgement <= most_evil_evilness:
        most_evil_evilness = judgement
        most_evil_tweet = x_test[i]
    if polarity <= most_cool_coolness:
        most_cool_coolness = polarity
        most_cool_tweet = x_test[i]


print("The evilest tweet known to humankind:\n\t", most_evil_tweet)
print("Evilness: ", 1.0-most_evil_evilness)
print("\n")
print("The most angelic tweet any mortal has ever laid eyes upon:\n\t",
      most_angelic_tweet)
print("Angelicness: ", most_angelic_angelicness)
print("\n")
print("This tweet is too cool for you, don't read:\n\t", most_cool_tweet)
print("Coolness: ", 1.0-most_cool_coolness)