# Slackformer: Basic transformer on WBSlack data

Following [this tutorial](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/text_classification_with_transformer.ipynb#scrollTo=HaNCFrk9xtv2) from Keras

In [1]:
import weave
import slack_trainer

In [2]:
SLACK_DATA_DIR = "/Users/timothysweeney/Documents/jan_1_2022_slack_dump"
raw_data = slack_trainer.load_data(SLACK_DATA_DIR)

In [3]:
config = {
    "min_msg_count"  : 100, 
    "train_frac"     : 0.8,
    "use_baseline"   : True,
    "vocab_size"     : 20000, 
    "sequence_length": 200,
    "embed_dim"      : 64,
    "num_heads"      : 6,
    "ff_dim"         : 64,
    "optimizer"      : "adam",
    "loss"           : "sparse_categorical_crossentropy",
    "metrics"        : ["accuracy"],
    "batch_size"     : 64, 
    "epochs"         : 1
}

In [4]:
# Process the data
data = slack_trainer.process_data(users         = raw_data["users"], 
                                  messages      = raw_data["messages"], 
                                  min_msg_count = config["min_msg_count"], 
                                  train_frac    = config["train_frac"])

In [5]:
# Build & Compile the Model
if config["use_baseline"]:
    model = slack_trainer.make_baseline_model(data, 
                                               vocab_size      = config["vocab_size"], 
                                               sequence_length = config["sequence_length"])
else:
    model = slack_trainer.make_transformer_model(data, 
                                                   vocab_size      = config["vocab_size"], 
                                                   sequence_length = config["sequence_length"],
                                                   embed_dim       = config["embed_dim"],
                                                   num_heads       = config["num_heads"],
                                                   ff_dim          = config["ff_dim"])
model.compile(optimizer = config["optimizer"], 
                       loss      = config["loss"], 
                       metrics   = config["metrics"])

2022-08-11 01:42:53.164308: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Train the model
results = slack_trainer.fit_model(model, data, 
                                  batch_size = config["batch_size"], 
                                  epochs     = config["epochs"])



In [7]:
# Package model for inference
packaged_model = slack_trainer.package_model(model, data)

In [12]:
# Save or Publish Model
saved_model = weave.save(packaged_model)

In [13]:
from weave.ecosystem import keras as weave_keras
weave_keras.call_string(saved_model, "i love weave")



In [16]:
res = packaged_model.predict(["asdf"]).tolist()[0][0]
if type(res) == bytes:
    print(res.decode('utf-8'))
type(type(res))

Gourab De


type

In [10]:
import tensorflow as tf
inputs = tf.keras.Input(shape=(1,), dtype="string")
indicies = tf.keras.layers.Reshape(target_shape=(1,))(slack_trainer._argmax_layer()(model(inputs)))

vocab = data["users"].sort_values("model_id")["real_name"].tolist()
layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)

outputs = layer(indicies)

# Our end to end model
end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [12]:
res = packaged_model.predict(["adf"]).tolist()[0][0]
type(res)



bytes

In [52]:
import tensorflow as tf

class argmax_layer(tf.keras.layers.Layer):
    def __init__(self):
        super(argmax_layer, self).__init__()

    def call(self, inputs):
        return tf.math.argmax(inputs, axis=1)

# A string input
inputs = tf.keras.Input(shape=(1,), dtype="string")
indicies = tf.keras.layers.Reshape(target_shape=(1,))(argmax_layer()(model(inputs)))

vocab = data["users"].sort_values("model_id")["real_name"].tolist()
layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)

outputs = layer(indicies)


# Our end to end model
end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)



# # data = tf.constant([[0], [1]])
# # vocab = ["aa", "b", "c", "d"]
# layer(model.predict(["I love weave"]).argmax())
# end_to_end_model = 

In [55]:
end_to_end_model.predict([["I love weave"], ["customers"]])



array([[b'Kenna Zilis'],
       [b'Hugo Ponte']], dtype=object)

In [54]:
end_to_end_model.outputs

[<KerasTensor: shape=(None, 1) dtype=string (created by layer 'string_lookup_9')>]

In [31]:
data["users"].loc[model.predict(["I love weave"]).argmax()]



user_id                                            U016KFFRXFH
count                                                      762
real_name                                    Arman Harutyunyan
image_72     https://avatars.slack-edge.com/2020-07-14/1264...
model_id                                                     0
Name: 101, dtype: object

In [42]:
model.predict(["adsfasdfasdfads"]).argmax()



42

In [8]:
import weave
weave.publish(model, name="tim_transformer_demo")



INFO:tensorflow:Assets written to: /var/folders/wv/j7nq4t9j7tl2cq_2mz2mhbj40000gn/T/tmpdenqikis/assets


INFO:tensorflow:Assets written to: /var/folders/wv/j7nq4t9j7tl2cq_2mz2mhbj40000gn/T/tmpdenqikis/assets
[34m[1mwandb[0m: Adding directory to artifact (/var/folders/wv/j7nq4t9j7tl2cq_2mz2mhbj40000gn/T/tmpdenqikis)... Done. 0.1s


In [9]:
m = weave.get("wandb-artifact://timssweeney/weave_ops/tim_transformer_demo:v0")

['Arman Harutyunyan',
 'Taylor Hersom',
 'John Qian',
 'Matthew Spencer',
 'Igor Veksler',
 'Ani Safaryan',
 'Thomas Capelle',
 'Noah Wisnia',
 'Sanyam Bhutani',
 'Nir Podoleanu',
 'Rajne Kumari',
 'Kat Markfield',
 'Noah Luna',
 'Altay Guvench',
 'Lukas Biewald',
 'Dave Davies',
 'Yan-David Erlich',
 'Spencer Pearson',
 'Mic lee',
 'Michelle Bergeron',
 'Ben Sherman',
 'Dave La Chasse',
 'Seph Mard',
 'Min-Young Wu',
 'Melissa Benitez',
 'Jason Zhao',
 'Rebecca Li',
 'Mohammad Bakir',
 'Saurav Maheshkar',
 'Andrew Truong',
 'Felix Vialva',
 'Jackson Rapaich',
 'Pratik Sutar',
 'Kevin Stofan',
 'Esteban Gonzalez',
 'Parul Pandey',
 'Justin Melbourne',
 'Elaina Hodgkin',
 'Will Goldfarb',
 'Ivan',
 'Allan Stevenson',
 'Jeff Raubitschek',
 'Bryan Bischof',
 'Carey Phelps',
 'Jack Bailin',
 'Frida de Sigley',
 'Scott Pearson',
 'Hugo Ponte',
 'Matt Mirick',
 'Ember Campbell',
 'Leslie',
 'Ken Lee',
 'Kyle Goyette',
 'Justin Tenuto',
 'Edward.li',
 'Tim Sweeney',
 'Graham Whitelaw',
 'Stac



<tf.Tensor: shape=(), dtype=string, numpy=b'Kenna Zilis'>

In [10]:
m.type

KerasModel(inputs_type=TypedDict(property_types={'0': KerasTensorType(shape=TypedDict(property_types={'0': NoneType(), '1': <Const Number() 1>}), datatype_enum=<Const Number() 7>)}), outputs_type=TypedDict(property_types={'0': KerasTensorType(shape=TypedDict(property_types={'0': NoneType(), '1': <Const Number() 143>}), datatype_enum=<Const Number() 1>)}))

In [46]:
import re
def remove_user_mention(text):
    regex = r'\<.*\>'
    return re.sub(regex, '', text)

def remove_multiple_space(text):
    regex = r'\s+'
    return re.sub(regex, ' ', text)

def remove_emojies(text):
    regex = r':\w+:'
    return re.sub(regex, ' ', text)

def process_text(text):
    return remove_multiple_space(remove_emojies(remove_user_mention(text))).strip()

In [49]:
users = {}
messages = []
for m in human_messages[:100]:
    text = process_text(m["text"])
    if text != '':
        messages.append([user_id, text])
        user_id = m["user"]
        if user_id not in users:
            users[user_id] = {
                'count': 0,
                'real_name': m['user_profile']['real_name'],
                'image_72': m['user_profile']['image_72'],
            }
        users[user_id]['count'] += 1

messages[:10]
users

In [26]:
import weave
import pandas as pd
from weave.ecosystem import slack



In [27]:
def all_slack_channels() -> list[str]:
    return sorted(Channel(d.name) for d in SLACK_DATA_DIR.glob('*'))

In [24]:
slack_obj = slack.open_slack_export(SLACK_DATA_DIR)

In [28]:
# channels = weave.use(slack_obj.channels())
channels = all_slack_channels()
channels

AttributeError: 'str' object has no attribute 'glob'

In [17]:
data = weave.use(.channel(TRAINING_CHANNEL).messages())
data_df = pd.DataFrame(columns=["text", "user"], data=[[d._text, d._user_id] for d in data])

In [18]:
data_df

Unnamed: 0,text,user
0,:wave:,U0243LC8DQB
1,I learned a LOT at SKO about the framing of ou...,U01CQAAT7MX
2,Do you have a way that you would want to be co...,U70S0JGS2
3,Hey Lukas - thanks for the response. I think f...,U01CQAAT7MX
4,If you had extra money sitting around and you ...,U02N3UD54AX
...,...,...
64,I think focus on a single user persona really ...,U70S0JGS2
65,Sounds great thanks :slightly_smiling_face:,U03ECPDHRH6
66,"If folks have questions I didn’t get to today,...",U0243LC8DQB
67,Someone asked in the all hands how much runway...,U70S0JGS2


In [16]:
%load_ext rich

from rich import progress
import typing
import random
import json
import sklearn
import pathlib
import itertools
import dataclasses
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def all_slack_channels() -> list[str]:
    return sorted(Channel(d.name) for d in SLACK_DATA_DIR.glob('*'))

def all_messages(channels: list[Channel]) -> list[Message]:
    ms = []
    for c in progress.track(channels):
        ms += c.messages
    return ms

# frequency count of users so we can threshold on the minimum number
# of messages required for the user to be labeled by our model
def get_top_users(all_m: list[Message], min_msg_count=50) -> dict:
    user_map = {}
    for m in all_m:
        if m.user_id in user_map:
            user_map[m.user_id] += 1
        else:
            user_map[m.user_id] = 1
    print("unique users: ", len(user_map))
    um_sort = sorted(user_map.items(), key=lambda x: x[1], reverse=True)
    # sort by message count in dataset -
    top_users = {}
    for i, u in enumerate(um_sort):
        if u[1] >= min_msg_count:
            top_users[u[0]] = i
    print("user labels in this model: ", len(top_users))
    return top_users
    
class TrainTest(typing.TypedDict):
    train: list[Message]
    test: list[Message]

def train_test_split(messages: list[Message], train_frac: float) -> TrainTest:
    n_train = int(len(messages) * train_frac)
    messages = list(messages)
    random.shuffle(messages)
    return {
        'train': messages[:n_train],
        'test': messages[n_train:]
    }

def load_data():
    channels = all_slack_channels()
    # filter out sentry + deploy-builds
    channels = [c for c in channels if not 'sentry' in c.name and not "deploy-build" in c.name]
    all_m = all_messages(channels)
    
    # filter for top 215 users (at least 50 messages)
    top_users = get_top_users(all_m)
    all_m = [m for m in all_m if m.user_id in top_users]
    
    split = train_test_split(all_m, 0.8)
    train, test = split['train'], split['test']
    print("train: ", len(train), "test: ", len(test))
    return train, test, top_users

def usernames():
    # pull in user dictionary
    import json
    u = json.load(open("users.json", 'r'))
    names = {}
    for e in u:
        names[e["id"]] = e["name"]
    print("total users: ", len(names))
    return names

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


## Define a transformer

In [5]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [8]:
# TODO: Shawn what is channel in the class def supposed to be to start? 
# if one doesn't run this block, the class definition fails on "channel"
channel = Channel('ceo-ama')
channel.size

In [27]:
train, test, top_users = load_data()
train_text = [t.text for t in train]
y_train =[top_users[t.user_id] for t in train]
x_val = [t.text for t in test]
y_val = [top_users[t.user_id] for t in test]

Output()

unique users:  1083
user labels in this model:  215
train:  117805 test:  29452


## Fit embedding to training text

In [28]:
from tensorflow.keras.layers import TextVectorization
def custom_standardization(input_data):
  return tf.strings.lower(input_data)
  #stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  #return tf.strings.regex_replace(stripped_html,
      #                            '[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 20000
# TODO these should be the same
sequence_length = 200
maxlen=200

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
#text_ds = train.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

## Define baseline + transformer models

Defaults to training a super simple toy baseline_model.

In [24]:
def baseline_model():
    inputs = layers.Input(shape=(1,), dtype=tf.string)
    x = vectorize_layer(inputs)
    x = layers.Dense(20, activation="relu")(x) # was 20
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(215, activation="softmax")(x)
    return keras.Model(inputs=inputs, outputs=outputs)

def transformer_model():
    embed_dim = 64  # Embedding size for each token
    num_heads = 6  # Number of attention heads
    ff_dim = 64  # Hidden layer size in feed forward network inside transformer

    inputs = layers.Input(shape=(1,), dtype=tf.string)
    x = vectorize_layer(inputs)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(x)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x) 
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(215, activation="softmax")(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    

In [29]:
curr_model = baseline_model() # trains to about 4.11% in 10-15 sec

# switch to this call to train a more legit model
# takes 5-10 mins per epoch on CPU 
#curr_model = transformer_model()

## Train model!

In [30]:
curr_model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
history = curr_model.fit(
    # probably increase epochs for transformer model to about 5
    train_text, y_train, batch_size=64, epochs=1, validation_data=(x_val, y_val)
)



In [23]:
## Don't forget to save
### try a unique descriptive name ;)
curr_model.save("my_test_model")



INFO:tensorflow:Assets written to: my_test_model/assets


INFO:tensorflow:Assets written to: my_test_model/assets


In [None]:
# more fun ideas to try: 

# tried:
# - bigger batch size! meh, speed not much different
# - more epochs: overfits by 29/e5
# - more attention heads: seems to be most impactful
# - one more dense layer: meh

# to try eventually maybe:
# - other loss: nah
# - stop words: would be smart, nontrivial
# - punctuation?!
# - different min cutoff