# Slackformer: Basic transformer on WBSlack data

Following [this tutorial](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/text_classification_with_transformer.ipynb#scrollTo=HaNCFrk9xtv2) from Keras

In [16]:
%load_ext rich

from rich import progress
import typing
import random
import json
import sklearn
import pathlib
import itertools
import dataclasses
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed Python's `random` module; `train_test_split` below shuffles with it,
# so a fixed seed keeps the split repeatable across fresh runs.
random.seed(0)

# Root directory of the Slack data. `Channel.path` resolves each channel as
# a subdirectory of this path named after the channel.
SLACK_DATA_DIR = pathlib.Path('..', '..', '..', 'cleanstart', 'slackml', 'data', 'wb-slack')

@dataclasses.dataclass
class Message:
    """One user-authored Slack message, as built by ``Channel.messages``."""

    # Value of the message's 'ts' key in the channel JSON.
    ts: str
    # Channel the message was read from (presumably its name -- TODO confirm
    # against the code that constructs Message).
    channel: str
    # Value of the message's 'text' key.
    text: str
    # Value of the message's 'user' key; used as the classification target.
    user_id: str

def dirsize(path) -> int:
    """Return the combined size in bytes of every regular file under *path*.

    The directory is walked recursively; entries that are not files
    (e.g. subdirectories) contribute nothing to the total.
    """
    total = 0
    for entry in path.glob('**/*'):
        if entry.is_file():
            total += entry.stat().st_size
    return total
        
@dataclasses.dataclass(order=True)
class Channel:
    """A Slack channel, identified by the name of its data directory.

    ``order=True`` generates comparison methods, so channels sort by
    ``name`` (the only field).
    """

    name: str

    @property
    def path(self):
        """Directory under ``SLACK_DATA_DIR`` that holds this channel's files."""
        return SLACK_DATA_DIR / self.name

    @property
    def size(self) -> int:
        """Total size in bytes of all files under ``path``."""
        return dirsize(self.path)

    @property
    def messages(self) -> list[Message]:
        """Parse every ``*.json`` file in the channel directory into messages.

        Files are processed in sorted filename order. Only entries whose
        ``type`` is ``'message'`` and that have no ``subtype`` are kept.

        Returns:
            The channel's user-authored messages, each tagged with this
            channel's name.

        Raises:
            KeyError: if a kept entry lacks ``type``, ``ts``, ``text`` or
                ``user``.
        """
        channel_messages = []
        for f in sorted(self.path.glob('*.json')):
            # Close each file promptly; a full export can contain thousands.
            # Assumes the JSON files are UTF-8 encoded -- TODO confirm.
            with open(f, encoding='utf-8') as fh:
                raw_messages = json.load(fh)
            for message in raw_messages:
                if message['type'] != 'message' or message.get('subtype'):
                    # actual messages from users don't have subtype set
                    continue
                channel_messages.append(Message(
                    ts=message['ts'],
                    # Use this channel's own name rather than a module-level
                    # `channel` variable, which may be undefined or unrelated.
                    channel=self.name,
                    text=message['text'],
                    user_id=message['user'],
                ))
        return channel_messages

def all_slack_channels() -> list[Channel]:
    """Return a name-sorted ``Channel`` for each directory in ``SLACK_DATA_DIR``.

    Only directories are considered channels; plain files sitting next to
    the channel directories are skipped.
    """
    return sorted(
        Channel(d.name) for d in SLACK_DATA_DIR.glob('*') if d.is_dir()
    )

def all_messages(channels: list[Channel]) -> list[Message]:
    """Concatenate the messages of every channel, in the order given.

    Iteration is wrapped in ``progress.track`` so a progress bar is shown
    while the channels are read.
    """
    collected = []
    for chan in progress.track(channels):
        collected.extend(chan.messages)
    return collected

def get_top_users(all_m: list[Message], min_msg_count=50) -> dict[str, int]:
    """Map each sufficiently active user id to an integer class label.

    Counts messages per ``user_id`` and keeps only users with at least
    ``min_msg_count`` messages, so the model is trained solely on users
    with enough examples.

    Args:
        all_m: messages to count; only ``user_id`` is read from each.
        min_msg_count: minimum number of messages a user needs to be kept.

    Returns:
        ``{user_id: label}``. Labels are the user's rank by descending
        message count (ties keep first-seen order). Because the kept users
        form a prefix of that ranking, the labels are exactly
        ``0 .. len(result) - 1``.

    Prints the number of unique users and the number of kept users.
    """
    from collections import Counter

    counts = Counter(m.user_id for m in all_m)
    print("unique users: ", len(counts))
    # most_common() sorts by count, highest first, keeping insertion order
    # for equal counts.
    top_users = {
        user_id: rank
        for rank, (user_id, n_msgs) in enumerate(counts.most_common())
        if n_msgs >= min_msg_count
    }
    print("user labels in this model: ", len(top_users))
    return top_users
    
class TrainTest(typing.TypedDict):
    """Shape of the dict returned by ``train_test_split``."""

    # Messages assigned to the training partition.
    train: list[Message]
    # Remaining messages, held out for evaluation.
    test: list[Message]

def train_test_split(messages: list[Message], train_frac: float) -> TrainTest:
    """Randomly partition *messages* into train and test portions.

    A copy of the input is shuffled with the module-level ``random``
    generator (the caller's list is left untouched). The first
    ``int(len(messages) * train_frac)`` shuffled items become ``'train'``
    and the remainder ``'test'``.
    """
    cut = int(len(messages) * train_frac)
    shuffled = list(messages)
    random.shuffle(shuffled)
    train_part, test_part = shuffled[:cut], shuffled[cut:]
    return {'train': train_part, 'test': test_part}

def load_data():
    """Load Slack messages and split them into train and test sets.

    Steps:
      1. List all channels, dropping any whose name contains 'sentry' or
         'deploy-build'.
      2. Read every message from the remaining channels.
      3. Keep only messages written by users that ``get_top_users`` retains
         (its default minimum message count applies).
      4. Shuffle and split 80% / 20%.

    Returns:
        ``(train, test, top_users)`` where ``top_users`` maps user id to
        the integer label used as the training target.
    """
    excluded = ('sentry', "deploy-build")
    channels = [
        chan for chan in all_slack_channels()
        if not any(tag in chan.name for tag in excluded)
    ]
    messages = all_messages(channels)

    top_users = get_top_users(messages)
    kept = [msg for msg in messages if msg.user_id in top_users]

    split = train_test_split(kept, 0.8)
    train = split['train']
    test = split['test']
    print("train: ", len(train), "test: ", len(test))
    return train, test, top_users

def usernames():
    """Load the user-id -> display-name mapping from ``users.json``.

    The file is read from the current working directory and must contain a
    JSON list of objects that each have ``"id"`` and ``"name"`` keys.

    Returns:
        dict mapping each entry's ``"id"`` to its ``"name"``.

    Raises:
        FileNotFoundError: if ``users.json`` is not in the working directory.
        KeyError: if an entry lacks ``"id"`` or ``"name"``.

    Prints the number of users loaded.
    """
    # Context manager closes the file even if parsing fails.
    # Assumes the file is UTF-8 encoded -- TODO confirm.
    with open("users.json", 'r', encoding="utf-8") as fh:
        entries = json.load(fh)
    names = {e["id"]: e["name"] for e in entries}
    print("total users: ", len(names))
    return names

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


## Define a transformer

In [5]:
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward net, each with a residual
    connection, dropout and layer normalization.

    Args:
        embed_dim: size of the last axis of the block's input and output
            (the residual additions require the two to match).
        num_heads: number of attention heads.
        ff_dim: width of the hidden layer in the feed-forward network.
        rate: dropout rate applied after attention and after the
            feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Two-layer feed-forward net that projects back to embed_dim so its
        # output can be added to its input.
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to *inputs*.

        Args:
            inputs: tensor whose last axis has size ``embed_dim``
                (presumably (batch, seq_len, embed_dim) -- TODO confirm).
            training: forwarded to the dropout layers. Defaults to ``None``
                so the layer can be called without the flag and Keras
                decides from its own call context.

        Returns:
            Tensor with the same shape as *inputs*.
        """
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of rows in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids *x* and add an embedding of each position index."""
        # Position ids 0 .. n-1, where n is the size of x's last axis.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [8]:
# TODO(Shawn): what is `channel` in the Channel class supposed to be?
# For as long as `Channel.messages` reads a module-level name `channel`,
# that name must be defined before any messages are loaded; skipping this
# cell then makes loading fail with a NameError on `channel`.
channel = Channel('ceo-ama')
# Displays the on-disk size in bytes of the channel's directory.
channel.size

In [17]:
# Shuffled train/test messages plus the user-id -> integer-label mapping.
train, test, top_users = load_data()
# Raw message strings: used to fit the vocabulary and as the model's input.
train_text = [msg.text for msg in train]

Output()

unique users:  1083
user labels in this model:  215
train:  117805 test:  29452


## Fit embedding to training text

In [18]:
from tensorflow.keras.layers import TextVectorization
def custom_standardization(input_data):
  """Lower-case *input_data* with ``tf.strings.lower`` and return it.

  NOTE(review): this function is not passed to the ``TextVectorization``
  layer built later in this cell, so it currently has no effect on the
  model -- confirm whether that is intended.
  """
  return tf.strings.lower(input_data)
  # Disabled leftover (unreachable; it refers to `lowercase`, `re` and
  # `string`, none of which are defined here):
  #stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  #return tf.strings.regex_replace(stripped_html,
      #                            '[%s]' % re.escape(string.punctuation), '')


# Maximum vocabulary size kept by the vectorizer.
vocab_size = 20000
# Number of tokens every message is padded or truncated to. The
# position-embedding table is sized with `maxlen`, which must match the
# vectorizer's output length, so it is derived from `sequence_length`
# instead of being repeated as a second literal.
sequence_length = 200
maxlen = sequence_length

# Use the text vectorization layer to normalize, split, and map strings to
# integers. A fixed output length is set because messages differ in length.
# NOTE: `custom_standardization` is NOT passed via `standardize=`, so the
# layer applies its own default standardization.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only.
vectorize_layer.adapt(train_text)

## Define model

In [19]:
embed_dim = 64  # Embedding size for each token
num_heads = 6  # Number of attention heads
ff_dim = 64  # Hidden layer size in feed forward network inside transformer
# One output unit per user label. Labels in `top_users` run from 0 to
# len(top_users) - 1, so deriving the width keeps the model in step with
# the data and with the minimum-message cutoff.
num_classes = len(top_users)

# The model takes raw strings; vectorization happens inside the graph.
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(inputs)
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(x)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per message.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

In [20]:
# Integer class label for each training message, looked up by author id.
y_train = [top_users[msg.user_id] for msg in train]
# Held-out set: raw text as inputs, author labels as targets.
x_val = [msg.text for msg in test]
y_val = [top_users[msg.user_id] for msg in test]

In [21]:
## Train model!

In [22]:
# The targets are integer labels, hence the sparse categorical loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Currently a single epoch (an earlier note here read "up to 5 epochs").
history = model.fit(
    train_text,
    y_train,
    batch_size=64,
    epochs=1,
    validation_data=(x_val, y_val),
)



In [23]:
# Save the trained model. The path is relative to the working directory;
# pick a unique, descriptive name so earlier runs are not overwritten.
model.save("my_test_model")



INFO:tensorflow:Assets written to: my_test_model/assets


INFO:tensorflow:Assets written to: my_test_model/assets


In [None]:
# more fun ideas to try: 

# tried:
# - bigger batch size! meh, speed not much different
# - more epochs
# - more attention heads
# - one more dense layer

# to try:
# - other loss
# - stop words
# - punctuation?!
# - different min cutoff
