## Access Slack History + RTM API with graceful(ish) recovery

### Define schema using avro

* If you don't have Avro installed, follow the getting-started guide at https://avro.apache.org/docs/1.8.1/gettingstartedpython.html to install it.


In [32]:
%%writefile slack.avsc
{
"namespace": "slack_kafka.avro",
"type": "record",
"name": "slack_message_schema",
"fields" : [
    {
        "name": "user_id",
        "type": ["string", "null"],
        "doc": "Code given by slack api for username"
    },
    {
        "name": "text",
        "type": ["string", "null"],
        "doc": "The text of the message"
    },
    {
        "name": "channel",
        "type": ["string", "null"],
        "doc": "The code given by slack api for channel"
    },
    {
        "name": "timestamp",
        "type": "double",
        "doc": "Unix timestamp of record assigned by slack api (double: a 32-bit float cannot hold the sub-second digits)"}
 ],
"doc": "A Schema for storing Slack messages."
}

Overwriting slack.avsc


In [43]:
%%writefile SlackKafka.py
from slackclient import SlackClient
from kafka import KafkaClient, KafkaConsumer, KafkaProducer
import threading, logging, time
import avro.schema
import avro.io
import io
import sqlite3
import os

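'''The goal of this script is to define a simple
procedure that determines whether the topic
already exists. If it does, it pulls the
latest message timestamp from the sqlite
database and queries the slack channel history
for messages sent since that time. If it does
not, the whole channel history is sent. Either
way, the script then listens on the real time
messaging API for new messages.'''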

def updateTimestamp(db, team_id, ts):
    '''Store ts as the timestamp of the last processed message for team_id.

    db      -- path of the sqlite database file (created if missing)
    team_id -- team identifier used as the row key
    ts      -- timestamp value to store

    Any previous row for team_id is replaced, so the table holds at most
    one row per team written through this function. Returns None.
    '''
    conn = sqlite3.connect(db)
    try:
        c = conn.cursor()
        # IF NOT EXISTS also covers a database file that already exists on
        # disk but has no table yet (e.g. an empty file).
        c.execute('CREATE TABLE IF NOT EXISTS timestamps (team_id TEXT, ts FLOAT);')
        # Delete + insert are committed together, so a failure in between
        # leaves the previously stored timestamp untouched.
        c.execute('DELETE FROM timestamps WHERE team_id = ?', (team_id,))
        c.execute('INSERT INTO timestamps VALUES (?, ?)', (team_id, ts))
        conn.commit()
    finally:
        conn.close()
    return

def findMaxTimestamp(db, team_id):
    '''Return the largest stored timestamp for team_id as a float.

    db      -- path of the sqlite database file
    team_id -- team identifier to look up

    Returns 0.0 when nothing has been recorded yet: the database file is
    missing, the timestamps table is missing, or there is no row for
    team_id. A missing database file is not created by this lookup.
    '''
    if not os.path.exists(db):
        # sqlite3.connect() would create an empty file here; avoid that
        # side effect for a pure read.
        return 0.0
    conn = sqlite3.connect(db)
    try:
        c = conn.cursor()
        c.execute("SELECT name FROM sqlite_master "
                  "WHERE type = 'table' AND name = 'timestamps'")
        if c.fetchone() is None:
            return 0.0
        c.execute('SELECT MAX(ts) from timestamps where team_id = ?', (team_id,))
        # MAX() over zero rows yields a single row holding NULL (None).
        max_ts = c.fetchone()[0]
    finally:
        conn.close()
    if max_ts is None:
        return 0.0
    return float(max_ts)

# Connect to Slack Client
# The API token is the first line of token.txt, stripped of whitespace.
with open('token.txt', 'r') as token_file:
    token = token_file.readline().strip()
if not token:
    raise ValueError('token.txt must contain the Slack API token on its first line')
sc = SlackClient(token)

# Get the team ID
team_id = sc.api_call('team.info')['team']['id']

#Setup a kafka producer
producer = KafkaProducer(bootstrap_servers='localhost:9092')

#Setup avro
# The schema file is closed as soon as it has been read.
with open("slack.avsc", "rb") as schema_file:
    schema = avro.schema.parse(schema_file.read())
writer = avro.io.DatumWriter(schema)
# One shared buffer/encoder pair: everything written through `encoder`
# lands in `bytes_writer`.
bytes_writer = io.BytesIO()
encoder = avro.io.BinaryEncoder(bytes_writer)

# Check if a topic already exists with that team ID
kafka = KafkaClient('localhost:9092')
server_topics = kafka.topic_partitions
if server_topics:
    topics = server_topics.keys()
else:
    topics = []

# If the topic exists, only messages newer than the last processed
# timestamp are sent; otherwise the whole history is sent
# (max_ts = None means "no lower bound").
if team_id in topics:
    print('Topic Already Exists')
    max_ts = findMaxTimestamp('timestamps.sqlite', team_id)
    print("Querying slack channel history for missed messages")
    summary = "Sent {} missing messages to kafka"
else:
    print("Creating new topic for team {}".format(team_id))
    max_ts = None
    summary = "Sent {} historical messages to Kafka"

channels = [channel_dict['id'] for channel_dict in sc.api_call("channels.list")['channels']]
c = 0
newest_ts = None  # largest timestamp sent during this backfill
for channel in channels:
    # NOTE(review): one request per channel with no pagination; confirm the
    # API really returns the full history for this count.
    channel_history = sc.api_call("channels.history", channel=channel, count="100000")
    for message_dict in channel_history['messages']:
        if 'user' not in message_dict:
            continue
        ts = float(message_dict['ts'])
        if max_ts is not None and ts <= max_ts:
            continue
        message = {
            'user_id': message_dict['user'],
            'text': message_dict.get('text'),  # schema allows null text
            'channel': channel,
            'timestamp': ts,
        }
        # Empty the shared buffer first: getvalue() returns everything
        # written so far, so without this each payload would also carry
        # all previously encoded messages.
        bytes_writer.seek(0)
        bytes_writer.truncate()
        writer.write(message, encoder)
        raw_bytes = bytes_writer.getvalue()
        producer.send(team_id, raw_bytes)
        c += 1
        if newest_ts is None or ts > newest_ts:
            newest_ts = ts
print(summary.format(c))

# Persist progress so a restart before the first real-time message does
# not resend the history. Flush first so the timestamp is only recorded
# once the producer has pushed out its buffered messages.
if newest_ts is not None:
    producer.flush()
    updateTimestamp('timestamps.sqlite', team_id, newest_ts)

# Connect to real time messaging API and listen for messages
# Update timestamp after processing message
c = 0
print("Connecting to the real time messaging API")
if sc.rtm_connect():
    while True:
        # rtm_read() returns a list; handle every event in it rather than
        # only the first one.
        for event in sc.rtm_read() or []:
            # Skip events that are not complete user messages (missing any
            # of these keys) instead of failing with a KeyError.
            if not all(key in event for key in ('user', 'text', 'channel', 'ts')):
                continue
            message = {
                'user_id': event['user'],
                'text': event['text'],
                'channel': event['channel'],
                'timestamp': float(event['ts']),
            }
            # Empty the shared buffer first: getvalue() returns everything
            # written so far, so without this each payload would also carry
            # all previously encoded messages.
            bytes_writer.seek(0)
            bytes_writer.truncate()
            writer.write(message, encoder)
            raw_bytes = bytes_writer.getvalue()
            producer.send(team_id, raw_bytes)
            # Flush before recording progress so the stored timestamp never
            # runs ahead of what the producer has pushed out.
            producer.flush()
            updateTimestamp('timestamps.sqlite', team_id, message['timestamp'])
            c += 1
            print('Sent {} messages'.format(c))

        time.sleep(5)
else:
    print("Could not connect to the real time messaging API")

Overwriting SlackKafka.py


In [40]:
!python SlackKafka.py

Creating new topic for team T2BT8MVE3
Sent 553 historical messages to Kafka
Connecting to the real time messaging API
Sent 1 messages
Sent 2 messages
^C
Traceback (most recent call last):
  File "SlackKafka.py", line 124, in <module>
    time.sleep(5)
KeyboardInterrupt


In [41]:
# Show the timestamp rows the script has persisted so far.
# sqlite3 is imported here because the script's imports live in
# SlackKafka.py, not in the notebook kernel.
import sqlite3

conn = sqlite3.connect('timestamps.sqlite')
try:
    c = conn.cursor()
    c.execute('Select * FROM timestamps')
    print(c.fetchall())
finally:
    # Close the connection even if the query fails (e.g. table missing).
    conn.close()

[(u'T2BT8MVE3', 1475701368.000003)]


In [42]:
!python SlackKafka.py

Topic Already Exists
Querying slack channel history for missed messages
Sent 1 missing messages to kafka
Connecting to the real time messaging API
Sent 1 messages
Sent 2 messages
^C
Traceback (most recent call last):
  File "SlackKafka.py", line 124, in <module>
    time.sleep(5)
KeyboardInterrupt
