In [1]:
# Add the path to the slack-pack/code/ folder in order to be able to import nlp module
import sys, os

# Parent of the current working directory, with a trailing separator.
# Built with os.path (instead of splitting on a literal '/') so the result is
# also correct on platforms whose path separator is not '/'. On POSIX the value
# is identical to the previous string-based construction.
NLP_PATH = os.path.join(os.path.dirname(os.path.abspath('.')), '')

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if NLP_PATH not in sys.path:
    sys.path.append(NLP_PATH)

## Principal Runner Code

In [2]:
from nlp.text import extractor as xt
from nlp.text.window import Window
from nlp.models.similarity_calculation import SimilarTopicCalculator
from nlp.models.message_classification import MessageClassifier

from nlp.geometry.repr import list_corpora, GloVe
from nlp.geometry.dist import cosine, jensen_shannon
from nlp.grammar.tokenizer import MessageTokenizer


In [2]:
# Build the message extractor. This run uses the Cassandra backend
# (the original note mentions JSON as the alternative source).
cassandra_settings = {
    'cluster_ips': ['54.175.189.47'],
    'session_keyspace': 'test_keyspace',
    'table_name': 'awaybot_messages',
}
casdb = xt.CassandraExtractor(**cassandra_settings)

In [3]:
# Ask the extractor for the message stream of the 'general' channel,
# using the 'hour' query type.
message_stream = casdb.get_messages(
    channel='general',
    type_of_query='hour',
)

In [4]:
# Display whatever corpora list_corpora() reports (shown as the cell output).
list_corpora()

['glove.6B.300d.txt']

In [5]:
%%time
# Initialize the GloVe representation
glove_rep = GloVe('glove.6B.300d.txt')

CPU times: user 32 s, sys: 1.17 s, total: 33.2 s
Wall time: 33.5 s


In [6]:
# Window object on which the topics will be stored; its size is set to 10.
window_size = 10
topic_window = Window(window_size=window_size)

In [7]:
# Similar-topic calculator: GloVe representation, cosine similarity and a
# fresh message tokenizer.
message_tokenizer = MessageTokenizer()
simtop = SimilarTopicCalculator(
    tokenizer=message_tokenizer,
    similarity=cosine,
    representation=glove_rep,
)

# Message classifier wired to the topic window and the calculator built above,
# with a similarity threshold of 0.8.
classifier = MessageClassifier(
    similar_topic_calculator=simtop,
    similarity_threshold=0.8,
    window=topic_window,
)

In [8]:
%%time
# Classify the messages as we obtain them from the stream (cell is timed).
classifier.classify_stream(message_stream=message_stream)

  ... Finished classifying 174 messages
CPU times: user 1.06 s, sys: 76 ms, total: 1.14 s
Wall time: 1.19 s


In [11]:
# Number of topics currently held in the classifier's window.
len(classifier.window.topics)

10

In [12]:
# Report the size (len) of every topic held in the classifier's window.
# print(...) with a single parenthesised argument produces the same output on
# Python 2 and Python 3, whereas the bare print statement is a SyntaxError on 3.
for i, topic in enumerate(classifier.window.topics):
    print('Topic #{} --> size: {}'.format(i, len(topic)))

Topic #0 --> size: 1
Topic #1 --> size: 1
Topic #2 --> size: 1
Topic #3 --> size: 3
Topic #4 --> size: 1
Topic #5 --> size: 2
Topic #6 --> size: 2
Topic #7 --> size: 25
Topic #8 --> size: 1
Topic #9 --> size: 92


In [15]:
# Print the text of each message in the topic at index 3, followed by a blank
# line. The single-argument print(...) form runs unchanged on Python 2 and 3;
# the bare print statement does not parse on Python 3.
for msg in classifier.window.topics[3].messages:
    print(msg.text + '\n')

for navigating through callbacks and the call stack in notebooks `%pdb on` is you best friend. Suuuper helpful, specially when trying to debug `DATA.my_column.apply( WHICHEVER_FUNCTION )`

This is the code that connects to kafka, consumes messages and inserts them into cassandra.

This is the code that connects to kafka, consumes messages and inserts them into cassandra.



In [None]:
# Visualize the stream