_The below cell will expand the notebook width to the (almost - 95%) full width of the browser_

In [1]:
%%javascript
document.getElementById('notebook-container').style.width = '95%'

<IPython.core.display.Javascript object>

In [2]:
# Add the path to the slack-pack/code/ folder in order to be able to import nlp module
import sys, os

NLP_PATH = '/'.join(os.path.abspath('.').split('/')[:-1]) + '/'
sys.path.append(NLP_PATH)

In [3]:
%matplotlib inline
from nlp.text import extractor as xt

from nlp.geometry import repr as gr
from nlp.geometry import dist as gd
from nlp.grammar import tokenizer as gt
from nlp.text import window as gw

from nlp.models import similarity_calculation as gsc
from nlp.models import message_classification as gmc

### Different representations

We need to load the different representations (we will use `nlp.geometry.repr.GloVe` class) 

In [4]:
gr.list_corpora()

['glove.6B.100d.txt', 'glove.6B.300d.txt']

In [5]:
%%time
# Initialize the GloVe representation
glove100_rep = gr.GloVe('glove.6B.100d.txt')

CPU times: user 12.6 s, sys: 524 ms, total: 13.1 s
Wall time: 13.6 s


In [6]:
%%time
# Initialize the GloVe representation
glove300_rep = gr.GloVe('glove.6B.300d.txt')

CPU times: user 32.9 s, sys: 1.28 s, total: 34.2 s
Wall time: 34.9 s


### Distance function

The following function defines a distance between to texts (it first cleans them using `nlp.grammar.tokenizer.SimpleCleaner`)

In [7]:
clean = gt.SimpleCleaner()

def dist_m2m(m1, m2):
    # tokenize
    text1 = clean(m1.lower())
    text2 = clean(m2.lower())

    # get geometric representation
    rep1 = glove100_rep(text1)
    rep2 = glove100_rep(text2)
    
    return gd.cosine(rep1, rep2)

def dist_m2m_300(m1, m2):
    # tokenize
    text1 = clean(m1.lower())
    text2 = clean(m2.lower())

    # get geometric representation
    rep1 = glove300_rep(text1)
    rep2 = glove300_rep(text2)
    
    return gd.cosine(rep1, rep2)

### Auxiliary functions for inspecting the outputted window (topic list)

With `inspect_window` we get a list of the topics and the #messages in each

With `print_topic` we get all the messages in the given topic, along with the reason why they were added

In [8]:
def inspect_window(window):
    print( 'Window has #{} topics\n'.format( len(window) ) )
    
    print( 'Topic length report:' )
    for i, tpc in enumerate(window):
        print( '  Topic #{:>2}  --> size: {:<3}'.format(i, len(tpc)) )

def print_topic(topic):
    for i,(m,r) in enumerate(topic):
        print '{} -- {}\n\t\033[33m{}\033[0m\n\n'.format(i,r,m.text)

## Simple Classifier

The main classifying function

In [9]:
def classify_stream(message_stream, distance=dist_m2m, max_messages=20,
                    low_threshold=.4, high_threshold=.7, low_step=.05, high_step=.02, verbose=True):
    topics = []
    for m, msg in enumerate(message_stream):
        if m > max_messages:
            m -= 1
            break

        if verbose:
            print '#{:>3}\033[33m ==> {}\033[0m'.format(m, msg.text.encode('ascii', 'ignore'))

        if len(topics) == 0:
            topics.insert(0, [(msg, 'First message')] )
            if verbose:
                print '\t First message (new 0)\n'

        else:
            # We will sequentially try to append to each topic ...
            #    as time goes by it is harder to append to a topic

            low_th = low_threshold
            high_th = high_threshold
            topic_scores = []  # in case no topic is close

            for t in xrange(len(topics)):
                tp_len = len(topics[t])
                distances = map(lambda x: distance(msg.text, x[0].text), topics[t])

                # Assign a non-linear score (very close messages score higher)
                score = sum([ 0 if d < low_th else 1 if d < high_th else 3 for d in distances ])

                # Very large topics (> 10) should be harder to append to,
                #   since the odds of a casual match are higher
                if (tp_len < 3):
                    if (score > 0):
                        reason = 'len({}) < 3 and distances({})'.format(tp_len, distances)
                        _topic = topics.pop(t)  # pop from topic queue
                        _topic.append( (msg, reason) )
                        topics.insert(0, _topic)  # append to first topic
                        if verbose:
                            print '\t inserted to #{} : {}\n'.format(t, reason)
                        break

                elif (tp_len < 10):
                    if (score > (tp_len - (2 - tp_len/15.) )):
                        reason = 'len({}) < 10 and distances({})'.format(tp_len, distances)
                        _topic = topics.pop(t)  # pop from topic queue
                        _topic.append( (msg, 'len({}) < 10 and distances({})'.format(tp_len, distances)) )
                        topics.insert(0, _topic)  # append to first topic
                        if verbose:
                            print '\t inserted to #{} : {}\n'.format(t, reason)
                        break

                elif (tp_len > 10):
                    if (score > tp_len*1.5):
                        reason = 'len({}) > 10 and distances({})'.format(tp_len, distances)
                        _topic = topics.pop(t)  # pop from topic queue
                        _topic.append( (msg, 'len({}) > 10 and distances({})'.format(tp_len, distances)) )
                        topics.insert(0, _topic)  # append to first topic
                        if verbose:
                            print '\t inserted to #{} : {}\n'.format(t, reason)
                        break

                topic_scores.append( (tp_len,score) )  # append score to topic_scores

                # else try with next topic --> harder
                low_th += low_step if low_th+low_step < high_th else high_step
                high_th += high_step
            else:
                # If no topic was suitable --> Start new topic
                topics.insert(0, [(msg, 'No similar topics (to 0) scores:({})'.format(topic_scores))] )
                if verbose:
                    print '\t No similar topics (new 0) scores:({})\n'.format(topic_scores)

    print '... Done, processed {} messages'.format(m)
    return topics

Let's try it out...

In [10]:
# Initialize the extractor (JSON or Cassandra)
awwdb = xt.CassandraExtractor(cluster_ips=['54.175.189.47'],
                              session_keyspace='test_keyspace',
                              table_name='awaybot_messages')

In [11]:
awwdb.list_channels()

{u'architecture',
 u'bot-sandbox',
 u'class-deliverables',
 u'code-documentation',
 u'data',
 u'general',
 u'github-repo',
 u'name-selection',
 u'next-meeting',
 u'nlp-methodology',
 u'tech-stuff'}

In [12]:
# Need to call .get_messages each time, because if not the message_stream will have "dried out"
msg_stream = awwdb.get_messages(type_of_query='day', periods=5, channel='tech-stuff', min_words=5)

window_us = classify_stream(msg_stream, distance=dist_m2m, low_threshold=.4, high_threshold=.7, low_step=.05, high_step=.02, max_messages=30)

#  0[33m ==> are you using outgoing webhooks for pushing the messages out?[0m
	 First message (new 0)

#  1[33m ==> No, the tutorial is using a websocket connection[0m
	 No similar topics (new 0) scores:([(1, 0)])

#  2[33m ==> which I'm not really familiar with[0m
	 No similar topics (new 0) scores:([(1, 0), (1, 0)])

#  3[33m ==> I'm reading up on NLP meanwhile[0m
	 inserted to #0 : len(1) < 3 and distances([0.55818406822222211])

#  4[33m ==> :slightly_smiling_face: this is really cool stuff[0m
	 No similar topics (new 0) scores:([(2, 0), (1, 0), (1, 0)])

#  5[33m ==> Nice. Hopefully NLP class will give you some good resources as well.[0m
	 inserted to #0 : len(1) < 3 and distances([0.54949126842986729])

#  6[33m ==> just googling around I found this. I don't really understand it, but thought I'd share.[0m
	 inserted to #0 : len(2) < 3 and distances([0.4767096159607096, 0.6559426160817835])

#  7[33m ==> which may or may not be related to this project, which I also 

In [13]:
inspect_window(window_us)

Window has #7 topics

Topic length report:
  Topic # 0  --> size: 10 
  Topic # 1  --> size: 6  
  Topic # 2  --> size: 1  
  Topic # 3  --> size: 10 
  Topic # 4  --> size: 2  
  Topic # 5  --> size: 1  
  Topic # 6  --> size: 1  


In [14]:
print_topic(window_us[3])

0 -- No similar topics (to 0) scores:([(5, 3), (2, 0), (1, 0), (1, 0)])
	[33magreed <@U2C9M9GP5> from the reading I have done I am siding with the RTM API[0m


1 -- len(1) < 3 and distances([0.51486277451014895])
	[33m`open question`: what is a partition of a log for kafka? is this similar to the partitioning that happens in the MapReduce framework?[0m


2 -- len(2) < 3 and distances([0.63210357154046548, 0.73554032578735018])
	[33m```Log Aggregation

Many people use Kafka as a replacement for a log aggregation solution. Log aggregation typically collects physical log files off servers and puts them in a central place (a file server or HDFS perhaps) for processing. Kafka abstracts away the details of files and gives a cleaner abstraction of log or event data as a stream of messages. This allows for lower-latency processing and easier support for multiple data sources and distributed data consumption. In comparison to log-centric systems like Scribe or Flume, Kafka offers equally g

### Convert into a `nlp.text.window.Window`

In [15]:
from nlp.text import topic as gt
from nlp.text import window as gw

The old _topic list_ (let's check we get the same results after converting it into a window...)

In [16]:
inspect_window(window_us)

Window has #7 topics

Topic length report:
  Topic # 0  --> size: 10 
  Topic # 1  --> size: 6  
  Topic # 2  --> size: 1  
  Topic # 3  --> size: 10 
  Topic # 4  --> size: 2  
  Topic # 5  --> size: 1  
  Topic # 6  --> size: 1  


In [17]:
real_window = gw.from_topic_list(window_us)

In [18]:
real_window.report_topics()

Window has #7 topics

Topic length report:
  Topic # 0  --> size: 10 
  Topic # 1  --> size: 6  
  Topic # 2  --> size: 1  
  Topic # 3  --> size: 10 
  Topic # 4  --> size: 2  
  Topic # 5  --> size: 1  
  Topic # 6  --> size: 1  


In [19]:
real_window.topics[1].report_messages()

0 -- No similar topics (to 0) scores:([(6, 4), (1, 0), (10, 9), (2, 0), (1, 0), (1, 0)])
	[33mAgain... My understanding here...  But serialization for me just means standardizing into an object that will be readable and has follows some (usually java) format[0m


1 -- len(1) < 3 and distances([0.51307060858976083])
	[33mokay so JSON is a serialization[0m


2 -- len(2) < 3 and distances([0.46802098268440662, 0.32118337847136891])
	[33mBasically, pickle is a serializer for python objects [0m


3 -- len(3) < 10 and distances([0.58456109949091772, 0.16949531381232907, 0.40362016822657015])
	[33mas well as many other well known formats[0m


4 -- len(4) < 10 and distances([0.7335960110648202, 0.42799379927020431, 0.35556377717969201, 0.66759583196660033])
	[33mThat is a question I have... But I would say JSON can serve as the serialized final format[0m


5 -- len(5) < 10 and distances([0.71273707284402976, 0.25130027498114171, 0.51289860357272021, 0.65988093594073338, 0.71164461804

### Save the Window in a pickle

In [138]:
import cPickle as pk

In [144]:
with open('../nlp/data/windows/alex_new_config_window.pk', 'wb') as f:
    pk.dump(real_window, f)

And we are done... Now we can use it for visualization purposes outside &#9786;