_The below cell will expand the notebook width to the (almost - 95%) full width of the browser_

In [1]:
%%javascript
document.getElementById('notebook-container').style.width = '95%'

<IPython.core.display.Javascript object>

In [2]:
# Add the path to the slack-pack/code/ folder in order to be able to import nlp module
import sys, os

NLP_PATH = '/'.join(os.path.abspath('.').split('/')[:-1]) + '/'
sys.path.append(NLP_PATH)

In [8]:
%matplotlib inline
from nlp.text import extractor as xt

from nlp.geometry import repr as gr
from nlp.geometry import dist as gd
from nlp.grammar import tokenizer as gt
from nlp.text import window as gw

from nlp.models import similarity_calculation as gsc
from nlp.models import message_classification as gmc

### Different representations

We need to load the different representations (we will use `nlp.geometry.repr.GloVe` class) 

In [2]:
gr.list_corpora()

['glove.6B.100d.txt', 'glove.6B.300d.txt']

In [3]:
%%time
# Initialize the GloVe representation
glove100_rep = gr.GloVe('glove.6B.100d.txt')

CPU times: user 12.6 s, sys: 514 ms, total: 13.1 s
Wall time: 13.5 s


In [4]:
%%time
# Initialize the GloVe representation
glove300_rep = gr.GloVe('glove.6B.300d.txt')

CPU times: user 32.3 s, sys: 1.35 s, total: 33.6 s
Wall time: 34.2 s


### Distance function

The following function defines a distance between to texts (it first cleans them using `nlp.grammar.tokenizer.SimpleCleaner`)

In [9]:
clean = gt.SimpleCleaner()

def dist_m2m(m1, m2):
    # tokenize
    text1 = clean(m1.lower())
    text2 = clean(m2.lower())

    # get geometric representation
    rep1 = glove100_rep(text1)
    rep2 = glove100_rep(text2)
    
    return gd.cosine(rep1, rep2)

### Auxiliary functions for inspecting the outputted window (topic list)

With `inspect_window` we get a list of the topics and the #messages in each

With `print_topic` we get all the messages in the given topic, along with the reason why they were added

In [10]:
def inspect_window(window):
    print( 'Window has #{} topics\n'.format( len(window) ) )
    
    print( 'Topic length report:' )
    for i, tpc in enumerate(window):
        print( '  Topic #{:>2}  --> size: {:<3}'.format(i, len(tpc)) )

def print_topic(topic):
    for i,(m,r) in enumerate(topic):
        print '{} -- {}\n\t{}\n\n'.format(i,r,m.text)

## Simple Classifier

The main classifying function

In [11]:
def classify_stream(message_stream, max_messages=20, low_threshold=.7, high_threshold=.85, low_step=.05, high_step=.02, verbose=True):
    topics = []
    for m, msg in enumerate(message_stream):
        if m > max_messages:
            break

        if verbose:
            print '#{:>3}\033[33m ==> {}\033[0m'.format(m, msg.text)

        if len(topics) == 0:
            topics.insert(0, [(msg, 'First message')] )
            if verbose:
                print '\t First message (new 0)\n'

        else:
            # We will sequentially try to append to each topic ...
            #    as time goes by it is harder to append to a topic

            low_th = low_threshold
            high_th = high_threshold
            topic_scores = []  # in case no topic is close

            for t in xrange(len(topics)):
                tp_len = len(topics[t])
                distances = map(lambda x: dist_m2m(msg.text, x[0].text), topics[t])

                # Assign a non-linear score (very close messages score higher)
                score = sum([ 0 if d < low_th else 1 if d < high_th else 3 for d in distances ])

                # Very large topics (> 10) should be harder to append to,
                #   since the odds of a casual match are higher
                if (tp_len < 3) and (score > 0):
                    reason = 'len({}) < 3 and distances({})'.format(tp_len, distances)
                    _topic = topics.pop(t)  # pop from topic queue
                    _topic.append( (msg, reason) )
                    topics.insert(0, _topic)  # append to first topic
                    if verbose:
                        print '\t inserted to #{} : {}\n'.format(t, reason)
                    break

                elif (tp_len < 10) and (score > 10):
                    reason = 'len({}) < 10 and distances({})'.format(tp_len, distances)
                    _topic = topics.pop(t)  # pop from topic queue
                    _topic.append( (msg, 'len({}) < 10 and distances({})'.format(tp_len, distances)) )
                    topics.insert(0, _topic)  # append to first topic
                    if verbose:
                        print '\t inserted to #{} : {}\n'.format(t, reason)
                    break

                elif (tp_len > 10) and (score > tp_len*1.5):
                    reason = 'len({}) > 10 and distances({})'.format(tp_len, distances)
                    _topic = topics.pop(t)  # pop from topic queue
                    _topic.append( (msg, 'len({}) > 10 and distances({})'.format(tp_len, distances)) )
                    topics.insert(0, _topic)  # append to first topic
                    if verbose:
                        print '\t inserted to #{} : {}\n'.format(t, reason)
                    break

                topic_scores.append( (tp_len,score) )  # append score to topic_scores

                # else try with next topic --> harder
                low_th += low_step if low_th+low_step < high_th else high_step
                high_th += high_step
            else:
                # If no topic was suitable --> Start new topic
                topics.insert(0, [(msg, 'No similar topics (to 0) scores:({})'.format(topic_scores))] )
                if verbose:
                    print '\t No similar topics (new 0) scores:({})\n'.format(topic_scores)

    print '... Done, processed {} messages'.format(m)
    return topics

Let's try it out...

In [12]:
# Initialize the extractor (JSON or Cassandra)
awwdb = xt.CassandraExtractor(cluster_ips=['54.175.189.47'],
                              session_keyspace='test_keyspace',
                              table_name='awaybot_messages')

In [None]:
# Need to call .get_messages each time, because if not the message_stream will have "dried out"
msg_stream = awwdb.get_messages(type_of_query='hour', channel='general')

window_us = classify_stream(msg_stream, max_messages=10)

In [59]:
message_stream = casdb.get_messages(type_of_query='hour', channel='random', min_words=5)

window_10 = classify_stream(message_stream, max_messages=10)

#  0[33m ==> Good news is that i found a more straightforward way, and that it is better performing. Double win.[0m
	 First message (new 0)

#  1[33m ==> <@U16S9N0LE|nvisal> has joined the channel[0m
	 No similar topics (new 0) scores:([(1, 0)])

#  2[33m ==> hit that shit with a %dopar%[0m
	 No similar topics (new 0) scores:([(1, 0), (1, 0)])

#  3[33m ==> <@U16S1V6LX|kristineeck> has joined the channel[0m
	 inserted to #1 : len(1) < 3 and distances([1.0])

#  4[33m ==> <@U16RAECF5|micah.gr> has joined the channel[0m
	 inserted to #0 : len(2) < 3 and distances([1.0, 1.0])

#  5[33m ==> I love me some lapply()[0m
	 No similar topics (new 0) scores:([(3, 0), (1, 0), (1, 0)])

#  6[33m ==> <@U16TY5M6F|cjfariss> has joined the channel[0m
	 No similar topics (new 0) scores:([(1, 0), (3, 9), (1, 0), (1, 0)])

#  7[33m ==> <@U16RLTH3N|alex> has joined the channel[0m
	 inserted to #0 : len(1) < 3 and distances([1.0])

#  8[33m ==> <@U16RY7PR6|cdcrabtree> has joined the channe

In [60]:
inspect_window(window_10)

Window has #7 topics

Topic length report:
  Topic # 0  --> size: 1  
  Topic # 1  --> size: 1  
  Topic # 2  --> size: 3  
  Topic # 3  --> size: 1  
  Topic # 4  --> size: 3  
  Topic # 5  --> size: 1  
  Topic # 6  --> size: 1  


In [63]:
print_topic(window_10[4])

0 -- No similar topics (to 0) scores:([(1, 0)])
	<@U16S9N0LE|nvisal> has joined the channel


1 -- len(1) < 3 and distances([1.0])
	<@U16S1V6LX|kristineeck> has joined the channel


2 -- len(2) < 3 and distances([1.0, 1.0])
	<@U16RAECF5|micah.gr> has joined the channel




In [71]:
%%time
message_stream = casdb.get_messages(type_of_query='hour', channel='general', min_words=5)

full_window = classify_stream(message_stream, max_messages=10000, verbose=False)

... Done, processed 119 messages
... Done, processed 119 messages
... Done, processed 119 messages
... Done, processed 119 messages
1 loop, best of 3: 487 ms per loop


In [70]:
inspect_window(full_window)

Window has #60 topics

Topic length report:
  Topic # 0  --> size: 2  
  Topic # 1  --> size: 1  
  Topic # 2  --> size: 1  
  Topic # 3  --> size: 2  
  Topic # 4  --> size: 3  
  Topic # 5  --> size: 1  
  Topic # 6  --> size: 1  
  Topic # 7  --> size: 2  
  Topic # 8  --> size: 3  
  Topic # 9  --> size: 3  
  Topic #10  --> size: 1  
  Topic #11  --> size: 3  
  Topic #12  --> size: 1  
  Topic #13  --> size: 3  
  Topic #14  --> size: 3  
  Topic #15  --> size: 3  
  Topic #16  --> size: 1  
  Topic #17  --> size: 1  
  Topic #18  --> size: 3  
  Topic #19  --> size: 1  
  Topic #20  --> size: 1  
  Topic #21  --> size: 1  
  Topic #22  --> size: 3  
  Topic #23  --> size: 1  
  Topic #24  --> size: 1  
  Topic #25  --> size: 3  
  Topic #26  --> size: 3  
  Topic #27  --> size: 2  
  Topic #28  --> size: 3  
  Topic #29  --> size: 1  
  Topic #30  --> size: 3  
  Topic #31  --> size: 1  
  Topic #32  --> size: 1  
  Topic #33  --> size: 3  
  Topic #34  --> size: 3  
  Topic #35

In [51]:
message_stream = casdb.get_messages(type_of_query='hour', channel='general', min_words=5)

window_20 = classify_stream(message_stream, max_messages=20)

#  0[33m ==> Its a pretty simple recommender -- seems to just key on most descriptive word. This is a little tough, because while there are a lot of nerds who rate beer, there are a lot of bros who know like 4 beer words: dank, fruit, sour, ... . And, this sort of mainlines the recommendations. I think a little filtering would help clean this up.[0m
	 First message (to 0)

#  1[33m ==> Two syllabi that my be of interest: Matt Blackwell: <http://www.mattblackwell.org/files/teaching/gov2002-15f-syllabus.pdf> and Danny Hidalgo: <http://www.mit.edu/~dhidalgo/syllabi/17_802_syll2014.pdf>.  Alex pointed out that Hidalgo is more interested in observational data. Nice bibliography between the two.[0m
	 inserted to #0 : len(1) < 3 and distances([0.5085626196861307])

#  2[33m ==> Does anybody have an in either at Pew or the NCSL? I wonder because theyve published a _Stateline_ series, which at least has the ethnicity data that Micah and Charles were thinking about. Ive been warned that if 

In [52]:
inspect_window(window_20)

Window has #10 topics

Topic length report:
  Topic # 0  --> size: 1  
  Topic # 1  --> size: 2  
  Topic # 2  --> size: 3  
  Topic # 3  --> size: 3  
  Topic # 4  --> size: 1  
  Topic # 5  --> size: 3  
  Topic # 6  --> size: 1  
  Topic # 7  --> size: 1  
  Topic # 8  --> size: 3  
  Topic # 9  --> size: 3  


### Message-to-topic distance

When we measure the distance of a message to a topic we are doing som clustering...

### Single Runner

In [116]:
# Initialize Window on which we'll store the topics
topic_window = gw.Window(window_size=100)

In [117]:
# Initialize the similar topic calculator
simtop = gsc.SimilarTopicCalculator(representation=glove100_rep,
                                    similarity=gd.cosine,
                                    tokenizer=gt.SimpleCleaner())

In [118]:
# Initialize the message classifier
classifier = gmc.MessageClassifier(window=topic_window,
                                   similarity_threshold=0.4,
                                   similar_topic_calculator=simtop,
                                   reply_analysis=False)

In [119]:
# Classify the message as we obtain them from the stream
message_stream = casdb.get_messages(type_of_query='hour', channel='general')
classifier.classify_stream(message_stream=message_stream)

  ... Finished classifying 120 messages


### Examining Results

In [112]:
def inspect_window(window):
    print( 'Window has #{} topics\n'.format( len(window) ) )
    
    print( 'Topic length report:' )
    for i, tpc in enumerate(window.topics):
        print( '  Topic #{:>2}  --> size: {:<3}'.format(i, len(tpc)) )

def print_topic(topic):
    for i,m,r in zip(range(len(topic)), topic.messages, topic.reasons):
        print '{} -- {}\n\t{}\n\n'.format(i,r,m.text)

In [120]:
inspect_window(topic_window)

Window has #3 topics

Topic length report:
  Topic # 0  --> size: 2  
  Topic # 1  --> size: 7  
  Topic # 2  --> size: 111
