In [1]:
# For our internal toolbox imports
import logging
import logging.handlers
import os
import sys
import time

import pendulum as pd
# Find the 'slack-pack' directory name inside the current working directory
# and put its code/ folder on sys.path. str.index raises ValueError when the
# notebook is started from somewhere outside a 'slack-pack' checkout.
repo_marker = 'slack-pack'
path_to_here = os.path.abspath('.')
repo_root_end = path_to_here.index(repo_marker) + len(repo_marker)
NLP_PATH = path_to_here[:repo_root_end] + '/code/'
sys.path.append(NLP_PATH)


from nlp.text import extractor as xt
from nlp.models.message_classification import SimpleClassifier
from nlp.utils.model_output_management import OutputHelper
from nlp.models.similarity_calculation import MessageSimilarity
from nlp.models.summarization import TFIDF as Model
from nlp.grammar import tokenizer as nt
from nlp.viz.cloud import Wordcloud

# Configure the notebook logger: DEBUG-level records go both to the console
# and to a time-rotated log file, sharing one timestamped format.
logger = logging.getLogger('MIDS_FE2016S_log')
logger.setLevel(logging.DEBUG)
LOGFILE = 'log/MIDS_FE2016S_log'
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# The file handler opens LOGFILE immediately, which fails when its directory
# does not exist yet, so create it first (relative to the working directory).
log_dir = os.path.dirname(LOGFILE)
if log_dir and not os.path.isdir(log_dir):
    os.makedirs(log_dir)

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would stack additional handlers and emit every record
# several times. Detach and close whatever an earlier run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# create console handler, set level of logging and add formatter
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

# create file handler, set level of logging and add formatter
# (when='M', interval=1 rolls the log file over every minute)
fh = logging.handlers.TimedRotatingFileHandler(LOGFILE, when='M', interval=1)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

logger.addHandler(ch)
logger.addHandler(fh)

# Connection settings for the Cassandra extractor, gathered in one place so
# they are easy to spot and change.
cassandra_settings = {
    'cluster_ips': ['54.175.189.47'],
    'session_keyspace': 'test_keyspace',
    'table_name': 'awaybot_messages',
}
casdb = xt.CassandraExtractor(**cassandra_settings)

In [4]:
# Set up a query that gets about 100 messages from #general
# Rolling bounds relative to "now" in US/Eastern: 21 and 25 weeks back.
tz = pd.timezone('US/Eastern')
week14 = pd.now(tz).subtract(weeks=21).timetuple()
week15 = pd.now(tz).subtract(weeks=25).timetuple()

# Fixed epoch-second bounds that the query below actually interpolates.
# NOTE(review): these look like a frozen snapshot of the rolling bounds
# above, kept so the result set stays stable -- confirm.
week15_ts, week14_ts = 1466470136.0, 1468889336.0

# Show the lower bound first, then the upper bound.
for bound in (week15_ts, week14_ts):
    print(bound)

# CQL text selecting #general rows whose ts lies strictly between the two
# bounds; the bounds are interpolated as quoted values.
query_template = (
    "SELECT * FROM fe_s16_messages WHERE"
    " ts > '{}' AND ts < '{}' "
    "AND CHANNEL = 'general' ALLOW FILTERING"
)
mids_fe2016s_filter = query_template.format(week15_ts, week14_ts)

# Register the query under the name the later get_messages calls use.
casdb.add_query("mids_fe2016s_filter", mids_fe2016s_filter)

# Show the first message and count the whole result set in a single pass.
# One query serves both purposes; issuing it a second time just to count
# would double the round trips to Cassandra for no extra information.
rows = casdb.get_messages(
    "mids_fe2016s_filter")

c = 0  # stays 0 when the query returns no rows
for c, row in enumerate(rows, 1):
    if c == 1:
        # Double parentheses: the fields are printed as one tuple whether
        # print is a statement (Python 2) or a function (Python 3).
        print((row.id, row.text, row.author, row.team, row.url, row.timestamp))
print(c)

1466470136.0
1468889336.0
(u'1466471391.000083', u'are you feeling better, <@U15T45XQS> ?', u'U17D030P3', u'MIDSFE2016S', u'https://midsfe2016s.slack.com/archives/general/p1466471391000083', <Pendulum [2016-06-21T01:09:51.000083+00:00]>)
93


In [3]:
# Run the model on that query and save the output vizualizations locally
# Asset locations inside the code/ tree (NLP_PATH already ends with '/').
FONT_PATH = NLP_PATH + 'nlp/data/font/Ranga-Regular.ttf'
IMG_FOLDER = NLP_PATH + 'nlp/data/img/'

# Similarity model, a fresh message stream, and the classifier built on top
# of that similarity model.
msg_sim = MessageSimilarity()
msg_stream = casdb.get_messages("mids_fe2016s_filter")
classifier = SimpleClassifier(message_similarity=msg_sim)

# Tuning knobs for classify_stream, named here so they can be read at a glance.
classify_options = dict(
    low_threshold=.4,
    high_threshold=.7,
    low_step=.05,
    high_step=.02,
    max_messages=10000,
    verbose=False,
)
classified_window = classifier.classify_stream(msg_stream, **classify_options)
image_loader = OutputHelper()

# Summarization model over the classified window, using n_grams=2.
uni_model = Model(
    window=classified_window,
    cleaner=nt.SimpleCleaner(),
    n_grams=2,
)
# Render and save one word cloud per sufficiently large topic.
# viz_topics counts the clouds generated so far and numbers the saved files.
viz_topics = 0
for t, topic in enumerate(classified_window):  # one(?) per topic
    # Topics with fewer than 3 messages are skipped.
    if len(topic) < 3:
        continue

    # Generate the viz out of the model
    try:
        viz = Wordcloud(model=uni_model, document_id=t, max_words=(10, 5), font=FONT_PATH, multi_plot=True)
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
        # and SystemExit still stop the run; log which topic failed and move on.
        logger.warning("Failed to generate word cloud for topic {}".format(t), exc_info=True)
        continue

    viz_topics += 1
    logger.info('topic {} for {} duration {} hour(s) has length {}'.format(t, 'general', 0, len(topic)))
    viz_path = IMG_FOLDER + 'FE2016S_{}_{}_{}_{}.png'.format('general', 'testing', 0, viz_topics)
    viz.save_png(viz_path, title='Topic {}'.format(viz_topics))
    logger.info('saved {}'.format(viz_path))

 -- Loading GloVe, this might take a few (10~30) seconds... -- 



2016-12-12 20:49:23,644 - INFO - topic 0 for general duration 0 hour(s) has length 11


... Done, processed 92 messages


2016-12-12 20:49:23,852 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_1.png
2016-12-12 20:49:23,972 - INFO - topic 1 for general duration 0 hour(s) has length 14
2016-12-12 20:49:24,101 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_2.png
2016-12-12 20:49:24,195 - INFO - topic 2 for general duration 0 hour(s) has length 15
2016-12-12 20:49:24,403 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_3.png
2016-12-12 20:49:24,546 - INFO - topic 3 for general duration 0 hour(s) has length 10
2016-12-12 20:49:24,720 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_4.png
2016-12-12 20:49:24,801 - INFO - topic 4 for general duration 0 hour(s) has length 16
2016-12-12 20:49:24,936 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_5.png
2016-12-12 20:49:25,067 - INFO - topic 5 for general duration 0 hour(s) has lengt

## Task One

* Group these messages into topics manually

## Task Two

* Write a summary of each topic that the classifier produces
    
## Task Three
* Without reading the topics, look at each word cloud and write a summary of the topic it represents
    

In [9]:
# Task one pseudo code
rows = casdb.get_messages(
    "mids_fe2016s_filter")

# Flatten every row into a plain tuple; the timestamp is field 5.
sorted_convo = [
    (row.id, row.text, row.author, row.team, row.url, row.timestamp)
    for row in rows
]

# Sort in place by timestamp. list.sort() returns None, so its result must
# not be assigned back to sorted_convo (that would discard the list).
sorted_convo.sort(key=lambda x: x[5])

In [None]:
from operator import itemgetter

# Order the conversation tuples by field 5, the timestamp.
by_timestamp = itemgetter(5)
sorted_convo = sorted(sorted_convo, key=by_timestamp)