In [1]:
# For our internal toolbox imports
import calendar
import logging
import logging.handlers
import os
import sys
import time

import pendulum as pd
# Locate the repository's code directory from the current working directory
# and put it on sys.path so the project-local ``nlp`` package can be imported.
REPO_NAME = 'slack-pack'
path_to_here = os.path.abspath('.')
repo_start = path_to_here.find(REPO_NAME)
if repo_start < 0:
    # Same exception type str.index() raised, but with an actionable message.
    raise ValueError(
        "Run this notebook from inside the '{}' checkout (cwd is {})".format(
            REPO_NAME, path_to_here))
# Keep everything up to and including the repository directory name.
NLP_PATH = path_to_here[:repo_start + len(REPO_NAME)] + '/code/'
# Re-running the cell must not keep growing sys.path.
if NLP_PATH not in sys.path:
    sys.path.append(NLP_PATH)


from nlp.text import extractor as xt
from nlp.models.message_classification import SimpleClassifier
from nlp.utils.model_output_management import OutputHelper
from nlp.models.similarity_calculation import MessageSimilarity
from nlp.models.summarization import TFIDF as Model
from nlp.grammar import tokenizer as nt
from nlp.viz.cloud import Wordcloud

# Notebook-wide logger: records at DEBUG and above are sent both to the
# console and to a time-rotated log file, using one shared format.
logger = logging.getLogger('MIDS_FE2016S_log')
logger.setLevel(logging.DEBUG)
LOGFILE = 'log/MIDS_FE2016S_log'
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# The file handler opens LOGFILE immediately and fails if its directory is
# missing, so create the directory first.
log_dir = os.path.dirname(LOGFILE)
if log_dir and not os.path.isdir(log_dir):
    os.makedirs(log_dir)

# getLogger() returns the same object for the same name, so re-running this
# cell would otherwise stack handlers and emit every record several times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# create console handler, set level of logging and add formatter
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

# create file handler (rolls over every minute), set level of logging and
# add formatter
fh = logging.handlers.TimedRotatingFileHandler(LOGFILE, when='M', interval=1)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

logger.addHandler(ch)
logger.addHandler(fh)

# Connection settings handed to the project's CassandraExtractor.
# NOTE(review): the cluster address is hard-coded; consider moving it to
# configuration or an environment variable.
CASSANDRA_SETTINGS = {
    'cluster_ips': ['54.175.189.47'],
    'session_keyspace': 'test_keyspace',
    'table_name': 'awaybot_messages',
}
casdb = xt.CassandraExtractor(**CASSANDRA_SETTINGS)

In [2]:
# Set up a query that gets about 100 messages from #general
#
# The query window runs from WINDOW_START_WEEKS_AGO to WINDOW_END_WEEKS_AGO
# before "now". The week14/week15 names are kept as they were; they do not
# match the 21/25-week offsets.
WINDOW_END_WEEKS_AGO = 21
WINDOW_START_WEEKS_AGO = 25

tz = pd.timezone('US/Eastern')
# Read the clock once so both bounds are measured from the same instant.
window_anchor = pd.now(tz)
# Convert through UTC: time.mktime() would treat the US/Eastern wall-clock
# tuple as the machine's local time and give a shifted epoch value on any
# host outside that zone.
week14 = window_anchor.subtract(weeks=WINDOW_END_WEEKS_AGO).utctimetuple()
week15 = window_anchor.subtract(weeks=WINDOW_START_WEEKS_AGO).utctimetuple()
# float() keeps the '<seconds>.0' rendering the values had before.
week14_ts = float(calendar.timegm(week14))
week15_ts = float(calendar.timegm(week15))

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(week15_ts)
print(week14_ts)

# CQL selecting the 'general' channel rows whose ts lies strictly between the
# two bounds; the bounds are substituted as quoted literals, older one first.
cql_template = (
    "SELECT * FROM fe_s16_messages WHERE ts > '{}' AND ts < '{}' "
    "AND CHANNEL = 'general' ALLOW FILTERING")
mids_fe2016s_filter = cql_template.format(week15_ts, week14_ts)

# Register the statement under a name so it can be run via get_messages().
casdb.add_query("mids_fe2016s_filter", mids_fe2016s_filter)

# Sanity-check the query in a single pass over the result set: show the first
# row as a tuple and count every row. This replaces running the same query
# twice, once to peek and once to count.
rows = casdb.get_messages("mids_fe2016s_filter")
c = 0  # number of rows seen so far
for row in rows:
    if c == 0:
        # Double parentheses print the tuple itself on Python 2 and Python 3.
        print((row.id, row.text, row.author, row.team, row.url, row.timestamp))
    c += 1
print(c)

1466462193.0
1468881393.0
(u'1466463445.000078', u'So we could replace the `mean(mids) + ( qnorm(0.025) * sd(mids) )` line with `mean(mids) + qnorm(0.025, sd = sd(mids))`, or maybe we were intended to compare the two.', u'U17BYEAF2', u'MIDSFE2016S', u'https://midsfe2016s.slack.com/archives/general/p1466463445000078', <Pendulum [2016-06-20T22:57:25.000078+00:00]>)
94


In [5]:
# Run the classifier on the registered query. The visualizations built from
# its output are saved locally under IMG_FOLDER.
FONT_PATH = NLP_PATH + 'nlp/data/font/Ranga-Regular.ttf'
IMG_FOLDER = NLP_PATH + 'nlp/data/img/'

msg_sim = MessageSimilarity()
msg_stream = casdb.get_messages("mids_fe2016s_filter")
classifier = SimpleClassifier(message_similarity=msg_sim)

# Tuning knobs for classify_stream, gathered in one place.
# NOTE(review): the meaning of the thresholds and steps is defined by
# SimpleClassifier, which is not shown here - confirm before changing them.
classify_options = {
    'low_threshold': .4,
    'high_threshold': .7,
    'low_step': .05,
    'high_step': .02,
    'max_messages': 10000,
    'verbose': False,
}
classified_window = classifier.classify_stream(msg_stream, **classify_options)
image_loader = OutputHelper()  # NOTE(review): not used in this cell

# Build the TFIDF model over the classified window, then save one word-cloud
# PNG for every topic that has at least MIN_TOPIC_MESSAGES entries.
MIN_TOPIC_MESSAGES = 3

uni_model = Model(window=classified_window, cleaner=nt.SimpleCleaner(), n_grams=2)
viz_topics = 0  # count of word clouds generated so far; also numbers the files
for t, topic in enumerate(classified_window):
    if len(topic) < MIN_TOPIC_MESSAGES:
        continue
    # Generate the viz out of the model
    try:
        viz = Wordcloud(model=uni_model, document_id=t, max_words=(10, 5), font=FONT_PATH, multi_plot=True)
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
        # still stops the cell; name the topic so the failure can be traced.
        logger.warning("Failed to generate word cloud for topic %s", t, exc_info=True)
        continue
    viz_topics += 1
    logger.info('topic {} for {} duration {} hour(s) has length {}'.format(t, 'general', 0, len(topic)))
    viz_path = IMG_FOLDER + 'FE2016S_{}_{}_{}_{}.png'.format('general', 'testing', 0, viz_topics)
    viz.save_png(viz_path, title='Topic {}'.format(viz_topics))
    logger.info('saved {}'.format(viz_path))

 -- Loading GloVe, this might take a few (10~30) seconds... -- 



2016-12-12 18:40:03,953 - INFO - topic 0 for general duration 0 hour(s) has length 11
2016-12-12 18:40:04,045 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_1.png


... Done, processed 93 messages


2016-12-12 18:40:04,119 - INFO - topic 1 for general duration 0 hour(s) has length 14
2016-12-12 18:40:04,220 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_2.png
2016-12-12 18:40:04,306 - INFO - topic 2 for general duration 0 hour(s) has length 15
2016-12-12 18:40:04,404 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_3.png
2016-12-12 18:40:04,486 - INFO - topic 3 for general duration 0 hour(s) has length 10
2016-12-12 18:40:04,580 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_4.png
2016-12-12 18:40:04,659 - INFO - topic 4 for general duration 0 hour(s) has length 16
2016-12-12 18:40:04,751 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_5.png
2016-12-12 18:40:04,833 - INFO - topic 5 for general duration 0 hour(s) has length 17
2016-12-12 18:40:04,933 - INFO - saved /home/kjydavis/slack-pack/code/nlp/data/img/FE2016S_general_testing_0_6