In [47]:
import sys
sys.path.append('../..')
from pprint import pprint

### Parsing Slack sentences

In [69]:
from segmentation.preprocessing.slack import *
# Parse one raw Slack message body. The resulting dict (shown below) exposes
# the original 'text', a 'sanitized' copy with the mention and URL removed,
# and the extracted 'mentions', 'urls' and 'emojis' lists.
# The apostrophe in "I’ve" is U+2019 (bytes \xe2\x80\x99 in the recorded output).
parse_body('''
@noag: which parrot mini drone?  
I’ve had some success using this with a Parrot Rolling Spider mini drone: 
http://github.com/voodootikigod/node-rolling-spider
''')

{'emojis': [],
 'mentions': ['noag'],
 'sanitized': '\n: which parrot mini drone?  \nI\xe2\x80\x99ve had some success using this with a Parrot Rolling Spider mini drone: \n\n',
 'text': '\n@noag: which parrot mini drone?  \nI\xe2\x80\x99ve had some success using this with a Parrot Rolling Spider mini drone: \nhttp://github.com/voodootikigod/node-rolling-spider\n',
 'urls': ['http://github.com/voodootikigod/node-rolling-spider']}

### Extracting sentence features

In [54]:
# Parse the message body, then run feature extraction on the parsed result.
# The printed dict adds 'concepts', 'entities', 'keywords' and 'taxonomy'
# entries alongside the parse_body fields.
# The indentation inside the triple-quoted string is part of the message text
# (it is visible in the 'sanitized' value), so do not re-indent it.
feat1 = extract_all(parse_body(
                    '''Don't forget it's Valentine's Day tomorrow!! 
                    Put your name down if you wanna meet your hacker valentine this weekend
                    '''))
pprint(feat1)

{'concepts': [{u'dbpedia': u"http://dbpedia.org/resource/Valentine's_Day",
               u'freebase': u'http://rdf.freebase.com/ns/m.018y5m',
               u'opencyc': u'http://sw.opencyc.org/concept/Mx4rvVjgYZwpEbGdrcN5Y29ycA',
               u'relevance': u'0.91136',
               u'text': u"Valentine's Day",
               u'yago': u"http://yago-knowledge.org/resource/Valentine's_Day"}],
 'emojis': [],
 'entities': [],
 'keywords': [{u'relevance': u'0.993352', u'text': u'Valentine'},
              {u'relevance': u'0.880159', u'text': u'hacker'},
              {u'relevance': u'0.809776', u'text': u'weekend'}],
 'mentions': [],
 'sanitized': "Don't forget it's Valentine's Day tomorrow!! \n                    Put your name down if you wanna meet your hacker valentine this weekend\n                    ",
 'taxonomy': [{u'label': u'/shopping/gifts/flowers', u'score': u'0.707805'},
              {u'confident': u'no',
               u'label': u'/society/dating',
               u'score':

In [59]:
# Second sample message, processed the same way as feat1 so the two feature
# dicts can be compared. The trailing newline and indentation inside the
# string literal are part of the message text (see 'sanitized' in the output).
feat2 = extract_all(parse_body(
                    '''I would like to celebrate Valentine's Day with tea!
                    '''))
pprint(feat2)

{'concepts': [{u'dbpedia': u"http://dbpedia.org/resource/Valentine's_Day",
               u'freebase': u'http://rdf.freebase.com/ns/m.018y5m',
               u'opencyc': u'http://sw.opencyc.org/concept/Mx4rvVjgYZwpEbGdrcN5Y29ycA',
               u'relevance': u'0.91136',
               u'text': u"Valentine's Day",
               u'yago': u"http://yago-knowledge.org/resource/Valentine's_Day"}],
 'emojis': [],
 'entities': [],
 'keywords': [{u'relevance': u'0.980859', u'text': u'Valentine'},
              {u'relevance': u'0.869746', u'text': u'tea'}],
 'mentions': [],
 'sanitized': "I would like to celebrate Valentine's Day with tea!\n                    ",
 'taxonomy': [{u'label': u'/food and drink/beverages/non alcoholic beverages/coffee and tea',
               u'score': u'0.737093'},
              {u'confident': u'no',
               u'label': u'/shopping/gifts',
               u'score': u'0.205532'},
              {u'confident': u'no',
               u'label': u'/art and entertainme

In [70]:
# Keyword-based distance measure between the two feature dicts built above.
# NOTE(review): the recorded result (-0.993352) equals the negated relevance of
# feat1's top keyword ('Valentine'); presumably lower means more similar --
# confirm against the keywords_l0 implementation.
keywords_l0(feat1, feat2)

-0.993352

### Clustering

In [68]:
from segmentation.cluster import spectral_clustering
# Three raw message strings to cluster: two about Valentine's Day and one
# about a drone. Whitespace inside the triple-quoted strings is part of the text.
# NOTE(review): this notebook uses Python 2 syntax elsewhere (`print i`); in a
# plain (non-unicode) Python 2 string literal `\u2019` is NOT an escape and
# stays as six literal characters -- confirm that this is the intended text.
messages = [
    '''
    Don't forget it's Valentine's Day tomorrow!! 
                    Put your name down if you wanna meet your hacker valentine this weekend
    ''',
    '''
    I would like to celebrate Valentine's Day with tea!
    ''',
    '''
    which parrot mini drone?  
    I\u2019ve had some success using this with a Parrot Spider mini drone:
    '''
]
# Split the messages into two clusters. The call prints a 3x3 matrix
# (presumably the pairwise affinity matrix -- TODO confirm) and returns one
# integer label per message.
spectral_clustering(messages, num_clusters=2)

[[ 14.63311338   2.70027063   1.        ]
 [  2.66674599   6.36366838   1.        ]
 [  1.           1.           7.86332858]]


array([1, 1, 0], dtype=int32)

# Segmentation algorithm: second iteration

In [78]:
from server.source.slack import start_sync as start_slack_sync, get_messages as get_slack_messages

# Fetch stored Slack messages. The argument 16 is not explained in this
# notebook -- presumably a channel identifier; TODO confirm and name it.
messages_data = get_slack_messages(16)
# The result is indexed by 'channel_name' and 'messages'.
channel_name = messages_data['channel_name']
# Rebinds `messages`, replacing the hand-written list used in the clustering demo.
messages = messages_data['messages']

In [174]:
from segmentation.preprocessing.slack import *
# Run merge_messages_by_sender over the raw Slack messages (presumably it
# collapses consecutive messages from the same sender -- confirm). Each
# resulting entry carries 'emojis', 'sender', 'text' and 'timestamp'.
new_messages = merge_messages_by_sender(messages)

In [175]:
# Display the merged messages for inspection (this prints the whole list).
new_messages

[{'emojis': [],
  'sender': {'first_name': 'Shrey',
   'full_name': 'Shrey Gupta',
   'nickname': 'shreygupta'},
  'text': '\\mentor anyone who can help with NLP segmentation?.',
  'timestamp': '1455422975.001394'},
 {'emojis': [],
  'sender': {'first_name': 'Ashi',
   'full_name': 'Ashi Agrawal',
   'nickname': 'organizer-ashi'},
  'text': ' /mentor will send your question straight to mentors! :-).',
  'timestamp': '1455423013.001395'},
 {'emojis': [],
  'sender': {'first_name': 'Alec',
   'full_name': 'Alec Garcia',
   'nickname': 'alec'},
  'text': u'They don\u2019t want you to have :evergreen_tree:shirts smh.',
  'timestamp': '1455424557.001398'},
 {'emojis': [],
  'sender': {'first_name': 'Raphael',
   'full_name': 'Raphael Palefsky-Smith',
   'nickname': 'organizer-raphael'},
  'text': 'swag bags / shirts at 10:30!!!!.',
  'timestamp': '1455425062.001402'},
 {'emojis': [],
  'sender': {'first_name': 'Claire',
   'full_name': 'Claire Shu',
   'nickname': 'organizer-claire'},
  'te

In [176]:
# Build one feature dict per merged message: start from a copy of the message
# fields, then overlay everything extracted from its text. Extracted keys win
# on collision (e.g. 'emojis').
feats = []
for message in new_messages:
    extracted = extract_all(parse_body(message['text']))
    feat = dict(message)
    feat.update(extracted)
    feats.append(feat)

text_hash 8662566a27a5f461001dab1678227df97371d64aacdb4c519fb7bd8622a0188f
cached call
text_hash 992f1cff84959209d4144e5cd4dfc4c88d1123e1729ca3a413e91749103c6f24
cached call
text_hash 20d92747bb978fae8fc7a85a97e36b6ff955035de311f79284aa08b2c012099b
cached call
text_hash b053a62508d3e4516551daaf1429d7c7794336df97e5750db7b4566c8a743a15
cached call
text_hash aba75513deda5ee22ad0a08e98269d124af539f6b7f52b6ed5d2ae65b15120ef
cached call
text_hash b9752bfd9d0f99ec699439c47853dcf05cdb375862f3c19009935e5dd9f6fd7e
cached call
text_hash 4fe407289cfdc4b01edb142d9f5d47d5179540f8ef6a286fe3dbd5f5e5a88e28
cached call
text_hash 865c8b923283965f3ab7adcfe5acdc4902129668d77b6e949dd6c7f54845b367
cached call
text_hash 1054e38d0bebd715e7eec90cf54b39e7de74605124947bc155c6b739379faf8c
cached call
text_hash d95f38b951d70a9d82afc9287590bb764757f7414621fb0e4ae6f834400219e4
cached call
text_hash 240c2d7a3421fb74387cb8d0c43a0bdb5d47c2e17bac77abc9b1409b586017ec
cached call
text_hash 360357e33fa186ed22df163a932e4589a

In [177]:
# Dump every feature dict, each preceded by its index in `feats`.
# print(i) with a single argument behaves the same on Python 2 and Python 3,
# whereas the statement form `print i` is a syntax error on Python 3.
for i, indexed_feat in enumerate(feats):
    print(i)
    pprint(indexed_feat)

0
{'concepts': [],
 'emojis': [],
 'entities': [],
 'inline-emojis': [],
 'keywords': [{u'relevance': u'0.980977', u'text': u'NLP segmentation'}],
 'mentions': [],
 'sanitized': '\\mentor anyone who can help with NLP segmentation?.',
 'sender': {'first_name': 'Shrey',
            'full_name': 'Shrey Gupta',
            'nickname': 'shreygupta'},
 'taxonomy': [{u'label': u'/business and industrial/advertising and marketing/marketing',
               u'score': u'0.777159'},
              {u'confident': u'no',
               u'label': u'/technology and computing/enterprise technology/data management',
               u'score': u'0.322128'},
              {u'confident': u'no',
               u'label': u'/technology and computing/enterprise technology/customer relationship management',
               u'score': u'0.294169'}],
 'text': '\\mentor anyone who can help with NLP segmentation?.',
 'timestamp': '1455422975.001394',
 'urls': []}
1
{'concepts': [],
 'emojis': [],
 'entities': [],
 'inl

In [206]:
# Inspect the features of message 7 (a channel-wide announcement).
# NOTE(review): the 'sanitized' text in the output still contains
# '<!channel>' -- confirm whether parse_body is expected to strip it.
pprint(feats[7])

{'concepts': [],
 'emojis': [],
 'entities': [],
 'inline-emojis': [],
 'keywords': [{u'relevance': u'0.918308', u'text': u'flight reimbursements'},
              {u'relevance': u'0.782724', u'text': u'*last chance*'},
              {u'relevance': u'0.531477', u'text': u'checks'},
              {u'relevance': u'0.403607', u'text': u'channel'},
              {u'relevance': u'0.393181', u'text': u'money'}],
 'mentions': [],
 'sanitized': '<!channel>: if you missed me for flight reimbursements, i will be giving out more checks from 12-1 tonight! this will probably be your *last chance* to get your money!.',
 'sender': {'first_name': 'Mike', 'full_name': 'Mike Yud', 'nickname': 'mike'},
 'taxonomy': [{u'confident': u'no',
               u'label': u'/business and industrial',
               u'score': u'0.377108'},
              {u'confident': u'no',
               u'label': u'/education/school',
               u'score': u'0.160608'},
              {u'confident': u'no',
               u'labe

In [179]:
# Inspect the features of message 26: a short reply that consists of a mention
# plus "ty"; its 'keywords' and 'taxonomy' come back empty.
pprint(feats[26])

{'concepts': [],
 'emojis': [],
 'entities': [],
 'inline-emojis': [],
 'keywords': [],
 'mentions': ['organizer-vincent'],
 'sanitized': ': ty .',
 'sender': {'first_name': 'Rohan',
            'full_name': 'Rohan Pai',
            'nickname': 'mentor-rohanpai'},
 'taxonomy': [],
 'text': '@organizer-vincent: ty .',
 'timestamp': '1455430931.001462',
 'urls': []}


In [187]:
import segmentation.distance as distance

In [188]:
# Combined distance between two adjacent feature dicts (indices 25 and 26).
# NOTE(review): the recorded value is large and negative (about -1129); the
# sign convention and scale are not documented here -- confirm in
# segmentation.distance.
distance.combined(feats[25], feats[26])

-1128.9999990463257

In [196]:
from segmentation.cluster import adhoc_clustering

In [199]:
# Cluster the raw `messages` list from get_slack_messages (not the merged
# `new_messages`). The call logs 'text_hash' / 'cached call' lines and returns
# a list of non-decreasing integer labels -- presumably one segment id per
# message, in order; TODO confirm.
adhoc_clustering(messages)

text_hash 8662566a27a5f461001dab1678227df97371d64aacdb4c519fb7bd8622a0188f
cached call
text_hash 992f1cff84959209d4144e5cd4dfc4c88d1123e1729ca3a413e91749103c6f24
cached call
text_hash 20d92747bb978fae8fc7a85a97e36b6ff955035de311f79284aa08b2c012099b
cached call
text_hash b053a62508d3e4516551daaf1429d7c7794336df97e5750db7b4566c8a743a15
cached call
text_hash aba75513deda5ee22ad0a08e98269d124af539f6b7f52b6ed5d2ae65b15120ef
cached call
text_hash b9752bfd9d0f99ec699439c47853dcf05cdb375862f3c19009935e5dd9f6fd7e
cached call
text_hash 4fe407289cfdc4b01edb142d9f5d47d5179540f8ef6a286fe3dbd5f5e5a88e28
cached call
text_hash 865c8b923283965f3ab7adcfe5acdc4902129668d77b6e949dd6c7f54845b367
cached call
text_hash 1054e38d0bebd715e7eec90cf54b39e7de74605124947bc155c6b739379faf8c
cached call
text_hash d95f38b951d70a9d82afc9287590bb764757f7414621fb0e4ae6f834400219e4
cached call
text_hash 240c2d7a3421fb74387cb8d0c43a0bdb5d47c2e17bac77abc9b1409b586017ec
cached call
text_hash 360357e33fa186ed22df163a932e4589a

[0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 8]