In [1]:
import sys
sys.path.append('..')
from aips import *

In [2]:
import pandas
import pickle
import json
import datetime
import dateutil.parser

import html,re
from bs4 import BeautifulSoup

import spacy
from spacy.matcher import Matcher

from sentence_transformers import SentenceTransformer, STutil

In [3]:
def strip_html_lxml(h):
    h = html.unescape(h)
    soup = BeautifulSoup(h, 'lxml')
    text = soup.get_text(separator=' ')
    text = text.strip()
    text = re.sub(r'\s+',' ',text)
    return text
def getData(filename):
    data = []
    df = pandas.read_csv(filename)
    for idx,row in df.iterrows():
        body = row["body"]
        if body and isinstance(body,str) and len(body)>0:
            text = strip_html_lxml(body)
            data.append((text,row))
    return data

In [4]:
SAVE = False

if SAVE:
    outdoors = getData('../../../../temp/outdoors/posts.csv')
    with open('outdoors.pickle','wb') as fd:
        pickle.dump(outdoors,fd)
else:
    with open('outdoors.pickle','rb') as fd:
        outdoors = pickle.load(fd)

In [158]:
def normalize(span):
    #normalizes a noun phrase
    return ' '.join([tok.lemma_.lower() for tok in span])

In [161]:
def getConcepts(tuples):
    #Get all the noun phrases in the content
    noun_phrases = []
    matcher = Matcher(nlp.vocab)
    pattern = [{"POS": "NOUN", "OP": "+"}]
    matcher.add("noun_phrases", [pattern])
    
    nlp = spacy.load('en_core_web_lg')
    for doc,context in nlp.pipe(tuples[0:10000], batch_size=50, as_tuples=True):
        matches = matcher(doc)
        text = doc.text
        for matchid,start,end in matches:
            span = doc[start:end]
            noun_phrases.append(normalize(span))
    
    concepts = {}
    for np in noun_phrases:
        if np not in concepts:
            concepts[np] = 0
        concepts[np] += 1
    
    sorted_concepts = {k: v for k, v in sorted(concepts.items(), key=lambda item: item[1], reverse=True)}
    
    return sorted_concepts


cons = getConcepts(outdoors)

In [164]:
topcons = {k:v for (k,v) in cons.items() if v>500 }
print(len(topcons.keys()))
print(json.dumps(topcons,indent=2))

127
{
  "water": 4619,
  "time": 3475,
  "rope": 3108,
  "way": 2601,
  "day": 2551,
  "tent": 2522,
  "people": 2344,
  "thing": 2310,
  "bag": 2307,
  "area": 2264,
  "lot": 2117,
  "foot": 1916,
  "question": 1640,
  "trail": 1609,
  "weight": 1538,
  "food": 1532,
  "bear": 1468,
  "point": 1447,
  "boot": 1392,
  "body": 1381,
  "shoe": 1364,
  "case": 1343,
  "hand": 1337,
  "one": 1335,
  "place": 1323,
  "climbing": 1315,
  "snow": 1253,
  "year": 1226,
  "problem": 1207,
  "rock": 1205,
  "fire": 1154,
  "trip": 1144,
  "pack": 1132,
  "part": 1088,
  "answer": 1086,
  "bit": 1082,
  "ground": 1078,
  "example": 1071,
  "gear": 1067,
  "side": 1023,
  "hiking": 1001,
  "tree": 983,
  "ice": 975,
  "route": 966,
  "type": 963,
  "experience": 953,
  "knot": 933,
  "hour": 930,
  "mountain": 922,
  "temperature": 915,
  "use": 903,
  "night": 900,
  "camping": 888,
  "heat": 871,
  "situation": 845,
  "end": 834,
  "person": 821,
  "line": 819,
  "condition": 804,
  "anchor": 80

In [165]:
stsb = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')

In [166]:
#Compute embeddings
embeddings = stsb.encode(list(topcons.keys()), convert_to_tensor=True)

In [168]:
#Compute cosine-similarities for each sentence with each other sentence
cosine_scores = STutil.pytorch_cos_sim(embeddings, embeddings)

In [169]:
#Find the pairs with the highest cosine similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})
#Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

In [170]:
topconkeys = list(topcons.keys())
for pair in pairs[0:100]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(topconkeys[i], topconkeys[j], pair['score']))

climbing 		 climber 		 Score: 0.9236
camping 		 camp 		 Score: 0.8789
hiking 		 hike 		 Score: 0.8152
people 		 person 		 Score: 0.7847
snow 		 winter 		 Score: 0.7196
temperature 		 heat 		 Score: 0.6664
pack 		 backpack 		 Score: 0.6561
water 		 river 		 Score: 0.6536
fire 		 stove 		 Score: 0.6353
tent 		 tarp 		 Score: 0.6263
fire 		 heat 		 Score: 0.6233
couple 		 pair 		 Score: 0.5961
safety 		 protection 		 Score: 0.5851
temperature 		 weather 		 Score: 0.5666
temperature 		 condition 		 Score: 0.5637
fire 		 fuel 		 Score: 0.5630
pack 		 example 		 Score: 0.5611
weather 		 fall 		 Score: 0.5601
night 		 end 		 Score: 0.5590
rope 		 rock 		 Score: 0.5582
tent 		 trail 		 Score: 0.5537
heat 		 condition 		 Score: 0.5521
way 		 difference 		 Score: 0.5503
distance 		 length 		 Score: 0.5499
climber 		 clothe 		 Score: 0.5471
weather 		 wind 		 Score: 0.5453
device 		 equipment 		 Score: 0.5429
way 		 level 		 Score: 0.5391
time 		 day 		 Score: 0.5365
condition 		 reason 		 Score:

In [29]:
import nmslib

In [171]:
vocab = [k for (k,v) in cons.items() if v>5]
print(len(vocab))
print(json.dumps(cons,indent=2))

5248
{
  "water": 4619,
  "time": 3475,
  "rope": 3108,
  "way": 2601,
  "day": 2551,
  "tent": 2522,
  "people": 2344,
  "thing": 2310,
  "bag": 2307,
  "area": 2264,
  "lot": 2117,
  "foot": 1916,
  "question": 1640,
  "trail": 1609,
  "weight": 1538,
  "food": 1532,
  "bear": 1468,
  "point": 1447,
  "boot": 1392,
  "body": 1381,
  "shoe": 1364,
  "case": 1343,
  "hand": 1337,
  "one": 1335,
  "place": 1323,
  "climbing": 1315,
  "snow": 1253,
  "year": 1226,
  "problem": 1207,
  "rock": 1205,
  "fire": 1154,
  "trip": 1144,
  "pack": 1132,
  "part": 1088,
  "answer": 1086,
  "bit": 1082,
  "ground": 1078,
  "example": 1071,
  "gear": 1067,
  "side": 1023,
  "hiking": 1001,
  "tree": 983,
  "ice": 975,
  "route": 966,
  "type": 963,
  "experience": 953,
  "knot": 933,
  "hour": 930,
  "mountain": 922,
  "temperature": 915,
  "use": 903,
  "night": 900,
  "camping": 888,
  "heat": 871,
  "situation": 845,
  "end": 834,
  "person": 821,
  "line": 819,
  "condition": 804,
  "anchor": 8

In [172]:
#Compute embeddings
vocab_embeddings = stsb.encode(vocab, convert_to_tensor=True)

In [173]:
print(len(vocab_embeddings))
print(len(vocab))
print(len(vocab_embeddings[0]))

5248
5248
768


In [174]:
# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(vocab_embeddings)
index.createIndex(print_progress=True)

In [175]:
# query for the nearest neighbours of the first datapoint
ids, distances = index.knnQuery(vocab_embeddings[0], k=10)

# get all nearest neighbours for all the datapoint
# using a pool of 4 threads to compute
#neighbours = index.knnQueryBatch(data, k=10, num_threads=4)

In [176]:
print(ids)

[   0 3388 2591 1378 3868  612 4382 3313 4156 4267]


In [217]:
def print_labels(labels):
    for l,d in labels:
        print(l + '\t' + str(d))    

In [220]:
def synonyms(term,index,vocab,embeddings):
    labels = []
    if term in vocab:
        idx = vocab.index(term)
        ids, distances = index.knnQuery(embeddings[idx], k=100)
        for i in range(len(ids)):
            text = vocab[ids[i]] 
            dist = 1.0-distances[i]
            if dist>0.8:
                labels.append((text,dist))
    else:
        print(term, 'was not found in the vocabulary')

    return labels

In [221]:
labels = synonyms('camp fire',index,vocab,vocab_embeddings)
print_labels(labels)

camp fire	0.9999996423721313
campfire	0.9566243886947632
camping stove	0.8573607802391052
camp stove	0.8526521921157837
camp site	0.8003904223442078


In [229]:
def paraphrase(query,model,index,vocab):
    labels = []
    embeddings = model.encode([query], convert_to_tensor=True)
    ids, distances = index.knnQuery(embeddings[0], k=100)
    for i in range(len(ids)):
        text = vocab[ids[i]] 
        dist = 1.0-distances[i]
        if dist>0.8:
            labels.append((text,dist))
    if not len(labels):
        labels.append((vocab[ids[1]],1.0-distances[1]))
    return labels

In [239]:
labels = paraphrase('Where was the fire lit?',stsb,index,vocab)
print_labels(labels)

burning	0.6847879886627197


In [250]:
outdoors_posts = getData('../../../../temp/outdoors/posts.csv')

In [274]:
for text,row in outdoors:
    tags = []
    if row["post_type_id"]==1:
        if row["tags"] and isinstance(row["tags"],str):
            tags = [t for t in re.compile("[\<\>]").split(html.unescape(row["tags"])) if len(t)]
        print(tags)
        print(row)

['health', 'first-aid', 'blisters']
id                                                                          1
post_type_id                                                                1
accepted_answer_id                                                         12
parent_id                                                                 NaN
creation_date                                         2012-01-24T19:55:57.057
deletion_date                                                             NaN
score                                                                      31
view_count                                                               7383
body                        &lt;p&gt;A few times I've been out walking or ...
owner_user_id                                                               9
owner_display_name                                                        NaN
last_editor_user_id                                                     12892
last_editor_display_name    

Name: 1274, dtype: object
['fire']
id                                                                       1353
post_type_id                                                                1
accepted_answer_id                                                       1357
parent_id                                                                 NaN
creation_date                                         2012-04-17T01:51:55.920
deletion_date                                                             NaN
score                                                                      11
view_count                                                               2391
body                        &lt;p&gt;If I build a campfire, but need to mo...
owner_user_id                                                             432
owner_display_name                                                        NaN
last_editor_user_id                                                       NaN
last_editor_display_name     

Name: 2216, dtype: object
['skiing']
id                                                                       3341
post_type_id                                                                1
accepted_answer_id                                                       3351
parent_id                                                                 NaN
creation_date                                         2012-12-11T04:26:18.400
deletion_date                                                             NaN
score                                                                      17
view_count                                                              23126
body                        &lt;p&gt;Occasionally I've seen skiers free-he...
owner_user_id                                                            1926
owner_display_name                                                        NaN
last_editor_user_id                                                       NaN
last_editor_display_name   

Name: 3288, dtype: object
['hiking', 'shoes', 'footwear', 'walking']
id                                                                       4488
post_type_id                                                                1
accepted_answer_id                                                       4490
parent_id                                                                 NaN
creation_date                                         2013-08-19T09:47:06.310
deletion_date                                                             NaN
score                                                                       8
view_count                                                               1225
body                        &lt;p&gt;The title may itself sound a little w...
owner_user_id                                                            2303
owner_display_name                                                        NaN
last_editor_user_id                                                      

Name: 4291, dtype: object
['boats', 'gps', 'theft']
id                                                                       5628
post_type_id                                                                1
accepted_answer_id                                                        NaN
parent_id                                                                 NaN
creation_date                                         2014-04-28T19:27:50.887
deletion_date                                                             NaN
score                                                                       6
view_count                                                                190
body                        &lt;p&gt;I'm planning to buy this &lt;a href=&...
owner_user_id                                                            1863
owner_display_name                                                        NaN
last_editor_user_id                                                      8794
last_editor_

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




['rock-climbing', 'climbing', 'trad-climbing']
id                                                                      20162
post_type_id                                                                1
accepted_answer_id                                                      20163
parent_id                                                                 NaN
creation_date                                         2018-08-09T08:15:09.300
deletion_date                                                             NaN
score                                                                      12
view_count                                                                933
body                        &lt;p&gt;I occasionally place slings on rock s...
owner_user_id                                                             NaN
owner_display_name                                                   user2766
last_editor_user_id                                                     11623
last_editor_disp

['safety', 'animals', 'coyotes', 'pack-animals']
id                                                                      21199
post_type_id                                                                1
accepted_answer_id                                                        NaN
parent_id                                                                 NaN
creation_date                                         2018-12-05T06:26:42.650
deletion_date                                                             NaN
score                                                                       5
view_count                                                                372
body                        &lt;p&gt;I live in a city in Ontario, Canada. ...
owner_user_id                                                           17003
owner_display_name                                                        NaN
last_editor_user_id                                                      2157
last_editor_dis

Name: 17879, dtype: object
['united-states', 'animals']
id                                                                      22129
post_type_id                                                                1
accepted_answer_id                                                        NaN
parent_id                                                                 NaN
creation_date                                         2019-05-15T14:29:50.537
deletion_date                                                             NaN
score                                                                       2
view_count                                                                 73
body                        &lt;p&gt;Each year pronghorn shed the outer co...
owner_user_id                                                            8794
owner_display_name                                                        NaN
last_editor_user_id                                                      8794
last_edi

Name: 18774, dtype: object
['rescue']
id                                                                      24156
post_type_id                                                                1
accepted_answer_id                                                      24157
parent_id                                                                 NaN
creation_date                                         2019-09-19T20:33:11.910
deletion_date                                                             NaN
score                                                                       1
view_count                                                                 83
body                        &lt;p&gt;Garmin inReach devices and its subscr...
owner_user_id                                                            3857
owner_display_name                                                        NaN
last_editor_user_id                                                       NaN
last_editor_display_name  