## clustering of entity keywords 
This notebook uses a new app dataset: the union of the Official Samsung SmartThings apps (https://github.com/vomar18/SmartThingsPublic) and the nsslabcuus apps (https://github.com/nsslabcuus/IoTMon).

the aim of this script is the creation of table 1 (cluster of entity keywords correlated with a specific channel)

evolution:
1. check every single app description in order to identify: "The entity keyword with the highest aggregated value is considered as a representative keyword for the cluster"
2. create the similarity matrix
3. save the pairs of words with the highest aggregated value (above a fixed threshold)
4. save the result inside: "cluster_channels.JSON"

In [32]:
# !!attention!! use Python 3.6 to run the stanza package, which is
# the Official Stanford NLP Python Library for Many Human Languages

from gensim.models import Word2Vec, KeyedVectors 
import json   # for read the JSON file
import stanza # Official Stanford NLP Python Library for Many Human Languages
import numpy as np

In [33]:
# Load the app descriptions (app name -> free-text description) from JSON.
# The encoding is pinned to UTF-8 because the descriptions contain non-ASCII
# characters (e.g. "®") and open() would otherwise fall back to the
# platform's locale encoding, which is not UTF-8 everywhere.
with open("../1_Intra_app_Analysis/app-description.JSON", "r", encoding="utf-8") as read_file:
    descriptions = json.load(read_file)
print("total app description found: " + str(len(descriptions)))

total app description found: 131


In [34]:
# Fetch the English stanza models, then build the default English pipeline
# that is later applied to every app description.
stanza.download('en')
nlp = stanza.Pipeline('en')

# Pre-trained word2vec vectors in binary format; `limit` restricts loading
# to the first 100000 entries of the file.
model = KeyedVectors.load_word2vec_format(
    '../../../GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=100000,
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-10-05 12:13:55 INFO: Downloading default packages for language: en (English)...
2021-10-05 12:13:57 INFO: File exists: /home/volta/stanza_resources/en/default.zip.
2021-10-05 12:14:02 INFO: Finished downloading models and saved to /home/volta/stanza_resources.
2021-10-05 12:14:02 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-10-05 12:14:02 INFO: Use device: cpu
2021-10-05 12:14:02 INFO: Loading: tokenize
2021-10-05 12:14:03 INFO: Loading: pos
2021-10-05 12:14:03 INFO: Loading: lemma
2021-10-05 12:14:03 INFO: Loading: depparse
2021-10-05 12:14:03 INFO: Loading: sentiment
2021-10-05 12:14:04 INFO: Loading: ner
2021-10-05 12:14:04 INFO: Done loading processors!


In [35]:
# Entity keywords collected across all app descriptions. This is a list, not
# a set: duplicates are filtered by hand and it is sorted once at the end.
set_of_keywords = []
# Only words tagged NN or NNS are accepted as keywords, following the example
# of section 4.2, p. 835 ("NNP" and "NNPS" were deliberately removed).
POS_accepted = ["NN", "NNS"]
# Similarity level (in percent) required between two keywords of the same
# description; derived from the example applications of table 1.
percentage_lv = 15


def _remember_keyword(keyword, announce=False):
    """Store *keyword* in set_of_keywords unless it is already there.

    When *announce* is true, print an "add" line for every newly stored word.
    """
    if keyword in set_of_keywords:
        return
    if announce:
        print("add", keyword)
    set_of_keywords.append(keyword)


for app in descriptions:
    print("analysing app description: ", app)
    des_analysed = nlp(descriptions[app])  # run the stanza pipeline on the text

    # Step 1: collect the distinct lower-cased words with an accepted POS tag.
    keywords = []
    for sent in des_analysed.sentences:
        for word in sent.words:
            if word.xpos not in POS_accepted:
                continue
            lowered = word.text.lower()
            if lowered not in keywords:
                keywords.append(lowered)

    # Step 2: keep only the words that the word2vec model has a vector for.
    keyword_vector = {}
    for candidate in map(str, keywords):
        if len(candidate) <= 1:  # single letters / symbols are skipped
            print("word not processed: ", candidate)
            continue
        try:
            keyword_vector[candidate] = model[candidate]
        except KeyError as err:  # word missing from the word2vec vocabulary
            print("handling Error: ", err)

    keywords = list(keyword_vector)
    tot_keyword_found = len(keywords)

    # Step 3: decide which keywords of this description are kept.
    if tot_keyword_found == 0:
        print("no one keyword found")
    elif tot_keyword_found <= 2:
        # One or two keywords: store them directly, no similarity check.
        if tot_keyword_found == 1:
            print("tot_keywords:1 -", keywords[0])
        else:
            print("tot_keywords:2 -", keywords)
        for keyword in keywords:
            _remember_keyword(keyword)
    else:
        # Three or more keywords: store both members of every pair whose
        # similarity reaches percentage_lv.
        print("tot_keywords:", tot_keyword_found, "-", keywords)
        similarity_matrix = np.ones((tot_keyword_found, tot_keyword_found))
        for key_x in range(tot_keyword_found):
            # Only the upper triangle is visited, so each pair is scored once.
            for key_y in range(key_x + 1, tot_keyword_found):
                # model.similarity() is equal to 1 - cosine distance
                score = model.similarity(keywords[key_x], keywords[key_y]) * 100
                similarity_matrix[key_x][key_y] = score
                if similarity_matrix[key_x][key_y] < percentage_lv:
                    continue
                _remember_keyword(keywords[key_x], announce=True)
                _remember_keyword(keywords[key_y], announce=True)

print(len(set_of_keywords))
set_of_keywords.sort()
set_of_keywords

analysing app description:  alfred-workflow
tot_keywords:2 - ['things', 'graph']
analysing app description:  beacon-control
tot_keywords: 5 - ['home', 'phrase', 'lights', 'door', 'region']
add home
add door
add lights
analysing app description:  big-turn-off
tot_keywords:1 - lights
analysing app description:  big-turn-on
tot_keywords:1 - lights
analysing app description:  bon-voyage
tot_keywords: 14 - ['set', 'presence', 'tags', 'smartphones', 'mode', 'change', 'everyone', 'conjunction', 'turn', 'lights', 'appliances', 'thermostat', 'security', 'apps']
add set
add change
add turn
add presence
add security
add tags
add apps
add smartphones
add appliances
add mode
add thermostat
add everyone
analysing app description:  bose-soundtouch-control
word not processed:  ®
tot_keywords: 4 - ['sound', 'actions', 'place', 'home']
add actions
add place
analysing app description:  brighten-dark-places
tot_keywords: 3 - ['lights', 'sensor', 'space']
add sensor
add space
analysing app description:  br

tot_keywords: 8 - ['reminder', 'medicine', 'cabinet', 'drawer', 'time', 'notification', 'text', 'message']
add medicine
analysing app description:  mini-hue-controller
tot_keywords:2 - ['hue', 'bulbs']
analysing app description:  mood-cube
tot_keywords:2 - ['lighting', 'cube']
analysing app description:  nfc-tag-toggle
handling Error:  "Key 'toggling' not present"
tot_keywords: 7 - ['switch', 'lock', 'garage', 'door', 'tag', 'touch', 'event']
add touch
analysing app description:  notify-me-when
tot_keywords: 3 - ['notifications', 'anything', 'home']
analysing app description:  notify-me-when-it-opens
tot_keywords: 4 - ['push', 'message', 'phone', 'sensor']
add phone
analysing app description:  notify-me-with-hue
tot_keywords: 12 - ['color', 'brightness', 'hue', 'bulbs', 'variety', 'motion', 'contact', 'acceleration', 'moisture', 'presence', 'sensors', 'switches']
add color
add variety
add acceleration
add contact
analysing app description:  once-a-day
tot_keywords:2 - ['switches', 'tim

tot_keywords:1 - sensor
analysing app description:  when-its-going-to-rain
tot_keywords: 7 - ['shed', 'windows', 'grill', 'dogs', 'lawn', 'plants', 'tomorrow']
add shed
add plants
add grill
add lawn
add dogs
analysing app description:  whole-house-fan
word not processed:  x
tot_keywords: 5 - ['house', 'fan', 'switch', 'temp', 'thermostat']
add temp
analysing app description:  withings-manager
no one keyword found
analysing app description:  working-from-home
tot_keywords: 5 - ['time', 'day', 'person', 'home', 'action']
add action
analysing app description:  yoics-connect
handling Error:  "Key 'yoics' not present"
tot_keywords:1 - devices
analysing app description:  smart-humidity-vent
tot_keywords: 3 - ['humidity', 'level', 'fans']
add level
analysing app description:  smart-home-monitor
tot_keywords: 5 - ['home', 'intrusion', 'fire', 'carbon_monoxide', 'leaks']
add fire
add intrusion
add leaks
add carbon_monoxide
201


['acceleration',
 'account',
 'action',
 'actions',
 'activities',
 'air_conditioner',
 'alarm',
 'ambient',
 'amount',
 'anytime',
 'app',
 'appliance',
 'appliances',
 'apps',
 'area',
 'arrival',
 'bathroom',
 'battery',
 'bed',
 'bedroom',
 'brightness',
 'buddies',
 'bulbs',
 'burst',
 'button',
 'buttons',
 'cabinet',
 'cameras',
 'candle',
 'capabilities',
 'car',
 'carbon_monoxide',
 'carpooling',
 'change',
 'changes',
 'chat',
 'color',
 'colors',
 'conjunction',
 'contact',
 'control',
 'controller',
 'controls',
 'cube',
 'curling',
 'day',
 'days',
 'departure',
 'device',
 'devices',
 'dogs',
 'door',
 'doors',
 'drawer',
 'element',
 'endpoint',
 'energy',
 'environment',
 'events',
 'everyone',
 'everything',
 'feeder',
 'fire',
 'food',
 'garage',
 'glass',
 'goals',
 'grace',
 'graph',
 'grill',
 'habits',
 'heater',
 'home',
 'hour',
 'house',
 'hub',
 'hue',
 'humidity',
 'input',
 'interior',
 'intruders',
 'intrusion',
 'iron',
 'lawn',
 'leaks',
 'level',
 'level

In [36]:
# Show how many entity keywords were collected from the SmartApp
# descriptions, followed by the keywords themselves.
n_entity_keywords = len(set_of_keywords)
print(n_entity_keywords)
set_of_keywords

201


['acceleration',
 'account',
 'action',
 'actions',
 'activities',
 'air_conditioner',
 'alarm',
 'ambient',
 'amount',
 'anytime',
 'app',
 'appliance',
 'appliances',
 'apps',
 'area',
 'arrival',
 'bathroom',
 'battery',
 'bed',
 'bedroom',
 'brightness',
 'buddies',
 'bulbs',
 'burst',
 'button',
 'buttons',
 'cabinet',
 'cameras',
 'candle',
 'capabilities',
 'car',
 'carbon_monoxide',
 'carpooling',
 'change',
 'changes',
 'chat',
 'color',
 'colors',
 'conjunction',
 'contact',
 'control',
 'controller',
 'controls',
 'cube',
 'curling',
 'day',
 'days',
 'departure',
 'device',
 'devices',
 'dogs',
 'door',
 'doors',
 'drawer',
 'element',
 'endpoint',
 'energy',
 'environment',
 'events',
 'everyone',
 'everything',
 'feeder',
 'fire',
 'food',
 'garage',
 'glass',
 'goals',
 'grace',
 'graph',
 'grill',
 'habits',
 'heater',
 'home',
 'hour',
 'house',
 'hub',
 'hue',
 'humidity',
 'input',
 'interior',
 'intruders',
 'intrusion',
 'iron',
 'lawn',
 'leaks',
 'level',
 'level

## find the similarity between all the found keywords

In [39]:
# Pairwise word2vec similarity between all collected keywords.
# Row/column i corresponds to set_of_keywords[i].
tot_keywords = len(set_of_keywords)
channels_matrix = np.zeros((tot_keywords, tot_keywords))
for key_x, keyword_x in enumerate(set_of_keywords):
    for key_y, keyword_y in enumerate(set_of_keywords):
        channels_matrix[key_x, key_y] = model.similarity(keyword_x, keyword_y)

# Express the similarities as percentages and print them with two decimals.
channels_matrix *= 100
np.set_printoptions(precision=2)
print(channels_matrix)

[[100.     7.27  18.93 ...   5.07  10.8    8.67]
 [  7.27 100.    15.21 ...  12.22   3.37   5.56]
 [ 18.93  15.21 100.   ...  -3.04 -10.45   9.63]
 ...
 [  5.07  12.22  -3.04 ... 100.    72.61   3.31]
 [ 10.8    3.37 -10.45 ...  72.61 100.    11.16]
 [  8.67   5.56   9.63 ...   3.31  11.16 100.  ]]


In [40]:
# Build, for every keyword, the list of other keywords whose similarity with
# it reaches the clustering threshold.
tot_keywords = len(set_of_keywords)
percentage_channel_level = 30  # similarity (%) required to join a keyword's cluster

# keyword -> list of the keywords most similar to it (filled in below)
channel = {keyword: [] for keyword in set_of_keywords}

for key_x in range(tot_keywords):
    for key_y in range(tot_keywords):
        # Use the percentage similarity matrix computed from set_of_keywords.
        # The previous name, physical_channels_matrix, is not defined anywhere
        # in the notebook and only resolved through stale kernel state.
        # NOTE(review): assumes it was the same matrix as channels_matrix; confirm.
        if key_y != key_x and channels_matrix[key_x][key_y] >= percentage_channel_level:
            channel[set_of_keywords[key_x]].append(set_of_keywords[key_y])

channel

{'acceleration': [],
 'account': [],
 'action': ['actions', 'motion', 'response'],
 'actions': ['action', 'activities', 'changes', 'events', 'response'],
 'activities': ['actions', 'events'],
 'air_conditioner': ['alarm',
  'appliance',
  'appliances',
  'bathroom',
  'battery',
  'bedroom',
  'brightness',
  'bulbs',
  'car',
  'carbon_monoxide',
  'garage',
  'grill',
  'heater',
  'house',
  'humidity',
  'lawn',
  'lights',
  'sirens',
  'temperature',
  'thermostat',
  'valve',
  'ventilation',
  'window',
  'windows'],
 'alarm': ['air_conditioner',
  'fire',
  'intruders',
  'lights',
  'sirens',
  'thermostat',
  'ventilation',
 'ambient': ['brightness',
  'carbon_monoxide',
  'humidity',
  'lighting',
  'modes',
  'moisture',
  'sensor',
  'sensors',
  'settings',
  'sound',
  'temperature',
  'ventilation'],
 'amount': ['level', 'levels', 'money', 'number', 'time'],
 'anytime': ['everyone', 'everything', 'someone', 'something', 'time'],
 'app': ['apps',
  'button',
  'device',

In [41]:
# Physical channel identification (table 1 - p835).
#
# Each physical channel is the cluster of one representative keyword plus the
# representative keyword itself. The lists are built with `+`, which creates
# new lists: the clusters stored in `channel` are left untouched, so this
# cell can be re-run without appending the representative keyword again.
dict_channel = {
    "temperature": channel["temperature"] + ["temperature"],
    "humidity": channel["humidity"] + ["humidity"],
    "illumination": channel["lights"] + ["lights"],
    "motion": channel["motion"] + ["motion"],
    "location": channel["location"] + ["location"],
    "smoke": channel["carbon_monoxide"] + ["carbon_monoxide"],
    "leakage": channel["valve"] + ["valve"],
    "time": channel["time"] + ["time"],
    # locationMode is defined by hand rather than taken from a cluster
    "locationMode": ["location", "mode"],
    "lock": channel["lock"] + ["lock"],
}

dict_channel

{'temperature': ['air_conditioner',
  'ambient',
  'brightness',
  'carbon_monoxide',
  'heater',
  'humidity',
  'moisture',
  'sensor',
  'sensors',
  'temp',
  'thermostat',
  'ventilation',
  'water',
  'weather',
  'temperature'],
 'humidity': ['air_conditioner',
  'ambient',
  'brightness',
  'carbon_monoxide',
  'moisture',
  'temperature',
  'thermostat',
  'ventilation',
  'weather',
  'humidity'],
 'illumination': ['air_conditioner',
  'alarm',
  'appliances',
  'brightness',
  'bulbs',
  'cameras',
  'candle',
  'colors',
  'heater',
  'light',
  'lighting',
  'sirens',
  'switch',
  'switches',
  'thermostat',
  'lights'],
 'motion': ['action', 'motion'],
 'location': ['area', 'hub', 'place', 'location'],
 'smoke': ['air_conditioner',
  'ambient',
  'fire',
  'heater',
  'humidity',
  'leaks',
  'moisture',
  'temperature',
  'thermostat',
  'ventilation',
  'carbon_monoxide'],
 'leakage': ['air_conditioner',
  'device',
  'heater',
  'leaks',
  'lever',
  'sensor',
  'sens

In [42]:
# Persist the channel -> keyword-cluster mapping as JSON in the working directory.
with open("cluster_channels.JSON", "w") as json_out:
    json.dump(dict_channel, json_out)