# ## second clustering approach: pair of keywords 
This approach uses a new app dataset, which is the union of the official Samsung SmartThings apps (https://github.com/vomar18/SmartThingsPublic) and the nsslabcuus apps (https://github.com/nsslabcuus/IoTMon).
1. check every single app description in order to identify its representative keyword: "The entity keyword with the highest aggregated value is considered as a representative keyword for the cluster"
2. create the similarity matrix
3. save the pairs of words with the highest aggregated value (higher than 90%??)
4. save the result inside: "cluster_pysical_channels.JSON"

In [1]:
# !!attention!! use Python 3.6 to run the stanza package, which is
# the Official Stanford NLP Python Library for Many Human Languages

from gensim.models import Word2Vec, KeyedVectors 
from sklearn.cluster import KMeans
from scipy import spatial
import json   # for read the JSON file
import stanza # Official Stanford NLP Python Library for Many Human Languages
import re     # for preprocessing the descriptions (necessary for using the word2vec)
import numpy as np

In [2]:
# Read the app descriptions from the JSON file and keep them in `descriptions`.
descriptions = {}  # dictionary with descriptions (reset first, so a failed load leaves no stale data)
# The encoding is passed explicitly: the descriptions contain non-ASCII
# characters (e.g. "®") and the default encoding of open() depends on the platform.
with open("app-description.JSON", "r", encoding="utf-8") as read_file:
    descriptions = json.load(read_file)

In [3]:
# Report how many app descriptions were loaded.
print(f"total app description found: {len(descriptions)}")

total app description found: 140


In [4]:
# Download the English stanza models and build the default English neural pipeline.
stanza.download('en')
nlp = stanza.Pipeline('en')

# Pre-trained word2vec vectors in binary format; `limit=100000` caps how many
# word vectors are read from the file.
word2vec_path = '/home/volta/Documenti/CCS18_CS/word2vec/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=100000)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-09-24 10:14:49 INFO: Downloading default packages for language: en (English)...
2021-09-24 10:14:52 INFO: File exists: /home/volta/stanza_resources/en/default.zip.
2021-09-24 10:14:57 INFO: Finished downloading models and saved to /home/volta/stanza_resources.
2021-09-24 10:14:57 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-09-24 10:14:57 INFO: Use device: cpu
2021-09-24 10:14:57 INFO: Loading: tokenize
2021-09-24 10:14:57 INFO: Loading: pos
2021-09-24 10:14:57 INFO: Loading: lemma
2021-09-24 10:14:57 INFO: Loading: depparse
2021-09-24 10:14:58 INFO: Loading: sentiment
2021-09-24 10:14:58 INFO: Loading: ner
2021-09-24 10:14:59 INFO: Done loading processors!


In [14]:
set_of_keywords = []          # keywords selected over all app descriptions (kept free of duplicates)
POS_accepted = ["NN", "NNS"]  # part-of-speech tags accepted ("NNP" and "NNPS" were removed)
percentage_lv = 25            # minimum similarity (%) between two keywords of the same description

for app, description in descriptions.items():  # process every app description
    print("analysing app description: ", app)

    # Step 1: collect the distinct NN/NNS words of the description, in order of appearance.
    keywords = []
    des_analysed = nlp(description)  # apply Stanford NLP to the description
    for sent in des_analysed.sentences:
        for token in sent.words:
            if token.xpos in POS_accepted and token.text not in keywords:
                keywords.append(token.text)

    # Step 2: keep only the words for which word2vec can provide a vector.
    keyword_vector = {}  # keyword -> vector representation
    for noun in keywords:
        noun = str(noun)
        if len(noun) <= 1:  # single-character tokens are skipped
            print("word not processed: ", noun)
            continue
        try:
            keyword_vector[noun] = model[noun]
        except KeyError as err:  # the word is not in the word2vec vocabulary
            print("handling Error: ", err)

    keywords = list(keyword_vector)  # every keyword that word2vec can process
    tot_keyword_found = len(keywords)

    # Step 3: select the keywords of this description.
    if tot_keyword_found == 0:
        print("no one keyword found")

    elif tot_keyword_found == 1:
        print("tot_keywords:1 -", keywords[0])
        if keywords[0] not in set_of_keywords:
            set_of_keywords.append(keywords[0])

    elif tot_keyword_found == 2:  # exactly one pair: save both words directly
        print("tot_keywords:2 -", keywords)
        for candidate in keywords:
            if candidate not in set_of_keywords:
                set_of_keywords.append(candidate)

    else:  # many keywords: save only the members of sufficiently similar pairs
        similarity_matrix = np.ones((tot_keyword_found, tot_keyword_found))

        print("tot_keywords:", tot_keyword_found, "-", keywords)
        for key_x in range(tot_keyword_found):
            # only the pairs above the diagonal are evaluated (each unordered pair once)
            for key_y in range(key_x + 1, tot_keyword_found):
                # model.similarity() is equal to 1 - cosine distance; scaled to a percentage
                similarity_matrix[key_x, key_y] = model.similarity(keywords[key_x], keywords[key_y]) * 100
                if similarity_matrix[key_x, key_y] >= percentage_lv:
                    for candidate in (keywords[key_x], keywords[key_y]):
                        if candidate not in set_of_keywords:
                            print("add", candidate)
                            set_of_keywords.append(candidate)

print(len(set_of_keywords))
set_of_keywords.sort()
set_of_keywords

analysing app description:  alfred-workflow
tot_keywords:2 - ['things', 'graph']
analysing app description:  beacon-control
tot_keywords: 5 - ['Home', 'phrase', 'lights', 'door', 'region']
analysing app description:  big-turn-off
tot_keywords:1 - lights
analysing app description:  big-turn-on
tot_keywords:1 - lights
analysing app description:  bon-voyage
tot_keywords: 14 - ['set', 'Presence', 'tags', 'smartphones', 'mode', 'change', 'everyone', 'conjunction', 'Turn', 'lights', 'appliances', 'thermostat', 'security', 'apps']
add smartphones
add appliances
add apps
add Turn
add thermostat
analysing app description:  bose-soundtouch-control
word not processed:  ®
tot_keywords: 4 - ['Sound', 'actions', 'place', 'home']
add place
add home
analysing app description:  brighten-dark-places
tot_keywords: 3 - ['lights', 'sensor', 'space']
analysing app description:  brighten-my-path
tot_keywords:2 - ['lights', 'motion']
analysing app description:  button-controller
tot_keywords: 3 - ['Control', 

tot_keywords: 8 - ['reminder', 'medicine', 'cabinet', 'drawer', 'time', 'notification', 'text', 'message']
analysing app description:  mini-hue-controller
tot_keywords:2 - ['Hue', 'bulbs']
analysing app description:  mood-cube
tot_keywords:2 - ['lighting', 'cube']
analysing app description:  nfc-tag-toggle
handling Error:  "Key 'toggling' not present"
tot_keywords: 7 - ['switch', 'lock', 'garage', 'door', 'Tag', 'touch', 'event']
analysing app description:  notify-me-when
tot_keywords: 3 - ['notifications', 'anything', 'home']
analysing app description:  notify-me-when-it-opens
tot_keywords: 4 - ['push', 'message', 'phone', 'sensor']
add phone
analysing app description:  notify-me-with-hue
tot_keywords: 12 - ['color', 'brightness', 'Hue', 'bulbs', 'variety', 'motion', 'contact', 'acceleration', 'moisture', 'presence', 'sensors', 'switches']
add color
add contact
analysing app description:  once-a-day
tot_keywords:2 - ['switches', 'time']
analysing app description:  opent2t-smartapp-tes

tot_keywords: 4 - ['house', 'energy', 'use', 'account']
analysing app description:  weatherbug-home
tot_keywords:1 - Weather
analysing app description:  weather-underground-pws-connect
tot_keywords:1 - sensor
analysing app description:  wemo-connect
handling Error:  "Key 'WeMo' not present"
handling Error:  "Key 'Wemo' not present"
tot_keywords: 3 - ['Switch', 'Motion', 'sensor']
analysing app description:  when-its-going-to-rain
tot_keywords: 7 - ['shed', 'windows', 'grill', 'dogs', 'lawn', 'plants', 'tomorrow']
add lawn
add grill
analysing app description:  whole-house-fan
word not processed:  x
handling Error:  "Key 'Thermostat' not present"
tot_keywords: 4 - ['house', 'fan', 'switch', 'temp']
analysing app description:  withings
tot_keywords:1 - scale
analysing app description:  withings-manager
no one keyword found
analysing app description:  working-from-home
tot_keywords: 5 - ['time', 'day', 'person', 'home', 'action']
add person
analysing app description:  yoics-connect
handlin

['Account',
 'Alarm',
 'Button',
 'Change',
 'Device',
 'Devices',
 'Harmony',
 'Hue',
 'LED',
 'Manager',
 'Monitors',
 'Multi',
 'Note',
 'SMS',
 'Service',
 'Turn',
 'Updates',
 'Users',
 'Weather',
 'account',
 'actions',
 'activities',
 'air_conditioner',
 'alarm',
 'ambient',
 'amount',
 'anytime',
 'app',
 'appliance',
 'appliances',
 'apps',
 'arrival',
 'bathroom',
 'battery',
 'bed',
 'bedroom',
 'brightness',
 'bulbs',
 'burst',
 'button',
 'cabinet',
 'camera',
 'cameras',
 'candle',
 'capabilities',
 'car',
 'carbon_monoxide',
 'change',
 'changes',
 'chat',
 'color',
 'colors',
 'contact',
 'controller',
 'cube',
 'day',
 'days',
 'departure',
 'device',
 'devices',
 'door',
 'doors',
 'drawer',
 'endpoint',
 'energy',
 'events',
 'fire',
 'food',
 'garage',
 'graph',
 'grill',
 'heater',
 'home',
 'hour',
 'house',
 'humidity',
 'internet',
 'intrusion',
 'lawn',
 'leaks',
 'lever',
 'life',
 'light',
 'lighting',
 'lights',
 'location',
 'lock',
 'mail',
 'mailbox',
 'm

In [15]:
# Show how many keywords were selected, then display the list itself.
selected_total = len(set_of_keywords)
print(selected_total)
set_of_keywords

149


['Account',
 'Alarm',
 'Button',
 'Change',
 'Device',
 'Devices',
 'Harmony',
 'Hue',
 'LED',
 'Manager',
 'Monitors',
 'Multi',
 'Note',
 'SMS',
 'Service',
 'Turn',
 'Updates',
 'Users',
 'Weather',
 'account',
 'actions',
 'activities',
 'air_conditioner',
 'alarm',
 'ambient',
 'amount',
 'anytime',
 'app',
 'appliance',
 'appliances',
 'apps',
 'arrival',
 'bathroom',
 'battery',
 'bed',
 'bedroom',
 'brightness',
 'bulbs',
 'burst',
 'button',
 'cabinet',
 'camera',
 'cameras',
 'candle',
 'capabilities',
 'car',
 'carbon_monoxide',
 'change',
 'changes',
 'chat',
 'color',
 'colors',
 'contact',
 'controller',
 'cube',
 'day',
 'days',
 'departure',
 'device',
 'devices',
 'door',
 'doors',
 'drawer',
 'endpoint',
 'energy',
 'events',
 'fire',
 'food',
 'garage',
 'graph',
 'grill',
 'heater',
 'home',
 'hour',
 'house',
 'humidity',
 'internet',
 'intrusion',
 'lawn',
 'leaks',
 'lever',
 'life',
 'light',
 'lighting',
 'lights',
 'location',
 'lock',
 'mail',
 'mailbox',
 'm

In [16]:
# Pairwise similarity between all selected keywords, expressed as a percentage.
tot_keywords = len(set_of_keywords)  # number of selected keywords
physical_channels_matrix = np.zeros((tot_keywords, tot_keywords))
for row, row_keyword in enumerate(set_of_keywords):
    for col, col_keyword in enumerate(set_of_keywords):
        physical_channels_matrix[row, col] = model.similarity(row_keyword, col_keyword)

physical_channels_matrix *= 100  # similarity values -> percentages
np.set_printoptions(precision=2)  # print with two decimals (changes numpy's global print options)
print(physical_channels_matrix)

[[100.    13.33  11.62 ...   7.85  -1.06   3.5 ]
 [ 13.33 100.    17.17 ...  11.98  11.77   6.11]
 [ 11.62  17.17 100.   ...  10.95   8.89   6.54]
 ...
 [  7.85  11.98  10.95 ... 100.    72.61   3.31]
 [ -1.06  11.77   8.89 ...  72.61 100.    11.16]
 [  3.5    6.11   6.54 ...   3.31  11.16 100.  ]]


In [18]:
tot_keywords = len(set_of_keywords)
percentage_channel_level = 37  # minimum similarity (%) for two keywords to be associated

# keyword -> list of the other keywords whose similarity reaches the threshold
physical_channel = {keyword: [] for keyword in set_of_keywords}

for row, keyword in enumerate(set_of_keywords):
    for col, other in enumerate(set_of_keywords):
        if col == row:
            continue  # a keyword is never listed as its own neighbour
        if physical_channels_matrix[row][col] >= percentage_channel_level:
            physical_channel[keyword].append(other)

physical_channel

{'Account': ['account'],
 'Button': [],
 'Change': ['change', 'changes'],
 'Device': ['Devices', 'device', 'devices', 'sensor'],
 'Devices': ['Device', 'Monitors', 'device', 'devices', 'smartphones'],
 'Harmony': [],
 'Hue': [],
 'LED': ['battery', 'brightness', 'bulbs', 'lighting', 'lights', 'sensor'],
 'Manager': [],
 'Monitors': ['Devices'],
 'Multi': [],
 'Note': [],
 'SMS': ['app', 'internet', 'messages', 'notifications', 'phone', 'text'],
 'Service': [],
 'Turn': [],
 'Updates': [],
 'Users': ['user'],
 'Weather': ['humidity', 'weather'],
 'account': ['Account'],
 'actions': ['activities'],
 'activities': ['actions', 'events'],
 'air_conditioner': ['appliance',
  'appliances',
  'bathroom',
  'car',
  'garage',
  'grill',
  'heater',
  'lights',
  'temperature',
  'thermostat',
  'window',
  'windows'],
 'alarm': ['Alarm', 'fire', 'sirens'],
 'ambient': ['temperature'],
 'amount': ['time'],
 'anytime': [],
 'app': ['SMS', 'apps', 'device', 'smartphones', 'user'],
 'appliance': ['

In [11]:
# Creation of the centroids (table 1 - p835): every physical channel is described
# by one main keyword plus the keywords associated with it in `physical_channel`.

# physical channel name -> main keyword used to look up `physical_channel`
channel_main_keyword = {
    "temperature": "temperature",
    "humidity": "humidity",
    "illumination": "lights",
    "motion": "motion",
    "location": "location",
    "smoke": "carbon_monoxide",
    # NOTE(review): "leakage" is built from the keyword "messages" - confirm this is intended.
    "leakage": "messages",
}

dict_physical_channel = {}
for channel, main_keyword in channel_main_keyword.items():
    # Build a NEW list (associated keywords followed by the main keyword itself).
    # Appending to physical_channel[main_keyword] directly would modify that
    # dictionary and add the main keyword again every time this cell is re-run.
    dict_physical_channel[channel] = physical_channel[main_keyword] + [main_keyword]

dict_physical_channel

{'temperature': ['air_conditioner',
  'brightness',
  'carbon_monoxide',
  'heater',
  'humidity',
  'moisture',
  'sensor',
  'sensors',
  'sleep',
  'temp',
  'thermostat',
  'weather',
  'temperature'],
 'humidity': ['air_conditioner',
  'brightness',
  'carbon_monoxide',
  'fire',
  'heater',
  'lighting',
  'moisture',
  'temperature',
  'thermostat',
  'weather',
  'humidity'],
 'illumination': ['brightness',
  'bulbs',
  'camera',
  'cameras',
  'candle',
  'color',
  'colors',
  'hue',
  'lighting',
  'lights',
  'light'],
 'motion': ['camera', 'sensor', 'motion'],
 'location': ['device',
  'hub',
  'place',
  'presence',
  'settings',
  'space',
  'station',
  'weather',
  'location'],
 'smoke': ['air_conditioner',
  'alarm',
  'bathroom',
  'brightness',
  'fire',
  'garage',
  'heater',
  'humidity',
  'leaks',
  'moisture',
  'occupant',
  'sirens',
  'temperature',
  'thermostat',
  'carbon_monoxide'],
 'leakage': ['chat',
  'mail',
  'mailbox',
  'message',
  'notificatio

In [15]:
# Save the physical channel clusters to a JSON file.
# json.dump is used instead of writing the text by hand: the manual writer left a
# trailing comma after the last element of every list and after the last key,
# which is not valid JSON, and it did not escape special characters.
with open("cluster_pysical_channels.JSON", "w") as write_file:
    json.dump(dict_physical_channel, write_file, indent=4)