# Keywords dictionary

### Imports and function definitions

In [1]:
import numpy as np  # used to save the dictionary at the end

import spacy  # for lemmatizing
nlp = spacy.load('en_core_web_sm')  # a bigger spaCy model could easily be used here

from nltk.stem import PorterStemmer  # for stemming
ps = PorterStemmer()  # can also be swapped out for another prebuilt stemmer

def remove_trailing_spaces(string):
    '''Return `string` with every trailing space character removed.

    Only the space character ' ' is stripped; other trailing whitespace
    (tabs, newlines, ...) is left untouched. Empty and all-space strings
    yield '' -- the earlier index-based loop raised IndexError on both,
    because it evaluated string[-1] on an empty string.
    '''
    return string.rstrip(' ')

### Load text and place in dictionary

In [10]:
filename = '../data/dictionary_v2.txt'
# Load keywords from a text file. Assumes one entry per line, with categories
# separated by two blank lines (the parsing step splits on '\n\n\n').
with open(filename) as f:
    # Strip trailing newlines only. A blind `[:-1]` would cut the last letter off
    # the final keyword whenever the file does not end with a newline, and would
    # leave stray empty entries behind when it ends with more than one.
    text = f.read().rstrip('\n')

text

'DBS\nDigital technology\nDigital era\nDigital resources\nPlatform\nRecombination/Integration\nRecombination\nCombination\nCombining\ncombine\ncoupling\ndecoupling\nrecoupling\ncontinuum\ndomino\ntranscend\nloyalty\nharmony\nsolution\nproblem setting\nconnotation\nsolidarity\novercoming limits\nknowledge\nskills\nskilled\nexperience\ncomplexity\nhyper\nreact\ncontinuous\nchange\nagility\nagile\ngeneric\nquality\nflexibility\nleadership\nlearning\nacumen\nniche\nmultiplicity\nchoice\nsavvy\ninformed\nproliferation\nadvancement\nexpertise\nexpert\ncreativity\nspirit\nshare\npost-industrial\ndifferent\ndeluge\norganic\nknowledge factory\nflexibility\nvariety\nevolve\nidea\ncognitive\nvirtual\npersonalization\nintelligence\ncommunity\npractice\nintegration\ncluster\nsegments\nworkgroup\nteam\ninterface\nportfolio\nknowledge-based\ncompetence\ncompetencies\ncompetent\nversatile\nsharing\nexperience\nco-locate\nco-location\nseamless\nproject\nconvene\ndisperse\ncycle\naccumulate\nlearning\nd

In [11]:
filename = '../data/dictionary_v4.txt'
# Load keywords from a text file. Assumes one entry per line, with categories
# separated by two blank lines (the parsing step splits on '\n\n\n').
with open(filename) as f:
    # Strip trailing newlines only. A blind `[:-1]` would cut the last letter off
    # the final keyword whenever the file does not end with a newline, and would
    # leave stray empty entries behind when it ends with more than one.
    text = f.read().rstrip('\n')

text

'Digital\nDigital\nDigitization\ndigitalization\nEcosystem\nPlatform\nData\nInformation\ninformation-based \nknowledge\nArtificial Intelligence\nAI\nIT\nIT-enabled\nInformation technologies\nInformation technology\n\n\nRecombination/Integration\nRecombination\nCombination\nCombining\ncombine\ncoupling\ndecoupling\nrecoupling\ncontinuum\ndomino\ntranscend\nloyalty\nharmony\nsolution\nproblem setting\nconnotation\nsolidarity\novercoming\nlimits\novercoming \nknowledge\nskills\nskilled\nexperience\ncomplexity\nhyper\nreact\ncontinuous\nchange\nagility\nagile\ngeneric\nquality\nflexibility\nleadership\nlearning\nacumen\nniche\nmultiplicity\nchoice\nsavvy\ninformed\nproliferation\nadvancement\nexpertise\nexpert\ncreativity\nspirit\nshare\npost-industrial\ndifferent\ndeluge\norganic\nfactory\nflexibility\nvariety\nevolve\nidea\ncognitive\nvirtual\npersonalization\nintelligence\ncommunity\npractice\nintegration\ncluster\nsegments\nworkgroup\nteam\ninterface\nportfolio\nknowledge-based\ncompet

In [13]:
# Cut the raw text into category chunks (split on '\n\n\n'), then each chunk into lines.
lst = [chunk.split('\n') for chunk in text.split('\n\n\n')]

# The first line of a chunk is the category name; every following line becomes an
# entry keyed by its 0-based position, holding the text without trailing spaces.
dic = {
    category: {
        position: {'text': remove_trailing_spaces(entry)}
        for position, entry in enumerate(entries)
    }
    for category, *entries in lst
}

### Stemming and Lemmatizing

In [14]:
# Annotate every entry in place: run its text through the spaCy pipeline `nlp`,
# then store one lemma and one Porter stem per token.
for entries in dic.values():
    for entry in entries.values():
        tokens = nlp(entry['text'])
        entry['lemmas'] = [token.lemma_ for token in tokens]
        entry['stems'] = [ps.stem(str(token)) for token in tokens]

# Cell output: (category name, number of entries) for each category.
[(category, len(entries)) for category, entries in dic.items()]

[('Digital', 15),
 ('Recombination/Integration', 94),
 ('Scope of Digital Business Strategy', 80),
 ('Scale of Digital Business Strategy', 78),
 ('Speed of Digital Business Strategy', 120),
 ('Sources of Value Creation and Capture', 94)]

### Save dict

In [16]:
# Store the nested dict as the archive's single member. `arr_0` is the name
# np.savez gives to its first positional array; it is spelled out here explicitly.
np.savez('../data/keywords_dict_v4.npz', arr_0=dic)

In [17]:
# The dict was saved wrapped in an object array, so reading it back needs
# allow_pickle=True. NOTE: unpickling can execute arbitrary code -- only load
# archives written by this notebook or another trusted source.
# The `with` block closes the underlying .npz file handle once the dict is extracted.
with np.load('../data/keywords_dict_v4.npz', allow_pickle=True) as file:
    # First (and only) stored array; .item() unwraps it back into the Python dict.
    dic_loaded = file[file.files[0]].item()

In [18]:
# Display the reloaded dictionary (presumably a visual check of the save/load round trip).
dic_loaded

{'Digital': {0: {'text': 'Digital', 'lemmas': ['Digital'], 'stems': ['digit']},
  1: {'text': 'Digitization', 'lemmas': ['digitization'], 'stems': ['digit']},
  2: {'text': 'digitalization',
   'lemmas': ['digitalization'],
   'stems': ['digit']},
  3: {'text': 'Ecosystem', 'lemmas': ['ecosystem'], 'stems': ['ecosystem']},
  4: {'text': 'Platform', 'lemmas': ['platform'], 'stems': ['platform']},
  5: {'text': 'Data', 'lemmas': ['datum'], 'stems': ['data']},
  6: {'text': 'Information', 'lemmas': ['information'], 'stems': ['inform']},
  7: {'text': 'information-based',
   'lemmas': ['information', '-', 'base'],
   'stems': ['inform', '-', 'base']},
  8: {'text': 'knowledge', 'lemmas': ['knowledge'], 'stems': ['knowledg']},
  9: {'text': 'Artificial Intelligence',
   'lemmas': ['Artificial', 'Intelligence'],
   'stems': ['artifici', 'intellig']},
  10: {'text': 'AI', 'lemmas': ['AI'], 'stems': ['ai']},
  11: {'text': 'IT', 'lemmas': ['it'], 'stems': ['it']},
  12: {'text': 'IT-enabled',
