# Keywords dictionary

### Imports and function definitions

In [1]:
import numpy as np  # for saving dictionary at the end 

import spacy  # for lemmanizing
nlp = spacy.load('en_core_web_sm')  # here we could easily use a bigger model

from nltk.stem import PorterStemmer  # for stemming
ps = PorterStemmer()  # this can also be changed out for other prebuilts

def remove_trailing_spaces(string):
    '''Return `string` with all trailing space characters removed.

    Only the space character (' ') is stripped; tabs, newlines and any other
    trailing whitespace are left untouched.

    An empty string, or a string consisting solely of spaces, yields ''.
    (Indexing `string[-1]` in a loop raises IndexError for both of these
    inputs, which is why `str.rstrip` is used instead.)
    '''
    return string.rstrip(' ')

### Load text and place in dictionary

In [2]:
filename = 'data/dictionary_v2.txt'
# Load the keywords from a text file. Entries are expected one per line, with
# categories separated from each other by two blank lines ('\n\n\n').
with open(filename) as f:
    text = f.read()

# Drop the file's final newline, but only when it is really there: slicing off
# the last character unconditionally would truncate the last keyword of a file
# that does not end in a newline.
if text.endswith('\n'):
    text = text[:-1]

In [3]:
# Split the raw text into category blocks (separated by two blank lines) and
# each block into its lines. The first line of a block is used as the category
# name; every following line is one keyword.
lst = [block.split('\n') for block in text.split('\n\n\n')]

# Nested mapping: category name -> keyword position -> {'text': keyword},
# with trailing spaces removed from each keyword.
dic = {
    block[0]: {
        position: {'text': remove_trailing_spaces(keyword)}
        for position, keyword in enumerate(block[1:])
    }
    for block in lst
}

### Stemming and Lemmatizing

In [4]:
# Annotate every keyword entry in place: run its text through the spaCy
# pipeline, then store the per-token lemmas and the Porter stem of each token.
for entries in dic.values():
    for entry in entries.values():
        tokens = nlp(entry['text'])
        entry['lemmas'] = [token.lemma_ for token in tokens]
        entry['stems'] = [ps.stem(str(token)) for token in tokens]

# Cell output: (category name, number of keywords) for each category.
[(category, len(entries)) for category, entries in dic.items()]

[('DBS', 97),
 ('Scope of Digital Business Strategy', 88),
 ('Scale of Digital Business Strategy', 77),
 ('Speed of Digital Business Strategy', 116),
 ('Sources of Value Creation and Capture', 91)]

### Save dict

In [5]:
# Persist the nested keyword dictionary to an .npz archive. The dict is passed
# positionally, so it is stored under NumPy's auto-generated array name; it is
# read back below via `file.files[0]` with `allow_pickle` enabled.
np.savez('data/keywords_dict.npz', dic)

In [6]:
# Read the dictionary back to check the round trip.
# allow_pickle is required because the archive holds a Python dict rather than
# a plain numeric array. Unpickling can execute arbitrary code, so only load
# .npz files from a trusted source (this one was written by the cell above).
# The context manager closes the archive handle once the dict has been
# extracted; `.item()` unwraps the stored 0-d object array into the dict itself.
with np.load('data/keywords_dict.npz', allow_pickle=True) as file:
    dic_loaded = file[file.files[0]].item()

In [7]:
dic_loaded

{'DBS': {0: {'text': 'Digital technology',
   'lemmas': ['Digital', 'technology'],
   'stems': ['digit', 'technolog']},
  1: {'text': 'Digital era',
   'lemmas': ['Digital', 'era'],
   'stems': ['digit', 'era']},
  2: {'text': 'Digital resources',
   'lemmas': ['Digital', 'resource'],
   'stems': ['digit', 'resourc']},
  3: {'text': 'Platform', 'lemmas': ['platform'], 'stems': ['platform']},
  4: {'text': 'Recombination/Integration',
   'lemmas': ['Recombination', '/', 'integration'],
   'stems': ['recombin', '/', 'integr']},
  5: {'text': 'Recombination',
   'lemmas': ['recombination'],
   'stems': ['recombin']},
  6: {'text': 'Combination', 'lemmas': ['combination'], 'stems': ['combin']},
  7: {'text': 'Combining', 'lemmas': ['combine'], 'stems': ['combin']},
  8: {'text': 'combine', 'lemmas': ['combine'], 'stems': ['combin']},
  9: {'text': 'coupling', 'lemmas': ['couple'], 'stems': ['coupl']},
  10: {'text': 'decoupling', 'lemmas': ['decouple'], 'stems': ['decoupl']},
  11: {'text'