# About

Experimenting with word embeddings in Keras. Open question: can this be combined with actual PO data?

https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [39]:
from numpy import array

from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

import pandas as pd
import janitor

In [40]:
# location of the interim UNSPSC item extract, relative to this notebook
p_in = "../data/interim/unspsc-item.csv"

# read every column as text, then let pyjanitor's clean_names() (made
# available by `import janitor`) tidy up the column headers
unspsc_item = pd.read_csv(p_in, dtype=str)
unspsc_item = unspsc_item.clean_names()

In [41]:
# preview the first five rows of the loaded item table
unspsc_item.head(n=5)

Unnamed: 0,normalized_unspsc,item_name
0,10101501,iphone
1,10101502,Amendment 1 - Option to Renew
2,10101502,"CANINE, LAW ENFORCEMENT, NON-BITE, SEARCH DOG"
3,10101502,CANINES
4,10101502,"CANINES, NON-BITE"


In [42]:
# draw a small 10-row sample; random_state pins the selection so the
# same rows come back on every run
smpl = unspsc_item.sample(n=10, random_state=0)
smpl

Unnamed: 0,normalized_unspsc,item_name
83480,43211503,Ultralight - Dell Lattitude E6230
173478,73101701,Order for Emerg. Prescriptions FY 12/13
14096,15101506,Fuel for Equip. 99C043
95063,43231512,ACAD DEEP SEC ANTI-MALWARE DXNA0575
17286,20121448,bonnet acorn
210573,92101603,Fuel Modification within defensible spacee zon...
85498,43211601,Cisco Catalyst Switch
210981,92121504,Amendments 5 & 6
82091,43211501,computer server
31263,25173107,Garmin Rino 650 GPS/2-Way Radio


In [43]:
# the sampled item names, as a plain Python list of strings
docs = smpl["item_name"].tolist()
docs

['Ultralight - Dell Lattitude E6230',
 'Order for Emerg. Prescriptions FY 12/13',
 'Fuel for Equip. 99C043',
 'ACAD DEEP SEC ANTI-MALWARE DXNA0575',
 'bonnet acorn',
 'Fuel Modification within defensible spacee zone and the wildland fire zone betwe',
 'Cisco Catalyst Switch',
 'Amendments 5 & 6',
 'computer server',
 'Garmin Rino 650 GPS/2-Way Radio']

In [44]:
# define class labels: one per document, first five are 1, last five are 0
labels = array([1] * 5 + [0] * 5)

In [45]:
# prepare tokenizer: learn the word -> integer index mapping from the docs
t = Tokenizer()
t.fit_on_texts(docs)

# size of the vocabulary is the number of learned words plus one extra slot
# (the padded sequences built later use 0, which no learned word maps to)
n_known_words = len(t.word_index)
vocab_size = n_known_words + 1

In [46]:
# integer encode the documents: each doc becomes a list of word indices
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

# show how many tokens each document produced; a plain loop is used so the
# cell does not build (and echo) a throwaway list of None values
for seq in encoded_docs:
    print(len(seq))

[[4, 5, 6, 7], [8, 1, 9, 10, 11, 12, 13], [2, 1, 14, 15], [16, 17, 18, 19, 20, 21], [22, 23], [2, 24, 25, 26, 27, 3, 28, 29, 30, 31, 3, 32], [33, 34, 35], [36, 37, 38], [39, 40], [41, 42, 43, 44, 45, 46, 47]]
4
7
4
6
2
12
3
3
2
7


[None, None, None, None, None, None, None, None, None, None]

In [47]:
# pad documents (with trailing zeros) to the length of the longest encoded
# document, so that no document gets truncated
max_length = max(len(seq) for seq in encoded_docs)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 4  5  6  7  0  0  0  0  0  0  0  0]
 [ 8  1  9 10 11 12 13  0  0  0  0  0]
 [ 2  1 14 15  0  0  0  0  0  0  0  0]
 [16 17 18 19 20 21  0  0  0  0  0  0]
 [22 23  0  0  0  0  0  0  0  0  0  0]
 [ 2 24 25 26 27  3 28 29 30 31  3 32]
 [33 34 35  0  0  0  0  0  0  0  0  0]
 [36 37 38  0  0  0  0  0  0  0  0  0]
 [39 40  0  0  0  0  0  0  0  0  0  0]
 [41 42 43 44 45 46 47  0  0  0  0  0]]


In [48]:
# load the whole embedding into memory as {word: vector of float32 coefficients}
embeddings_index = dict()
# the context manager closes the file even if a line fails to parse; the
# explicit encoding keeps decoding independent of the platform default
with open('../data/external/glove/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # each line is: the word, followed by its whitespace-separated coefficients
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [49]:
# create a weight matrix for words in training docs: row i holds the
# pre-trained vector for the word the tokenizer mapped to index i
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # word has no pre-trained vector; its row stays all zeros
        continue
    embedding_matrix[i] = embedding_vector

print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.14401001  0.32554001  0.14257    ...  0.25398001  1.10780001
  -0.073074  ]
 [-0.70880997  0.84255999  0.68713999 ...  0.072446    1.16419995
  -0.20105   ]
 ...
 [-0.38679001  0.61233002  0.14796001 ... -0.16003001  0.64607
  -0.82195002]
 [ 0.15542001  0.22266001  0.66749001 ... -0.25213     0.38793001
   0.83459002]
 [-0.41181001 -1.00619996 -0.23506001 ... -0.11002     1.08140004
   0.31467   ]]


In [54]:
# define model: frozen pre-trained embedding -> flatten -> single sigmoid unit
model = Sequential()

# input length needs to be the same as the padding (max_length), and the
# embedding width must match the weight matrix, so both are taken from the
# objects built earlier instead of being repeated as literals
e = Embedding(
    vocab_size,
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,  # keep the pre-trained vectors fixed during training
)

model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model; summary() prints the table itself and returns None,
# so it is not wrapped in print()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 12, 100)           4800      
_________________________________________________________________
flatten_4 (Flatten)          (None, 1200)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1201      
Total params: 6,001
Trainable params: 1,201
Non-trainable params: 4,800
_________________________________________________________________
None


In [57]:
# fit the model on the padded documents for 50 epochs, without progress output
model.fit(x=padded_docs, y=labels, epochs=50, verbose=0)

<keras.callbacks.History at 0x7f1e6e32a828>

In [58]:
# evaluate the model
# NOTE: padded_docs/labels are the same arrays passed to model.fit, so this
# is accuracy on the training documents, not on held-out data
loss, accuracy = model.evaluate(x=padded_docs, y=labels, verbose=0)
accuracy_pct = accuracy * 100
print('Accuracy: %f' % accuracy_pct)

Accuracy: 100.000000
