In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [2]:
vocab_size = 3000 # size of vocabulary
embedding_dim = 64
max_length = 20
training_portion = .85 # set ratio of train (85%) and validation (15%)

list_of_patents = []
labels = []

In [3]:
# Read data and remove stopword
with open("basf_challenge/data/uspto.csv", 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[3])
        patent = row[2]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            patent = patent.replace(token, ' ')
            patent = patent.replace(' ', ' ')
        list_of_patents.append(patent)
print(len(labels))
print(len(list_of_patents))

12343
12343


In [4]:
train_size = int(len(list_of_patents) * training_portion)
train_patents = list_of_patents[0: train_size]
train_labels = labels[0: train_size]
validation_patents = list_of_patents[train_size:]
validation_labels = labels[train_size:]

oov_tok = '<OOV>'
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_patents)
word_index = tokenizer.word_index

dict(list(word_index.items())[0:20]) ## print out first 20 index of vocabulary

{'<OOV>': 1,
 'method': 2,
 'device': 3,
 'system': 4,
 'apparatus': 5,
 'methods': 6,
 'systems': 7,
 'using': 8,
 'control': 9,
 'same': 10,
 'vehicle': 11,
 'processing': 12,
 'display': 13,
 'thereof': 14,
 'based': 15,
 'data': 16,
 'image': 17,
 'manufacturing': 18,
 'assembly': 19,
 'storage': 20}

In [5]:
train_sequences = tokenizer.texts_to_sequences(train_patents)
# First of 50 records in token form
for i in range(50):
    print(train_sequences[i])

[1, 5, 9, 2, 14, 12, 5, 12, 2]
[1, 8, 581, 1, 2621]
[1, 64, 4, 2, 78, 2622, 3]
[4, 42, 2, 14, 228, 643, 582]
[310, 2198, 582, 2199, 4]
[24, 5, 2, 42, 10, 43, 124, 98, 38]
[975, 677, 33, 222, 229, 898, 134, 1297]
[1076, 63, 1168]
[2, 46, 395, 2623, 468]
[170, 112, 91, 490, 6, 1446, 62, 7]
[21, 6, 7, 716, 30, 2200, 1447]
[1169, 717, 62, 355, 4]
[831, 1876, 39, 92, 3, 831, 1876, 39, 92, 2]
[1, 15, 338, 1, 339, 976]
[2, 5, 246, 2624, 2201, 1, 448, 1643, 251, 8, 1, 2625, 644, 718, 770]
[5, 2, 93, 1877, 505]
[1878, 1, 1, 1]
[24, 3, 1, 719, 16]
[1, 469, 85, 112, 340]
[645, 85, 33]
[4, 2, 1, 1076, 551, 449]
[230, 678, 36, 7, 6]
[5, 2, 93, 719, 976, 2626]
[118, 1, 470, 8, 192, 2627, 62]
[1877, 70, 252, 5, 18, 2, 14]
[4, 2, 93, 1, 1, 1, 679, 2628, 2629, 1]
[1, 3, 6]
[832, 4, 149, 118]
[4, 2, 93, 287, 1076, 1879, 1880]
[1, 15, 1881, 44, 125, 5, 2, 1448, 506, 1, 8, 1, 583, 1]
[396, 899, 64, 8, 1882, 900]
[6, 92, 1, 2202, 1644, 8, 901]
[1449, 1, 680, 112, 1, 1298, 62, 6]
[902, 1, 1, 1077, 163, 1645

In [6]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
# First of 50 records after padding to size 20
for i in range(50):
    print(train_padded[i])

[ 1  5  9  2 14 12  5 12  2  0  0  0  0  0  0  0  0  0  0  0]
[   1    8  581    1 2621    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[   1   64    4    2   78 2622    3    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[  4  42   2  14 228 643 582   0   0   0   0   0   0   0   0   0   0   0
   0   0]
[ 310 2198  582 2199    4    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[ 24   5   2  42  10  43 124  98  38   0   0   0   0   0   0   0   0   0
   0   0]
[ 975  677   33  222  229  898  134 1297    0    0    0    0    0    0
    0    0    0    0    0    0]
[1076   63 1168    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[   2   46  395 2623  468    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[ 170  112   91  490    6 1446   62    7    0    0    0    0    0    0
    0    0    0    0    0    0]
[  21    6    7  716   30 2200 1447    0    0    0 

In [7]:
validation_sequences = tokenizer.texts_to_sequences(validation_patents)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding='post', truncating='post')

In [8]:
# set of lables
print(set(labels))

{'G', 'F', 'A', 'H', 'C', 'D', 'E', 'B'}


In [9]:
# label to token
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [10]:
# First of 20 labels (token form)
for i in range(50):
    print(training_label_seq[i])

[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]
[3]


In [11]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Checking encode and original
def decode_patent(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print('------------------------')
print(decode_patent(train_padded[20]))
print(train_patents[20])
print('------------------------')

------------------------
system method <OOV> sleep stage classification ? ? ? ? ? ? ? ? ? ? ? ? ? ?
System method cardiorespiratory sleep stage classification
------------------------


In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(9, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          192000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 585       
Total params: 266,889
Trainable params: 266,889
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Traing model with 15 epochs
num_epochs = 15
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Train on 10491 samples, validate on 1852 samples
Epoch 1/15
10491/10491 - 9s - loss: 1.5166 - accuracy: 0.4279 - val_loss: 1.1659 - val_accuracy: 0.4455
Epoch 2/15
10491/10491 - 5s - loss: 1.0291 - accuracy: 0.6301 - val_loss: 1.0513 - val_accuracy: 0.5454
Epoch 3/15
10491/10491 - 5s - loss: 0.8080 - accuracy: 0.7166 - val_loss: 1.2848 - val_accuracy: 0.5005
Epoch 4/15
10491/10491 - 5s - loss: 0.6766 - accuracy: 0.7652 - val_loss: 1.1255 - val_accuracy: 0.5616
Epoch 5/15
10491/10491 - 5s - loss: 0.5798 - accuracy: 0.7963 - val_loss: 1.3148 - val_accuracy: 0.5227
Epoch 6/15
10491/10491 - 5s - loss: 0.5068 - accuracy: 0.8221 - val_loss: 1.4983 - val_accuracy: 0.5232
Epoch 7/15
10491/10491 - 5s - loss: 0.4458 - accuracy: 0.8444 - val_loss: 1.5539 - val_accuracy: 0.5443
Epoch 8/15
10491/10491 - 4s - loss: 0.3970 - accuracy: 0.8589 - val_loss: 1.5008 - val_accuracy: 0.5934
Epoch 9/15
10491/10491 - 4s - loss: 0.3508 - accuracy: 0.8747 - val_loss: 1.8177 - val_accuracy: 0.5432
Epoch 10/15
104

In [15]:
# Predict input text
patent_input = ["Apparatus and method for determining a physiological condition"]
seq = tokenizer.texts_to_sequences(patent_input)
padded = pad_sequences(seq, maxlen=max_length)
prediction = model.predict(padded)
print(prediction)
print(labels[np.argmax(prediction)])

[[0.00608218 0.18759827 0.1466277  0.18126073 0.15570012 0.20532756
  0.06251923 0.04249167 0.01239252]]
A
