In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [2]:
embedding_dim = 50
max_length = 100
training_portion = .85 # set ratio of train (80%) and validation (20%)
list_of_patents = []
labels = []

In [3]:
# Read data and remove stopword
with open("basf_challenge/data/uspto.csv", 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[4]) #3 for categories (A-H), 4 for sub-categories (A08J)
        patent = row[8] #2 for title, 8 for abstract
        for word in STOPWORDS:
            token = ' ' + word + ' '
            patent = patent.replace(token, ' ')
            patent = patent.replace(' ', ' ')
        list_of_patents.append(patent)
print(len(labels))
print(len(list_of_patents))

12343
12343


In [4]:
train_size = int(len(list_of_patents) * training_portion)
train_patents = list_of_patents[0: train_size]
train_labels = labels[0: train_size]
validation_patents = list_of_patents[train_size:]
validation_labels = labels[train_size:]

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_patents)
word_index = tokenizer.word_index

vocab_size=len(word_index)
dict(list(word_index.items())[0:20]) ## print out first 20 index of vocabulary

{'the': 1,
 'first': 2,
 'second': 3,
 'a': 4,
 'one': 5,
 'includes': 6,
 'device': 7,
 'data': 8,
 'system': 9,
 'least': 10,
 'method': 11,
 'may': 12,
 'plurality': 13,
 'portion': 14,
 'configured': 15,
 'based': 16,
 'layer': 17,
 'surface': 18,
 'unit': 19,
 'provided': 20}

In [6]:
train_sequences = tokenizer.texts_to_sequences(train_patents)
# First of 50 records in token form
for i in range(50):
    print(train_sequences[i])

[1, 39, 48, 133, 15401, 32, 6, 1039, 19, 1435, 6552, 23, 12397, 817, 4203, 34, 24, 19, 618, 34, 19, 34, 86, 81, 939, 23, 1321, 106, 81, 245, 6552, 23, 1206, 939, 6552, 23]
[35, 6553, 7, 304, 1351, 988, 267, 527, 2435, 1296, 1352, 27, 1184, 493, 5, 352, 87, 4069, 1, 7, 710, 817, 12397, 323, 12398, 155, 15402, 2404, 4354, 12399, 15403]
[4, 15404, 9, 11, 636, 65, 74, 4508, 1898, 817, 6, 2, 27, 71, 583, 27, 743, 7, 9299, 27, 580, 201, 27, 817, 1395, 41, 249, 553, 817, 26, 7654, 777, 41, 1072, 1395, 41, 743, 7, 1525, 453, 1395, 41, 131, 817, 201, 27, 236, 380, 1453, 777, 27, 493, 817]
[4, 9, 308, 871, 817, 67, 235, 332, 2110, 817, 10, 5, 2141, 2575, 2111, 235, 33, 707, 4853, 2141, 14, 2575, 767, 2111, 235, 33, 2576, 196, 973, 2405, 235, 332, 195, 817, 66, 19, 15, 24, 9, 1, 19, 618, 9, 33, 2, 817, 2405, 1185, 2, 586, 989, 33, 989, 2405, 957, 2, 586, 989, 33, 9, 785, 4070, 2, 817, 827, 5, 871, 2, 817, 1339, 9, 256, 4070, 2, 817]
[35, 6553, 7655, 39, 428, 15405, 1899, 1105, 817, 8350, 548, 439

In [7]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
# First of 50 records after padding to size 20
for i in range(50):
    print(train_padded[i])

[    1    39    48   133 15401    32     6  1039    19  1435  6552    23
 12397   817  4203    34    24    19   618    34    19    34    86    81
   939    23  1321   106    81   245  6552    23  1206   939  6552    23
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[   35  6553     7   304  1351   988   267   527  2435  1296  1352    27
  1184   493     5   352    87  4069     1     7   710   817 12397   323
 12398   155 15402  2404  4354 12399 15403     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0 

In [8]:
validation_sequences = tokenizer.texts_to_sequences(validation_patents)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding='post', truncating='post')

In [9]:
# set of lables
print(set(labels))

{'A23L', 'B64B', 'B22D', 'H03L', 'C08L', 'C11D', 'F25D', 'F16J', 'G21D', 'B09B', 'F15D', 'B60C', 'C23C', 'H03F', 'G10G', 'A61C', 'F24F', 'H04M', 'B21D', 'D06F', 'E03C', 'G16B', 'B06B', 'B60T', 'A22B', 'B65C', 'B62M', 'B60H', 'H04Q', 'F01N', 'F16H', 'A63G', 'B26B', 'G06F', 'A47G', 'F02C', 'G10L', 'C12R', 'C10L', 'B25B', 'H02G', 'G01P', 'A24B', 'E04H', 'B29D', 'B63H', 'C07J', 'A61L', 'F16C', 'G21G', 'A61G', 'C25D', 'F16D', 'C13K', 'G04R', 'F04D', 'F28F', 'B64D', 'H02S', 'A45D', 'A45F', 'G01T', 'A41F', 'F24D', 'B60R', 'C02F', 'A23C', 'B67C', 'B21F', 'A61J', 'B62H', 'E04B', 'F02M', 'C22C', 'F25C', 'G03F', 'F16G', 'C05D', 'G07G', 'B61F', 'B07C', 'E01B', 'G06Q', 'H01T', 'A47B', 'B21J', 'A63B', 'G10K', 'B33Y', 'A01G', 'B31F', 'F24C', 'H04S', 'B44D', 'F41J', 'A61H', 'E04G', 'C30B', 'C07K', 'C11B', 'C12N', 'F21L', 'B60W', 'C25F', 'F15B', 'H03H', 'C03B', 'E05D', 'B66F', 'A61B', 'B65B', 'C01G', 'D01D', 'H04N', 'A22C', 'G04D', 'H02H', 'A47H', 'B44C', 'G07D', 'B43K', 'B41M', 'C10B', 'G01B', 'F16M',

In [10]:
# label to token
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

In [11]:
training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [12]:
# First of 20 labels (token form)
for i in range(20):
    print(training_label_seq[i])

[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]
[6]


In [13]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Checking encode and original
def decode_patent(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print('------------------------')
print(decode_patent(train_padded[20]))
print(train_patents[20])
print('------------------------')

------------------------
the present disclosure pertains system configured determine one parameters based cardiorespiratory information subject determine sleep stage classifications based discriminative undirected probabilistic graphical model conditional random fields using determined parameters the system advantageous sleep structured process parameters determined individual epochs independent time system determines sleep stage classifications based parameters determined current epoch determined relationships parameters sleep stage classifications determined previous epochs and or information the system assume determined parameters discriminative entire sleep stage maybe indicative sleep stage transition alone in embodiments system comprises one sensors one physical computer processors electronic storage user interface ? ? ? ? ? ? ? ?
The present disclosure pertains system configured determine one parameters based cardiorespiratory information subject determine sleep stage classifica

In [14]:
# Glovec embedding
embeddings_index = {};
with open('basf_challenge/glovec/glove.6B.50d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector
        
print(embeddings_matrix.shape) 

(22252, 50)


In [15]:
# Use tf.keras.layers.Bidirectional(tf.keras.layers.LSTM()).
# Use ReLU in place of tanh function.
# Add a Dense layer with 479 units and softmax activation.

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(479, activation='softmax') #9 for categories, 479 for sub-categories
])
model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           1112600   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               58880     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 479)               31135     
Total params: 1,210,871
Trainable params: 98,271
Non-trainable params: 1,112,600
_________________________________________________________________


In [16]:
# Traing model with 15 epochs
num_epochs = 15
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Train on 10491 samples, validate on 1852 samples
Epoch 1/15
10491/10491 - 22s - loss: 4.6818 - accuracy: 0.1406 - val_loss: 3.8748 - val_accuracy: 0.1269
Epoch 2/15
10491/10491 - 18s - loss: 3.9349 - accuracy: 0.1942 - val_loss: 3.6101 - val_accuracy: 0.1479
Epoch 3/15
10491/10491 - 20s - loss: 3.5744 - accuracy: 0.2313 - val_loss: 3.2759 - val_accuracy: 0.2441
Epoch 4/15
10491/10491 - 19s - loss: 3.2869 - accuracy: 0.2665 - val_loss: 3.3092 - val_accuracy: 0.2084
Epoch 5/15
10491/10491 - 18s - loss: 3.0722 - accuracy: 0.2934 - val_loss: 3.0445 - val_accuracy: 0.2981
Epoch 6/15
10491/10491 - 18s - loss: 2.8914 - accuracy: 0.3187 - val_loss: 3.0428 - val_accuracy: 0.2900
Epoch 7/15
10491/10491 - 19s - loss: 2.7365 - accuracy: 0.3447 - val_loss: 3.1054 - val_accuracy: 0.2802
Epoch 8/15
10491/10491 - 20s - loss: 2.5954 - accuracy: 0.3641 - val_loss: 2.9795 - val_accuracy: 0.3251
Epoch 9/15
10491/10491 - 18s - loss: 2.4553 - accuracy: 0.3880 - val_loss: 2.9914 - val_accuracy: 0.3153
Epoch 

In [17]:
# Predict input text
patent_input = ["An apparatus for producing a fundus image includes: a processor and a memory; an illumination component including a light source and operatively coupled to the processor; a camera including a lens and operatively coupled to the processor, wherein the memory stores instructions that, when executed by the processor, cause the apparatus to: execute an automated script for capture of the fundus image; and allow for manual capture of the fundus image."]
seq = tokenizer.texts_to_sequences(patent_input)
padded = pad_sequences(seq, maxlen=max_length)
prediction = model.predict(padded)
print(prediction)
print(labels[np.argmax(prediction)])

[[1.34544123e-07 8.58195946e-02 1.44428378e-02 2.50664819e-03
  2.08372716e-04 1.91945657e-02 7.44544528e-03 1.90164726e-02
  4.15494815e-02 2.48639677e-02 2.90664565e-02 1.71218123e-02
  3.19550396e-04 1.29164830e-02 1.17755018e-03 5.16589847e-04
  2.51078722e-03 1.67183997e-03 1.59251299e-02 1.77737711e-05
  6.98402655e-05 5.45347482e-03 5.71293663e-03 1.56998227e-03
  5.28843702e-05 1.10884449e-02 8.83810222e-04 1.07269734e-03
  2.34555081e-03 4.11666369e-05 7.55232875e-04 1.78677810e-03
  8.34065396e-03 9.58765019e-03 2.14450574e-03 3.35182442e-04
  6.64932132e-02 3.02704691e-04 1.05271526e-02 7.93003477e-04
  1.06110936e-03 4.23854362e-05 5.72589925e-04 2.73217540e-02
  6.23848811e-02 5.86473867e-02 8.10749829e-04 1.57078970e-02
  1.26053358e-03 4.65807621e-04 1.10212937e-02 1.18564030e-05
  3.17890141e-07 2.12409883e-03 1.25036924e-04 4.99082590e-03
  2.39281959e-04 3.00429971e-03 2.72137004e-05 3.75793781e-04
  6.17722180e-05 1.98774524e-05 2.35894974e-03 2.89574987e-03
  1.5404