## Import Libraries

In [87]:
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MultiLabelBinarizer
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing import text

import pickle

## Constants (change these and re-run to see the effect on the model)

In [88]:
# Fraction of the shuffled rows used for training; the rest is held out for testing.
TRAIN_PERCENT = 0.8
# Passed to the tokenizer as num_words and used as the model's input width.
VOCAB_SIZE = 200

## Read and Shuffle Data

In [89]:
# Load the labelled company descriptions and shuffle the rows with a fixed
# seed, so the positional train/test split below is the same on every run.
data = shuffle(pd.read_csv('data/data.csv'), random_state=22)

# Preview the first rows of the shuffled frame.
data.head()

Unnamed: 0,Industry,Symbol,Description
2,Finance,TURN,"Harris & Harris Group Inc. ® (the ""Company"" ""u..."
1,Finance,PIHPP,1347 Property Insurance Holdings Inc. (“PIH” t...
9,Energy,MMLP,"References in this annual report to ""we"" ""ours..."
6,Energy,AREX,Approach Resources Inc. is an independent ener...
3,Finance,CVCY,Central Valley Community Bancorp is a bank hol...


## Binarize Labels (one indicator column per industry)

In [90]:
# An 'Industry' cell is split on ';', so each row becomes a list of one or
# more labels.
industries = [label_cell.split(';') for label_cell in data['Industry'].values]
industries

[['Finance'],
 ['Finance'],
 ['Energy'],
 ['Energy'],
 ['Finance'],
 ['Energy'],
 ['Finance'],
 ['Energy'],
 ['Finance'],
 ['Finance']]

In [91]:
# Turn each row's list of industry labels into a row of 0/1 indicators.
encoder = MultiLabelBinarizer()
industries_encoded = encoder.fit_transform(industries)
# Width of one encoded row; passed to create_model() later as the size of
# the output layer.
num_industries = len(industries_encoded[0])
industries_encoded

array([[0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1]])

## Split Industry data between train / test

In [92]:
# Number of leading rows used for training; int() truncates any fractional part.
train_size = int(TRAIN_PERCENT * len(data))
print('Train Size: {:d}'.format(train_size))
print('Test Size: {:d}'.format(len(data) - train_size))

Train Size: 8
Test Size: 2


In [93]:
# Positional split of the encoded labels: the first train_size rows are used
# for training, everything after them for testing.
train_industries, test_industries = (
    industries_encoded[:train_size],
    industries_encoded[train_size:],
)

## Create Tokenizer Preprocessor Class

In [94]:
class TextPreprocessor(object):
    """Thin wrapper around a Keras ``Tokenizer`` that turns texts into matrices.

    Usage is two-step: call ``create_tokenizer`` once on the texts that define
    the vocabulary, then call ``transform_text`` on any texts to encode.
    """

    def __init__(self, vocab_size):
        """Remember the vocabulary size; the tokenizer itself is created later.

        Args:
            vocab_size: Forwarded to the tokenizer as ``num_words``.
        """
        self.vocab_size = vocab_size
        # Stays None until create_tokenizer() has been called.
        self.tokenizer = None

    def create_tokenizer(self, text_list):
        """Fit a fresh tokenizer on ``text_list`` and store it on the instance.

        Args:
            text_list: Texts used to fit the tokenizer.
        """
        tokenizer = text.Tokenizer(num_words=self.vocab_size)
        tokenizer.fit_on_texts(text_list)
        self.tokenizer = tokenizer

    def transform_text(self, text_list):
        """Encode ``text_list`` with the fitted tokenizer.

        Args:
            text_list: Texts to encode.

        Returns:
            The result of ``tokenizer.texts_to_matrix(text_list)``.

        Raises:
            RuntimeError: If ``create_tokenizer`` has not been called yet.
        """
        # Fail with a clear message instead of an AttributeError on None.
        if self.tokenizer is None:
            raise RuntimeError(
                'TextPreprocessor has no tokenizer yet; '
                'call create_tokenizer() before transform_text().'
            )
        return self.tokenizer.texts_to_matrix(text_list)

## Create Bag of Words Matrices

In [95]:
# Split the raw description text at train_size, the same index used for the
# encoded labels.
train_descriptions, test_descriptions = (
    data['Description'].values[:train_size],
    data['Description'].values[train_size:],
)

# The tokenizer is fitted on the training descriptions only.
processor = TextPreprocessor(VOCAB_SIZE)
processor.create_tokenizer(train_descriptions)

# Encode both splits (train first, then test) with that fitted tokenizer.
body_train, body_test = map(
    processor.transform_text, (train_descriptions, test_descriptions)
)

## Preview Training Data

In [96]:
# Print the width of the first encoded training row, then the row itself.
print(len(body_train[0]), body_train[0], sep='\n')

200
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0.
 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0.
 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1.
 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 1.]


## Save the state of our processor

In [97]:
# Persist the fitted preprocessor so the same vocabulary can be reused later.
# NOTE(review): pickle records the class by reference, and TextPreprocessor is
# defined in this notebook -- presumably the loading code must define the same
# class before unpickling; confirm against the consumer.
with open('./processor_state.pkl', 'wb') as f:
    f.write(pickle.dumps(processor))

## Create the Model

In [98]:
def create_model(vocab_size, num_tags):
    """Build and compile a small fully connected classifier.

    Args:
        vocab_size: Length of each input vector.
        num_tags: Number of units in the output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(50, input_shape=(vocab_size,), activation='relu'),
        Dense(25, activation='relu'),
        # One sigmoid unit per tag, paired with binary cross-entropy below.
        Dense(num_tags, activation='sigmoid'),
    ]
    model = Sequential(layers)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Input width is the vocabulary size; output width is the number of encoded
# industry columns.
model = create_model(vocab_size=VOCAB_SIZE, num_tags=num_industries)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 50)                10050     
_________________________________________________________________
dense_4 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 52        
Total params: 11,377
Trainable params: 11,377
Non-trainable params: 0
_________________________________________________________________


## Train the Model

In [103]:
# Display the raw training descriptions that the tokenizer was fitted on.
train_descriptions

array(['Harris & Harris Group Inc. ® (the "Company" "us" "our" and "we") is an internally managed investment company. We have elected to be regulated as a business development company ("BDC") under the Investment Company Act of 1940 (the "1940 Act"). For tax purposes we have elected to be treated as a regulated investment company ("RIC") under Subchapter M of the Internal Revenue Code of 1986 (the "Code"). However as is discussed in detail in Note 10 to our Consolidated Financial Statements we did not qualify as a RIC in 2016 but we had no taxable income. We were incorporated under the laws of the state of New York in August 1981. We are overseen by our Board of Directors and managed by our officers and have no external investment adviser.   Historically our investment objective has been to achieve long-term capital appreciation investing in venture capital investments. Our focus was making investments in transformative companies enabled by what we believed was disruptive science. More

In [104]:
# Train on the encoded descriptions, one sample per batch. validation_split
# holds part of the training rows out for validation (1 of the 8 rows in the
# recorded run).
model.fit(
    x=body_train,
    y=train_industries,
    batch_size=1,
    epochs=3,
    validation_split=0.1,
)

Train on 7 samples, validate on 1 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1a3696aba8>

## Evaluate the Model

In [105]:
# Score the held-out rows; the result lists the loss followed by the
# 'accuracy' metric the model was compiled with.
model.evaluate(x=body_test, y=test_industries, batch_size=128)



[0.7143846750259399, 0.5]

## Save the Model

In [106]:
# Write the trained model to an HDF5 file in the working directory.
model.save(filepath='industry_classifier.h5')

## Manual Model Testing