### IMDB Dataset

In [4]:
from keras.datasets import imdb

In [5]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


### Custom Dataset

In [48]:
import pandas as pd
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [21]:
df = pd.read_csv('./BasicCompanyDataAsOneFile-2018-03-01.csv', nrows=10000)
df.head()

Unnamed: 0,CompanyName,CompanyNumber,RegAddress.CareOf,RegAddress.POBox,RegAddress.AddressLine1,RegAddress.AddressLine2,RegAddress.PostTown,RegAddress.County,RegAddress.Country,RegAddress.PostCode,CompanyCategory,CompanyStatus,CountryOfOrigin,DissolutionDate,IncorporationDate,Accounts.AccountRefDay,Accounts.AccountRefMonth,Accounts.NextDueDate,Accounts.LastMadeUpDate,Accounts.AccountCategory,Returns.NextDueDate,Returns.LastMadeUpDate,Mortgages.NumMortCharges,Mortgages.NumMortOutstanding,Mortgages.NumMortPartSatisfied,Mortgages.NumMortSatisfied,SICCode.SicText_1,SICCode.SicText_2,SICCode.SicText_3,SICCode.SicText_4,LimitedPartnerships.NumGenPartners,LimitedPartnerships.NumLimPartners,URI,PreviousName_1.CONDATE,PreviousName_1.CompanyName,PreviousName_2.CONDATE,PreviousName_2.CompanyName,PreviousName_3.CONDATE,PreviousName_3.CompanyName,PreviousName_4.CONDATE,PreviousName_4.CompanyName,PreviousName_5.CONDATE,PreviousName_5.CompanyName,PreviousName_6.CONDATE,PreviousName_6.CompanyName,PreviousName_7.CONDATE,PreviousName_7.CompanyName,PreviousName_8.CONDATE,PreviousName_8.CompanyName,PreviousName_9.CONDATE,PreviousName_9.CompanyName,PreviousName_10.CONDATE,PreviousName_10.CompanyName,ConfStmtNextDueDate,ConfStmtLastMadeUpDate
0,! LTD,08209948,,,METROHOUSE 57 PEPPER ROAD,HUNSLET,LEEDS,YORKSHIRE,,LS10 2RU,Private Limited Company,Active,United Kingdom,,11/09/2012,30.0,9.0,30/06/2018,30/09/2016,DORMANT,09/10/2016,11/09/2015,0,0,0,0,99999 - Dormant Company,,,,0,0,http://business.data.gov.uk/id/company/08209948,,,,,,,,,,,,,,,,,,,,,25/09/2019,11/09/2017
1,!NNOV8 LIMITED,11006939,,,C/O FRANK HIRTH 1ST FLOOR,236 GRAY'S INN ROAD,LONDON,,UNITED KINGDOM,WC1X 8HB,Private Limited Company,Active,United Kingdom,,11/10/2017,31.0,3.0,11/07/2019,,NO ACCOUNTS FILED,08/11/2018,,0,0,0,0,62090 - Other information technology service a...,,,,0,0,http://business.data.gov.uk/id/company/11006939,,,,,,,,,,,,,,,,,,,,,24/10/2019,
2,!NSPIRED LTD,SC421617,,,26 POLMUIR ROAD,,ABERDEEN,,UNITED KINGDOM,AB11 7SY,Private Limited Company,Active,United Kingdom,,11/04/2012,30.0,3.0,30/12/2018,30/03/2017,TOTAL EXEMPTION FULL,09/05/2017,11/04/2016,0,0,0,0,70229 - Management consultancy activities othe...,,,,0,0,http://business.data.gov.uk/id/company/SC421617,,,,,,,,,,,,,,,,,,,,,25/04/2020,11/04/2017
3,!NVERTD DESIGNS LIMITED,09152972,,,55A HIGH STREET,,SILSOE,BEDFORDSHIRE,,MK45 4EW,Private Limited Company,Active,United Kingdom,,30/07/2014,31.0,7.0,30/04/2019,31/07/2017,,27/08/2016,30/07/2015,0,0,0,0,58190 - Other publishing activities,,,,0,0,http://business.data.gov.uk/id/company/09152972,,,,,,,,,,,,,,,,,,,,,13/08/2020,30/07/2017
4,!OBAC LIMITED,FC031362,,,1ST AND 2ND FLOORS ELIZABETH HOUSE,LES RUETIES BRAYES,ST PETER PORT,GUERNSEY,GUERNSEY,GY1 1EW,Other company type,Active,CHANNEL ISLANDS,,30/11/2012,31.0,12.0,,31/12/2016,GROUP,,,0,0,0,0,None Supplied,,,,0,0,http://business.data.gov.uk/id/company/FC031362,,,,,,,,,,,,,,,,,,,,,,


In [176]:
def clean(df):
    wdf = df[~df.CompanyName.str.contains('!')].dropna(subset=['Returns.LastMadeUpDate'])
    wdf = wdf[['CompanyName', 'SICCode.SicText_1']]
    wdf['CompanyName'] = wdf.CompanyName.str.replace('[".,()\']', ' ')\
        .str.replace('\s+', ' ').str.strip()\
        .str.replace('LIMITED', 'LTD')\
        .str.replace('INCORPORATED', 'INC')\
        .str.replace('([A-Z])LTD', '\\1 LTD')
    wdf['SICC'] = wdf['SICCode.SicText_1'].str.split(' - ').str[0]
    wdf = wdf[(wdf.SICC != '99999') & wdf.SICC.str.isnumeric()]
    wdf = wdf[wdf.CompanyName.str.count(' ') < 8]
    return wdf

In [177]:
wdf = clean(df)

### Custom Word Embedding

In [82]:
from gensim.models import Word2Vec

In [112]:
sentences = list(wdf.CompanyName.values)
sentences = [line.lower().split(' ') for line in sentences]

In [114]:
model = Word2Vec(sentences, size=100, window=5, min_count=3, workers=4)
vectors = model.wv

In [123]:
vectors.most_similar('construction')

[('royal', 0.9979995489120483),
 ('100', 0.9979317784309387),
 ('123', 0.9979284405708313),
 ('solutions', 0.9979108572006226),
 ('lane', 0.9978777170181274),
 ('services', 0.9978717565536499),
 ('flat', 0.9978603720664978),
 ('to', 0.9978025555610657),
 ('stop', 0.9977952837944031),
 ('121', 0.9977943897247314)]

### Train Custom Embedding Layer

In [124]:
from keras.layers import LSTM, Convolution1D, Flatten, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot

In [149]:
embedding_vector_length = 100
vocab_size = 1000

In [150]:
encoded_company_names = [one_hot(d, vocab_size) for d in wdf.CompanyName]
encoded_company_names = [d for d in encoded_company_names]
sorted((len(d) for d in encoded_company_names), reverse=True)

[10,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,

In [153]:
encoded_company_names = pad_sequences(encoded_company_names, maxlen=maxlen, padding='post')

In [173]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_vector_length, input_length=maxlen))
model.add(Flatten())
model.add(Dense(420, activation='sigmoid'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 10, 100)           100000    
_________________________________________________________________
flatten_5 (Flatten)          (None, 1000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 420)               420420    
Total params: 520,420
Trainable params: 520,420
Non-trainable params: 0
_________________________________________________________________


In [172]:
len(encoded_company_names), len(wdf)

(6869, 6869)

In [166]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

In [169]:
encoder = LabelEncoder()
encoder.fit(wdf.SICC)
encoded_Y = encoder.transform(wdf.SICC)
labels = np_utils.to_categorical(encoded_Y)
labels.shape

(6869, 420)

In [175]:
model.fit(encoded_company_names, labels, epochs=50)
loss, accuracy = model.evaluate(encoded_company_names, labels)
accuracy

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


0.84146163930954032

### Evaluation

In [None]:
df = pd.read_csv('./BasicCompanyDataAsOneFile-2018-03-01.csv', nrows=10000)
df.head()