In [8]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
import random
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional
import config
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import classification_report
import pickle 
import sys

np.random.seed(1234)
tf.random.set_seed(1234)
random.seed(1234)

In [9]:
def trainTestSplit(df,n):
    
    df1 = df['ASSET_CLASS'].value_counts().rename_axis('Assets').reset_index(name = 'counts')
    df_new = df1[df1['counts']>=n] # Train Test split 75% - train
    assets = list(df_new['Assets'])
    dffiltered = df[df['ASSET_CLASS'].isin(assets)]
    dffiltered['ASSET_CLASS_CODES'] = pd.Categorical(dffiltered['ASSET_CLASS'])
    dffiltered['ASSET_CLASS_CODES'] = dffiltered['ASSET_CLASS_CODES'].cat.codes
    
    x = dffiltered['SPELL_CORRECTED']
    y = pd.get_dummies(dffiltered['ASSET_CLASS_CODES']) 

    #buliding mapping dict from codes to Asset Classes
    indexes_y = y.drop_duplicates().index
    asset_classes = dffiltered.loc[indexes_y,"ASSET_CLASS"].values
    asset_class_codes = dffiltered.loc[indexes_y,"ASSET_CLASS_CODES"].values
    code_asset_class_mapping_dict = dict(zip(asset_class_codes,asset_classes))
    
    with open(config.code_asset_class_mapping_dict, 'wb') as f:
        pickle.dump(code_asset_class_mapping_dict, f)
    
    #sanity check to determine whether the codes are being mapped correctly to the asset classes
    #y_stack = y.stack()
    #print(pd.Series(pd.Categorical(y_stack[y_stack!=0].index.get_level_values(1))))
    
    X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.20, stratify = y)
    print(' Number of Assets ' + str(len(set(list(dffiltered['ASSET_CLASS'])))))
    return X_train, X_test,  Y_train, Y_test

In [10]:
def generateEmbeddingIndex():
    print('Indexing word vectors.')
    embeddings_index = {}
    with open((config.utils_dir+config.glove_txt_300d)) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, 'f', sep=' ')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index

In [11]:
def bilstm(X_train, X_test, Y_train, Y_test,wordembeddings):
    np.random.seed(1234)
    tf.random.set_seed(1234)
    random.seed(1234)
    
    max_length_sentence = X_train.str.split().str.len().max()
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',lower=True)
    tokenizer.fit_on_texts(X_train)
    word_index = tokenizer.word_index
    
    #pickle dump word_index dictionary
    with open(config.word_index_lstm, 'wb') as f:
        pickle.dump(word_index, f)
    
    #pickle dump max_length_sentence
    with open(config.max_length_sentence_lstm, 'wb') as f:
        pickle.dump(max_length_sentence, f)
    
    with open(config.tokenizer_lstm, 'wb') as f:
        pickle.dump(tokenizer, f)
    
    
    EMBEDDING_DIM=300
    vocabulary_size=len(word_index)+1
    print('Found %s unique tokens.' % len(word_index))
    
    sequences_train = tokenizer.texts_to_sequences(X_train)
    sequences_valid=tokenizer.texts_to_sequences(X_test)
    X_train = pad_sequences(sequences_train,maxlen=max_length_sentence)
    X_val = pad_sequences(sequences_valid,maxlen=X_train.shape[1])
    y_train = np.asarray(Y_train)
    y_val = np.asarray(Y_test)
    #print(word_index)
    
    '''
    print('Shape of data tensor:', X_train.shape)
    print('Shape of data tensor:', X_val.shape)
    print('Shape of data tensor:', y_train.shape)
    print('Shape of data tensor:', y_val.shape)
    
    print(X_train)
    print("*"*100)
    print(X_val)
    print("*"*100)
    print(y_train)
    print("*"*100)
    print(y_val)
    '''
    
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if(word in wordembeddings.keys()):
            embedding_vector = wordembeddings[word]
            if len(embedding_vector)==0: #if array is empty
                embedding_vector = wordembeddings[word.title()]
                if len(embedding_vector)==0:
                    embedding_vector = wordembeddings[word.upper()]
                    if len(embedding_vector)==0:
                        embedding_vector = np.array([round(np.random.rand(),8) for i in range(0,300)])
                        
        else:
            #print("WORD NOT IN DICT",word)
            embedding_vector = np.array([round(np.random.rand(),8) for i in range(0,300)])
            
        if len(embedding_vector)!=0:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False) #Try with True
    
    
    inputs = Input(shape=(X_train.shape[1],))
    model = (Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_length_sentence,weights=[embedding_matrix]))(inputs)
    
    model = (LSTM(64))(model)
    model = (Dense(900, activation='relu'))(model)
    model = (Dense(400, activation='relu'))(model)
    model = (Dense(250, activation='relu'))(model)
    model = (Dense(204, activation='softmax'))(model)
    model = Model(inputs=inputs,outputs=model)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    callbacks = [EarlyStopping(monitor='val_loss')]
    hist_adam = model.fit(X_train, y_train, batch_size=1000, epochs=200, verbose=1, validation_data=(X_val, y_val),callbacks=callbacks)     #!!!!!!!!!!!!!!!!!!!!!!!CHANGE BATCH SIZE TO 1000 #change epochs to 200
    model.save(config.lstm_prepocessed_dataset1_chai)
    
    y_pred = model.predict(X_val)
    print(y_pred)
    
    y_val_class = pd.DataFrame(y_val).idxmax(axis=1)
    print(y_val_class)
    
    y_val_class_argmax = np.argmax(y_val,axis=1)
    y_pred_class_argmax = np.argmax(y_pred,axis=1)
    
    y_pred_class = pd.DataFrame(y_pred).idxmax(axis=1)
    print(y_pred_class)
    
    
    print(classification_report(y_val_class, y_pred_class))
    
    plt.suptitle('Optimizer : Adam', fontsize=10)
    plt.ylabel('Loss', fontsize=16)
    plt.xlabel('Epoch', fontsize=14)
    plt.plot(hist_adam.history['loss'], color='b', label='Training Loss')
    plt.plot(hist_adam.history['val_loss'], color='r', label='Validation Loss')
    plt.legend(loc='upper right')
    plt.savefig('/home/ubuntu/asset_classification/results/lstm_model_dataset1_preprocessed_chai.png')
    
    tf.keras.utils.plot_model(model, to_file=config.lstm_architecture, show_shapes=True)
    
    return(y_pred,y_val_class,y_pred_class,y_val_class_argmax,y_pred_class_argmax)
    

In [12]:
df = pd.read_csv(config.datasets_dir+config.final_preprocessed)
df.head()

Unnamed: 0,BUSINESS_UNIT,PSC_CODE,FUND_SUBOBJCLASS,OBJ_CODE,SUB_OBJ_DESCR,ORDER_DATE,ORDER_TITLE,LINE_DESCRIPTION,VENDOR_NAME,VENDOR_COUNTRY,COST,ASSET_CLASS,ASSET_CLASS_DESCRIPTION,text_fields,PROCESSED_TEXT_FIELDS,SPELL_CORRECTED
0,LOCATION 81,7290,4161,GRANTS/CONT/SUBSIDY,VALUE-ADDED TAXES,Mon Apr 22 2019 07:05:43 GMT-0400 (EDT),transformers warehouse location fap,transformers outlet made plastic case carrying...,RON SITON,ISR,700.6405,39300,TRANSFORMER,transformers warehouse location fap transforme...,transformers warehouse location fap transforme...,transformers warehouse location fap transforme...
1,LOCATION 81,7290,3123,EQUIPMENT,HOUSEHOLD FURNISHING,Mon Apr 22 2019 07:05:43 GMT-0400 (EDT),transformers warehouse location fap,transformers outlet made plastic case carrying...,RON SITON,ISR,4121.4146,39300,TRANSFORMER,transformers warehouse location fap transforme...,transformers warehouse location fap transforme...,transformers warehouse location fap transforme...
2,LOCATION 169,6120,4161,GRANTS/CONT/SUBSIDY,VALUE-ADDED TAXES,Thu Apr 25 2019 11:23:35 GMT-0400 (EDT),gso icass stepdown transformers fap use,stepdown transformer full loadable primary vol...,Cosmos International Building Materials LLC,ARE,447.1821,39300,TRANSFORMER,gso icass stepdown transformers fap use stepdo...,gso ass stepdown transformers fap use stepdown...,so ass stepson transformers fap use stepson tr...
3,LOCATION 169,6120,3123,EQUIPMENT,HOUSEHOLD FURNISHING,Thu Apr 25 2019 11:23:35 GMT-0400 (EDT),gso icass stepdown transformers fap use,stepdown transformer full loadable primary vol...,Cosmos International Building Materials LLC,ARE,8943.6428,39300,TRANSFORMER,gso icass stepdown transformers fap use stepdo...,gso ass stepdown transformers fap use stepdown...,so ass stepson transformers fap use stepson tr...
4,LOCATION 78,6120,2675,SUPPLIES & MATERIALS,RESIDENTIAL SUPPL/FU,Tue Mar 26 2019 14:53:15 GMT-0400 (EDT),transformer step down priority,kohler cie fabrication transformateurs transfo...,KOHLER & CIE,CHE,5790.5337,39300,TRANSFORMER,transformer step down priority kohler cie fabr...,transformer step down priority kohler cie fabr...,transformer step down priority kohler cie fabr...


In [13]:
df = df.replace(np.nan, '', regex = True)
df["SPELL_CORRECTED"].isnull().values.any()

False

In [14]:
X_train, X_test, Y_train, Y_test = trainTestSplit(df,100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dffiltered['ASSET_CLASS_CODES'] = pd.Categorical(dffiltered['ASSET_CLASS'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dffiltered['ASSET_CLASS_CODES'] = dffiltered['ASSET_CLASS_CODES'].cat.codes


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
Y_train.head()

In [None]:
Y_test.head()

In [None]:
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

In [None]:
wordembeddings = generateEmbeddingIndex()

In [None]:
y_pred,y_val_class,y_pred_class,y_val_class_argmax,y_pred_class_argmax = bilstm(X_train, X_test, Y_train, Y_test, wordembeddings)

In [None]:
y_val_class_argmax

In [None]:
y_val_class

In [None]:
y_pred_class

In [None]:
y_pred_class_argmax

In [None]:
plt.savefig('/home/ubuntu/asset_classification/results/lstm_model_dataset1_preprocessed_chai_new.png')