# LSTM

### Importing packages

In [42]:
# Credits: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
#https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks

# LSTM for sequence classification in the IMDB dataset
import numpy
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import text,sequence
from tensorflow.keras.layers import Activation, Add, Bidirectional, Conv1D, Dense, Dropout, Embedding, Flatten, Reshape
from tensorflow.keras.layers import concatenate, GRU, Input, LSTM, MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D,  GlobalMaxPooling1D, SpatialDropout1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras.initializers import he_normal
from tensorflow.python.keras import backend as k
# fix random seed for reproducibility
numpy.random.seed(7)

import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

### Loading preprocessed data

In [43]:
project_data = pd.read_csv("preprocessed_data.csv")

In [44]:
project_data.head()

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,725.05
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,213.03
2,ca,mrs,grades_prek_2,10,1,literacy_language,literacy,having class 24 students comes diverse learner...,329.0
3,ga,mrs,grades_prek_2,2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,481.04
4,wa,mrs,grades_3_5,2,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,17.74


In [45]:
#Printing the attributes of project_data
print("Attributes :", project_data.columns.values)

Attributes : ['school_state' 'teacher_prefix' 'project_grade_category'
 'teacher_number_of_previously_posted_projects' 'project_is_approved'
 'clean_categories' 'clean_subcategories' 'essay' 'price']


### Resource data

In [46]:
resource_data = pd.read_csv('resources.csv')

In [47]:
resource_data.head(3)

Unnamed: 0,id,description,quantity,price
0,p233245,LC652 - Lakeshore Double-Space Mobile Drying Rack,1,149.0
1,p069063,Bouncy Bands for Desks (Blue support pipes),3,14.95
2,p069063,Cory Stories: A Kid's Book About Living With Adhd,1,8.45


In [48]:
# reference : https://stackoverflow.com/questions/22407798/how-to-reset-a-dataframes-indexes-for-all-groups-in-one-step
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
price_data.head(2)

Unnamed: 0,id,price,quantity
0,p000001,459.56,7
1,p000002,515.89,21


In [49]:
# join two dataframes(project_data and price_data) in python
# reference : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html
project_data['price'] = resource_data['price']
project_data['quantity'] = resource_data['quantity']

In [50]:
project_data.head(2)

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price,quantity
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,149.0,1
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,14.95,3


In [51]:
#numerical inputs
project_data['num'] = project_data['teacher_number_of_previously_posted_projects'] + project_data['price'] + project_data['quantity']

In [52]:
project_data.head()

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price,quantity,num
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,149.0,1,203.0
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,14.95,3,21.95
2,ca,mrs,grades_prek_2,10,1,literacy_language,literacy,having class 24 students comes diverse learner...,8.45,1,19.45
3,ga,mrs,grades_prek_2,2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,13.59,2,17.59
4,wa,mrs,grades_3_5,2,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,24.95,3,29.95


In [53]:
col = ['teacher_number_of_previously_posted_projects', 'price', 'quantity']

project_data.drop(labels=col,axis =1, inplace=True)

In [54]:
project_data.head()

Unnamed: 0,school_state,teacher_prefix,project_grade_category,project_is_approved,clean_categories,clean_subcategories,essay,num
0,ca,mrs,grades_prek_2,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,203.0
1,ut,ms,grades_3_5,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,21.95
2,ca,mrs,grades_prek_2,1,literacy_language,literacy,having class 24 students comes diverse learner...,19.45
3,ga,mrs,grades_prek_2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,17.59
4,wa,mrs,grades_3_5,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,29.95


In [55]:
y = project_data['project_is_approved'].values
project_data.drop(['project_is_approved'], axis=1, inplace=True)
X = project_data

print(X.shape)
print(y.shape)

(109248, 7)
(109248,)


### Data preparation

In [56]:
#splitting data
X_train,X_test,y_train,y_test = train_test_split(X, y , stratify = y, train_size = 0.7)
X_train,X_cv,y_train,y_cv = train_test_split(X_train,y_train,stratify = y_train,train_size = 0.7)

In [57]:
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)

(53531, 7) (53531,)
(22942, 7) (22942,)
(32775, 7) (32775,)


In [58]:
#converting class labels to categorical variables
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_cv = to_categorical(y_cv)

In [59]:
X_train.to_csv('train.csv')
X_cv.to_csv('cv.csv')
X_test.to_csv('test.csv')

### Creating embedding matrix using pretrain golve model

In [19]:
emb_dict = {}
glove = open('glove.42B.300d.txt', encoding="utf8")     
for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector
glove.close()

In [76]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
def embedding_matrix(word_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = emb_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

### Tokenizing and padding text data (essay)

In [211]:
from sklearn.feature_extraction.text import CountVectorizer
bag_of_words = CountVectorizer(lowercase= False)
features = []
bow_words = bag_of_words.fit_transform(X_train['essay'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['essay'] = rank

rank = []
for sent in X_test['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['essay'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['essay'] = rank

from keras.preprocessing.sequence import pad_sequences
max_review_length = 250
essay_train = pad_sequences(X_train['essay'], maxlen=max_review_length) 
essay_test = pad_sequences(X_test['essay'], maxlen=max_review_length)
essay_cv = pad_sequences(X_cv['essay'], maxlen=max_review_length)

In [212]:
essay_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0, 1447,   78,  155,    1,  542,
        317,    1,   88,  403,  109,  242,  804, 1623,    9,  206,    3,
          1,  106,  139,  242,  279, 2252,    4,   

### Tokenizing categorical data

#### 1. School_state

In [213]:
bow_words = bag_of_words.fit_transform(X_train['school_state'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
rank = [] 
for sent in X_train['school_state'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['school_state'] = rank

rank = []
for sent in X_test['school_state'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['school_state'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['school_state'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['school_state'] = rank

max_review_length = 1
state_train = pad_sequences(X_train['school_state'], maxlen=max_review_length) 
state_test = pad_sequences(X_test['school_state'], maxlen=max_review_length)
state_cv = pad_sequences(X_cv['school_state'], maxlen=max_review_length)

#### 2.Teacher_prefix

In [214]:
bow_words = bag_of_words.fit_transform(X_train['teacher_prefix'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1

rank = [] 
for sent in X_train['teacher_prefix'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['teacher_prefix'] = rank

rank = []
for sent in X_test['teacher_prefix'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['teacher_prefix'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['teacher_prefix'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['teacher_prefix'] = rank

max_review_length = 1
prefix_train = pad_sequences(X_train['teacher_prefix'], maxlen=max_review_length) 
prefix_test = pad_sequences(X_test['teacher_prefix'], maxlen=max_review_length)
prefix_cv = pad_sequences(X_cv['teacher_prefix'], maxlen=max_review_length)

#### 3. Project_grade_category

In [215]:
bow_words = bag_of_words.fit_transform(X_train['project_grade_category'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
rank = [] 
for sent in X_train['project_grade_category'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['project_grade_category'] = rank

rank = []
for sent in X_test['project_grade_category'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['project_grade_category'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['project_grade_category'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['project_grade_category'] = rank

max_review_length = 1
grade_train = pad_sequences(X_train['project_grade_category'], maxlen=max_review_length) 
grade_test = pad_sequences(X_test['project_grade_category'], maxlen=max_review_length)
grade_cv = pad_sequences(X_cv['project_grade_category'], maxlen=max_review_length)

#### 4. Clean_categories

In [216]:
bow_words = bag_of_words.fit_transform(X_train['clean_categories'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1

rank = [] 
for sent in X_train['clean_categories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['clean_categories'] = rank

rank = []
for sent in X_test['clean_categories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['clean_categories'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['clean_categories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['clean_categories'] = rank

max_review_length = 1
clean_cat_train = pad_sequences(X_train['clean_categories'], maxlen=max_review_length) 
clean_cat_test = pad_sequences(X_test['clean_categories'], maxlen=max_review_length)
clean_cat_cv = pad_sequences(X_cv['clean_categories'], maxlen=max_review_length)

#### 5. Clean_subcategories

In [217]:
bow_words = bag_of_words.fit_transform(X_train['clean_subcategories'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1

rank = [] 
for sent in X_train['clean_subcategories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['clean_subcategories'] = rank

rank = []
for sent in X_test['clean_subcategories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['clean_subcategories'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['clean_subcategories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['clean_subcategories'] = rank

max_review_length = 1
clean_subcat_train = pad_sequences(X_train['clean_subcategories'], maxlen=max_review_length) 
clean_subcat_test = pad_sequences(X_test['clean_subcategories'], maxlen=max_review_length)
clean_subcat_cv = pad_sequences(X_cv['clean_subcategories'], maxlen=max_review_length)

### Function for AUC Score

In [68]:
#https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
#https://developpaper.com/question/how-to-apply-the-custom-operation-of-py_func-in-tensorflow-to-keras/
def auc(y_true, y_pred) :
    score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'),
                        [y_true, y_pred],
                        'float32',
                        stateful=True,
                        name='sklearnAUC')
    return score

# --------------------------------------------  Model 1  ----------------------------------------------------

<img src='https://i.imgur.com/w395Yk9.png'>
ref: https://i.imgur.com/w395Yk9.png

In [79]:
from tensorflow.keras import optimizers
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.layers import BatchNormalization
from time import time
from tensorflow.python.keras.callbacks import TensorBoard

tf.keras.backend.clear_session()

#Essay input --> 1
essay = Input(shape=(250,), name="essay")
x1 = Embedding(input_dim=42533,output_dim=300,trainable=False,weights=[embedding_matrix(features[0],300)])(essay)
x1 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = Flatten()(x1)


#State input --> 2
state = Input(shape=(1,), name="state")
x2 = Embedding(input_dim=52,output_dim=2)(state)
x2 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x2)
x2 = SpatialDropout1D(0.3)(x2)
x2 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x2)
x2 = SpatialDropout1D(0.3)(x2)
x2 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x2)
x2 = SpatialDropout1D(0.3)(x2)
x2 = Flatten()(x2)

#Teacher prefix input --> 3
prefix = Input(shape=(1,), name="prefix")
x3 = Embedding(input_dim=5,output_dim=2)(prefix)
x3 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x3)
x3 = SpatialDropout1D(0.3)(x3)
x3 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x3)
x3 = SpatialDropout1D(0.3)(x3)
x3 = Flatten()(x3)

#Grade category input --> 4
grade = Input(shape=(1,), name="grade")
x4 = Embedding(input_dim=50,output_dim=2)(grade)
x4 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x4)
x4 = SpatialDropout1D(0.3)(x4)
x4 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x4)
x4 = SpatialDropout1D(0.3)(x4)
x4 = Flatten()(x4)

#Subject category input --> 5
subj_cat = Input(shape=(1,), name="subject_category")
x5 = Embedding(input_dim=385,output_dim=50)(subj_cat)
x5 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x5)
x5 = SpatialDropout1D(0.3)(x5)
x5 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x5)
x5 = SpatialDropout1D(0.3)(x5)
x5 = Flatten()(x5)

#Subject subcategory input --> 6
subj_subcat = Input(shape=(1,), name="subject_sub_category")
x6 = Embedding(input_dim=6,output_dim=5)(subj_subcat)
x6 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x6)
x6 = SpatialDropout1D(0.3)(x6)
x6 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x6)
x6 = SpatialDropout1D(0.3)(x6)
x6 = Flatten()(x6)

#Numerical input -->7
num = Input(shape=(1,), name="numerical")
x7 = (Dense(32, activation='relu',kernel_initializer=he_normal()))(num)
x7 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x7)
x7 = Dropout(0.3)(x7)
x7 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x7)
x7 = Dropout(0.3)(x7)


concat = concatenate([x1,x2,x3,x4,x5,x6,x7])

x = (Dense(32, activation='relu',kernel_initializer=he_normal()))(concat)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x)
output = (Dense(2, activation='softmax'))(x)
 

model = Model([essay,state,prefix,grade,subj_cat,subj_subcat,num], output)

#https://www.youtube.com/watch?v=2U6Jl7oqRkM
#Instantiating tensorboard for model visualization
#To visualize, run -  tensorboard --log_dir=logs/{} in command prompt
log_dir="logs/visualize/"
tensorboard = TensorBoard(log_dir.format(time()))

model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=[auc])

print(model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
state (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 2)         104         state[0][0]                      
__________________________________________________________________________________________________
essay (InputLayer)              [(None, 250)]        0                                            
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 1, 64)        192         embedding_1[0][0]                
______________________________________________________________________________________________

In [80]:
model.fit([essay_train,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']], y_train, epochs=20, verbose=1, batch_size=300, validation_data=([essay_cv,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']], y_cv), callbacks=[tensorboard])

Train on 53531 samples, validate on 22942 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x144598045c0>

#### Saving model weights

In [81]:
#https://machinelearningmastery.com/save-load-keras-deep-learning-models/
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model1.h5")

### Model visualization

In [82]:
print("Train AUC",roc_auc_score(y_train,(model.predict([essay_train,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']]))))
print("-"*50)
print("Cv AUC",roc_auc_score(y_cv,model.predict([essay_cv,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']])))
print("-"*50)
print("Test AUC",roc_auc_score(y_test,model.predict([essay_test,state_test,grade_test,prefix_test,clean_cat_test,clean_subcat_test,X_test['num']])))

Train AUC 0.7809139488148835
--------------------------------------------------
Cv AUC 0.7231819219387698
--------------------------------------------------
Test AUC 0.73341316458027


# --------------------------------------------  Model 2  ----------------------------------------------------

In [33]:
X_train = pd.read_csv('train.csv')
X_cv = pd.read_csv('cv.csv')
X_test = pd.read_csv('test.csv')

In [34]:
X_train.head(3)

Unnamed: 0.1,Unnamed: 0,school_state,teacher_prefix,project_grade_category,clean_categories,clean_subcategories,essay,num
0,42462,or,ms,grades_6_8,specialneeds,specialneeds,i privilege working amazing students although ...,406.06
1,33162,il,ms,grades_prek_2,appliedlearning,earlydevelopment,my little ones beyond special they diverse ric...,350.99
2,3065,la,ms,grades_3_5,literacy_language,literacy,i would really like students interested readin...,21.99


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
tfidf = TfidfVectorizer()
data_text = tfidf.fit_transform(X_train['essay'])
plt.boxplot(tfidf.idf_)
plt.ylabel("IDF score")

Text(0, 0.5, 'IDF score')

In [18]:
print("25 percentile (idf):", np.percentile(tfidf.idf_,[25]))
print("50 percentile (idf):",np.percentile(tfidf.idf_,[50]))
print("75 percentile (idf):",np.percentile(tfidf.idf_,[75]))
print("90 percentile (idf):",np.percentile(tfidf.idf_,[90]))

25 percentile (idf): [9.17998468]
50 percentile (idf): [10.50174052]
75 percentile (idf): [11.1948877]
90 percentile (idf): [11.1948877]


In [19]:
feat_idf_val = zip(tfidf.get_feature_names(),tfidf.idf_)

In [20]:
#Taking only those feature which have idf value between 25th and 75th percentile
feat = []
for f,val in feat_idf_val:
    if val>=9.31350907 and val<=11.32841209:
        feat.append(f)
    else:
        continue   

In [21]:
feat[0:5]

['001', '002', '00am', '00p', '00pm']

### Considering only those features with idf value between 25th and 75th percentile in 'project_essay'

In [None]:
train_essay = []
for text in X_train['essay']:
    sent = " "
    words = text.split()
    for word in words:
        if word in feat:
            sent = " "+word
        else:
            continue
    train_essay.append(sent)
X_train['essay'] = train_essay  

#Featurizing cv essay
cv_essay = []
for text in X_cv['essay']:
    sent = " "
    words = text.split()
    for word in words:
        if word in feat:
            sent = " "+word
        else:
            continue
    cv_essay.append(sent)
X_cv['essay'] = cv_essay  


#Featurizing test essay
test_essay = []
for text in X_test['essay']:
    sent = " "
    words = text.split()
    for word in words:
        if word in feat:
            sent = " "+word
        else:
            continue
    test_essay.append(sent)
X_test['essay'] = test_essay  

#### 1. Essay

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
bag_of_words = CountVectorizer(lowercase= False)
features = []
bow_words = bag_of_words.fit_transform(X_train['essay'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['essay'] = rank

rank = []
for sent in X_test['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['essay'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['essay'] = rank

from keras.preprocessing.sequence import pad_sequences
max_review_length = 250
essay_train_m2 = pad_sequences(X_train['essay'], maxlen=max_review_length) 
essay_test_m2 = pad_sequences(X_test['essay'], maxlen=max_review_length)
essay_cv_m2 = pad_sequences(X_cv['essay'], maxlen=max_review_length)

#### 1.School State

In [42]:
bow_words = bag_of_words.fit_transform(X_train['school_state'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['school_state'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['school_state'] = rank

rank = []
for sent in X_test['school_state'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['school_state'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['school_state'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['school_state'] = rank

max_review_length = 1
state_train = pad_sequences(X_train['school_state'], maxlen=max_review_length) 
state_test = pad_sequences(X_test['school_state'], maxlen=max_review_length)
state_cv = pad_sequences(X_cv['school_state'], maxlen=max_review_length)

#### 2.Teacher_prefix

In [43]:
bow_words = bag_of_words.fit_transform(X_train['teacher_prefix'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['teacher_prefix'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['teacher_prefix'] = rank

rank = []
for sent in X_test['teacher_prefix'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['teacher_prefix'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['teacher_prefix'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['teacher_prefix'] = rank

max_review_length = 1
prefix_train = pad_sequences(X_train['teacher_prefix'], maxlen=max_review_length) 
prefix_test = pad_sequences(X_test['teacher_prefix'], maxlen=max_review_length)
prefix_cv = pad_sequences(X_cv['teacher_prefix'], maxlen=max_review_length)

#### 3. Project_grade_category

In [44]:
bow_words = bag_of_words.fit_transform(X_train['project_grade_category'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['project_grade_category'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['project_grade_category'] = rank

rank = []
for sent in X_test['project_grade_category'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['project_grade_category'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['project_grade_category'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['project_grade_category'] = rank

max_review_length = 1
grade_train = pad_sequences(X_train['project_grade_category'], maxlen=max_review_length) 
grade_test = pad_sequences(X_test['project_grade_category'], maxlen=max_review_length)
grade_cv = pad_sequences(X_cv['project_grade_category'], maxlen=max_review_length)

#### 4. Clean_categories

In [45]:
bow_words = bag_of_words.fit_transform(X_train['clean_categories'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['clean_categories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['clean_categories'] = rank

rank = []
for sent in X_test['clean_categories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['clean_categories'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['clean_categories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['clean_categories'] = rank

max_review_length = 1
clean_cat_train = pad_sequences(X_train['clean_categories'], maxlen=max_review_length) 
clean_cat_test = pad_sequences(X_test['clean_categories'], maxlen=max_review_length)
clean_cat_cv = pad_sequences(X_cv['clean_categories'], maxlen=max_review_length)

#### 5. Clean_subcategories

In [46]:
bow_words = bag_of_words.fit_transform(X_train['clean_subcategories'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['clean_subcategories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['clean_subcategories'] = rank

rank = []
for sent in X_test['clean_subcategories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['clean_subcategories'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['clean_subcategories'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['clean_subcategories'] = rank

max_review_length = 1
clean_subcat_train = pad_sequences(X_train['clean_subcategories'], maxlen=max_review_length) 
clean_subcat_test = pad_sequences(X_test['clean_subcategories'], maxlen=max_review_length)
clean_subcat_cv = pad_sequences(X_cv['clean_subcategories'], maxlen=max_review_length)

In [47]:
from tensorflow.keras import optimizers
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.layers import BatchNormalization
from time import time
from tensorflow.python.keras.callbacks import TensorBoard

tf.keras.backend.clear_session()

#Essay input --> 1
essay = Input(shape=(250,1), name="essay")
x1 = Embedding(input_dim=17508,output_dim=300,trainable=False,weights=[embedding_matrix(features[0],300)])(essay)
x1 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x1)
x1 = Flatten()(x1)


#State input --> 2
state = Input(shape=(1,), name="state")
x2 = Embedding(input_dim=52,output_dim=2)(state)
x2 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x2)
x2 = SpatialDropout1D(0.3)(x2)
x2 = Flatten()(x2)

#Teacher prefix input --> 3
prefix = Input(shape=(1,), name="prefix")
x3 = Embedding(input_dim=5,output_dim=2)(prefix)
x3 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x3)
x3 = SpatialDropout1D(0.3)(x3)
x3 = Flatten()(x3)

#Grade category input --> 4
grade = Input(shape=(1,), name="grade")
x4 = Embedding(input_dim=50,output_dim=2)(grade)
x4 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x4)
x4 = SpatialDropout1D(0.3)(x4)

x4 = Flatten()(x4)

#Subject category input --> 5
subj_cat = Input(shape=(1,), name="subject_category")
x5 = Embedding(input_dim=385,output_dim=50)(subj_cat)
x5 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x5)
x5 = SpatialDropout1D(0.3)(x5)
x5 = Flatten()(x5)

#Subject subcategory input --> 6
subj_subcat = Input(shape=(1,), name="subject_sub_category")
x6 = Embedding(input_dim=6,output_dim=5)(subj_subcat)
x6 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x6)

x6 = Flatten()(x6)

#Numerical input -->7
num = Input(shape=(1,), name="numerical")
x7 = (Dense(32, activation='relu',kernel_initializer=he_normal()))(num)
x7 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x7)
x7 = Dropout(0.3)(x7)



concat = concatenate([x1,x2,x3,x4,x5,x6,x7])

x = (Dense(32, activation='relu',kernel_initializer=he_normal()))(concat)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x)

output = (Dense(2, activation='softmax'))(x)
 

model = Model([essay,state,prefix,grade,subj_cat,subj_subcat,num], output)

#https://www.youtube.com/watch?v=2U6Jl7oqRkM
#Instantiating tensorboard for model visualization
#To visualize, run -  tensorboard --log_dir=logs/{} in command prompt
log_dir="logs/visualize_m2/"
tensorboard = TensorBoard(log_dir.format(time()))

model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=[auc])

print(model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
state (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
prefix (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
grade (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
subject_category (InputLayer)   [(None, 1)]          0                                            
______________________________________________________________________________________________

In [48]:
model.fit([essay_train_m2,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']], y_train, epochs=20, verbose=1, batch_size=300, validation_data=([essay_cv_m2,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']], y_cv), callbacks=[tensorboard])

Train on 53531 samples, validate on 22942 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x18a72b9c358>

#### Saving model weights

In [49]:
#https://machinelearningmastery.com/save-load-keras-deep-learning-models/
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model2.h5")

### Model visualization

In [50]:
print("Train AUC",roc_auc_score(y_train,(model.predict([essay_train_m2,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']]))))
print("-"*50)
print("Cv AUC",roc_auc_score(y_cv,model.predict([essay_cv_m2,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']])))
print("-"*50)
print("Test AUC",roc_auc_score(y_test,model.predict([essay_test_m2,state_test,grade_test,prefix_test,clean_cat_test,clean_subcat_test,X_test['num']])))

Train AUC 0.6774337552953027
--------------------------------------------------
Cv AUC 0.5439057084835437
--------------------------------------------------
Test AUC 0.5417117791316319


# --------------------------------------------  Model 3  ----------------------------------------------------

### One-hot encoding categorical features

In [219]:
X_train = pd.read_csv('train.csv')
X_cv = pd.read_csv('cv.csv')
X_test = pd.read_csv('test.csv')

In [220]:
from sklearn.feature_extraction.text import CountVectorizer
bag_of_words = CountVectorizer(lowercase= False)
features = []
bow_words = bag_of_words.fit_transform(X_train['essay'])
freqs = bow_words.sum(axis=0).A1
index = freqs.argsort()
words = bag_of_words.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)
rank = [] 
for sent in X_train['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_train['essay'] = rank

rank = []
for sent in X_test['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_test['essay'] = rank

rank = [] # list of all the review with words replaced with rank
for sent in X_cv['essay'].values:
    txt_row = []
    for word in sent.split():
        if word in word_rank.keys():
            txt_row.append(word_rank[word])
        else:
            pass
    rank.append(txt_row)
X_cv['essay'] = rank

from keras.preprocessing.sequence import pad_sequences
max_review_length = 250
essay_train = pad_sequences(X_train['essay'], maxlen=max_review_length) 
essay_test = pad_sequences(X_test['essay'], maxlen=max_review_length)
essay_cv = pad_sequences(X_cv['essay'], maxlen=max_review_length)

In [221]:
print(essay_train.shape)
essay_train[0]

(53531, 250)


array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0, 1447,   78,  155,    1,  542,
        317,    1,   88,  403,  109,  242,  804, 1623,    9,  206,    3,
          1,  106,  139,  242,  279, 2252,    4,   

In [229]:
token = CountVectorizer()

school_state_train = (token.fit_transform(X_train['school_state'])).toarray()
school_state_test = (token.transform(X_test['school_state'])).toarray()
school_state_cv = (token.transform(X_cv['school_state'])).toarray()

print(school_state_train.shape)

(53531, 51)


In [230]:
prefix_train = token.fit_transform(X_train['teacher_prefix']).toarray()
prefix_cv = token.transform(X_cv['teacher_prefix']).toarray()
prefix_test = token.transform(X_test['teacher_prefix']).toarray()
print(prefix_train.shape)

(53531, 5)


In [231]:
grade_train = token.fit_transform(X_train['project_grade_category']).toarray()
grade_cv = token.transform(X_cv['project_grade_category']).toarray()
grade_test = token.transform(X_test['project_grade_category']).toarray()
print(grade_train.shape)

(53531, 4)


In [232]:
cat_train = token.fit_transform(X_train['clean_categories']).toarray()
cat_cv = token.transform(X_cv['clean_categories']).toarray()
cat_test = token.transform(X_test['clean_categories']).toarray()
print(cat_train.shape)

(53531, 9)


In [236]:
subcat_train = token.fit_transform(X_train['clean_subcategories']).toarray()
subcat_cv = token.transform(X_cv['clean_subcategories']).toarray()
subcat_test = token.transform(X_test['clean_subcategories']).toarray()
print(subcat_train.shape)

(53531, 30)


In [237]:
train_num = X_train['num'].values.reshape(-1,1)
cv_num = X_cv['num'].values.reshape(-1,1)
test_num = X_test['num'].values.reshape(-1,1)
print(train_num.shape)
print(cv_num.shape)
print(test_num.shape)

(53531, 1)
(22942, 1)
(32775, 1)


In [238]:
cat_num_train_feat = np.hstack((school_state_train,prefix_train,grade_train,cat_train,subcat_train,train_num))
cat_num_cv_feat = np.hstack((school_state_cv,prefix_cv,grade_cv,cat_cv,subcat_cv,cv_num))
cat_num_test_feat = np.hstack((school_state_test,prefix_test,grade_test,cat_test,subcat_test,test_num))
print(cat_num_train_feat.shape)
print(cat_num_cv_feat.shape)
print(cat_num_test_feat.shape)

(53531, 100)
(22942, 100)
(32775, 100)


In [239]:
cat_num_train_feat = np.resize(cat_num_train_feat, new_shape=(53531,100,1))
cat_num_cv_feat = np.resize(cat_num_cv_feat, new_shape=(22942,100,1))
cat_num_test_feat = np.resize(cat_num_test_feat, new_shape=(32775,100,1))

<img src='https://i.imgur.com/fkQ8nGo.png'>
ref: https://i.imgur.com/fkQ8nGo.png

In [242]:
tf.keras.backend.clear_session()

# input 1
essay = Input(batch_shape=(None,250), name="essay_input")
x1 = Embedding(input_dim=42533,output_dim = 300,weights=[embedding_matrix(features[0],300)],trainable = False)(essay)
x1 = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = (Dense(128, activation='relu',kernel_initializer=he_normal()))(x1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = Flatten()(x1)

# input 2
other = Input(shape=(100,1),name="other_input")
x2 = Conv1D(filters=64,kernel_size=2,strides=1)(other)
x2 = Conv1D(filters=128,kernel_size=2,strides=1)(x2)
x2 = Dropout(0.3)(x2)
x2 = Conv1D(filters=512,kernel_size=2,strides=1)(x2)
x2 = Flatten()(x2)

concat = concatenate([x1,x2])

x = (Dense(32, activation='relu',kernel_initializer=he_normal()))(concat)
x = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x)
x = (Dense(128, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = (Dense(512, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
x = (Dense(512, activation='relu',kernel_initializer=he_normal()))(x)

output = (Dense(2, activation='softmax'))(x)
model = Model([essay,other], output)

#To visualize, run -  tensorboard --log_dir=logs/ in command prompt
log_dir="logs/visualize_m3/"
tensorboard = TensorBoard(log_dir.format(time()))
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=[auc])
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
essay_input (InputLayer)        [(None, 250)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 250, 300)     12759900    essay_input[0][0]                
__________________________________________________________________________________________________
other_input (InputLayer)        [(None, 100, 1)]     0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 250, 64)      19264       embedding[0][0]                  
______________________________________________________________________________________________

In [243]:
model.fit([essay_train,cat_num_train_feat], y_train, epochs=20, verbose=1, batch_size=300, validation_data=([essay_cv,cat_num_cv_feat], y_cv), callbacks=[tensorboard])

Train on 53531 samples, validate on 22942 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1445e90f2b0>

#### Saving model weights

In [None]:
#https://machinelearningmastery.com/save-load-keras-deep-learning-models/
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model3.h5")

### Model visualization

In [None]:
print("Train AUC",roc_auc_score(y_train,(model.predict([essay_train_m2,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']]))))
print("-"*50)
print("Cv AUC",roc_auc_score(y_cv,model.predict([essay_cv_m2,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']])))
print("-"*50)
print("Test AUC",roc_auc_score(y_test,model.predict([essay_test_m2,state_test,grade_test,prefix_test,clean_cat_test,clean_subcat_test,X_test['num']])))