# LSTM

### Importing packages

In [1]:
# Credits: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
#https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks

# LSTM for sequence classification in the IMDB dataset
import numpy
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import text,sequence
from tensorflow.keras.layers import Activation, Add, Bidirectional, Conv1D, Dense, Dropout, Embedding, Flatten, Reshape
from tensorflow.keras.layers import concatenate, GRU, Input, CuDNNLSTM, MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D,  GlobalMaxPooling1D, SpatialDropout1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.regularizers import l2
from tensorflow.python.keras import backend as k
# fix random seed for reproducibility
numpy.random.seed(7)

import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


### Loading preprocessed data

In [2]:
project_data = pd.read_csv("preprocessed_data.csv")

In [3]:
project_data.head()

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,725.05
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,213.03
2,ca,mrs,grades_prek_2,10,1,literacy_language,literacy,having class 24 students comes diverse learner...,329.0
3,ga,mrs,grades_prek_2,2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,481.04
4,wa,mrs,grades_3_5,2,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,17.74


In [4]:
#Printing the attributes of project_data
print("Attributes :", project_data.columns.values)

Attributes : ['school_state' 'teacher_prefix' 'project_grade_category'
 'teacher_number_of_previously_posted_projects' 'project_is_approved'
 'clean_categories' 'clean_subcategories' 'essay' 'price']


### Resource data

In [5]:
resource_data = pd.read_csv('resources.csv')

In [6]:
resource_data.head(3)

Unnamed: 0,id,description,quantity,price
0,p233245,LC652 - Lakeshore Double-Space Mobile Drying Rack,1,149.0
1,p069063,Bouncy Bands for Desks (Blue support pipes),3,14.95
2,p069063,Cory Stories: A Kid's Book About Living With Adhd,1,8.45


In [7]:
# reference : https://stackoverflow.com/questions/22407798/how-to-reset-a-dataframes-indexes-for-all-groups-in-one-step
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
price_data.head(2)

Unnamed: 0,id,price,quantity
0,p000001,459.56,7
1,p000002,515.89,21


In [8]:
# join two dataframes(project_data and price_data) in python
# reference : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html
project_data['price'] = resource_data['price']
project_data['quantity'] = resource_data['quantity']

In [9]:
project_data.head(2)

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price,quantity
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,149.0,1
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,14.95,3


In [10]:
#numerical inputs
project_data['num'] = project_data['teacher_number_of_previously_posted_projects'] + project_data['price'] + project_data['quantity']

In [11]:
project_data.head()

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price,quantity,num
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,149.0,1,203.0
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,14.95,3,21.95
2,ca,mrs,grades_prek_2,10,1,literacy_language,literacy,having class 24 students comes diverse learner...,8.45,1,19.45
3,ga,mrs,grades_prek_2,2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,13.59,2,17.59
4,wa,mrs,grades_3_5,2,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,24.95,3,29.95


In [12]:
col = ['teacher_number_of_previously_posted_projects', 'price', 'quantity']

project_data.drop(labels=col,axis =1, inplace=True)

In [13]:
project_data.head()

Unnamed: 0,school_state,teacher_prefix,project_grade_category,project_is_approved,clean_categories,clean_subcategories,essay,num
0,ca,mrs,grades_prek_2,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,203.0
1,ut,ms,grades_3_5,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,21.95
2,ca,mrs,grades_prek_2,1,literacy_language,literacy,having class 24 students comes diverse learner...,19.45
3,ga,mrs,grades_prek_2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,17.59
4,wa,mrs,grades_3_5,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,29.95


In [14]:
y = project_data['project_is_approved']
project_data.drop(['project_is_approved'], axis=1, inplace=True)
X = project_data

print(X.shape)
print(y.shape)

(109248, 7)
(109248,)


### Data preparation

In [15]:
#splitting data
X_train,X_test,y_train,y_test = train_test_split(X, y , stratify = y, train_size = 0.7)
X_train,X_cv,y_train,y_cv = train_test_split(X_train,y_train,stratify = y_train,train_size = 0.7)

In [16]:
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)

(53531, 7) (53531,)
(22942, 7) (22942,)
(32775, 7) (32775,)


In [17]:
#converting class labels to categorical variables
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_cv = to_categorical(y_cv)

### Creating embedding matrix using pretrain golve model

In [18]:
emb_dict = {}
glove = open('glove.42B.300d.txt', encoding="utf8")     
for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector
glove.close()

In [19]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
def embedding_matrix(word_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = emb_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

### Tokenizing and padding text data (essay)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(lowercase= False)
features = []
essay_train = count.fit_transform(X_train['essay'])
freqs = essay_train.sum(axis=0).A1
index = freqs.argsort()
words = count.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)

from keras.preprocessing.text import Tokenizer
# define documents
train_doc = X_train['essay']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(train_doc)
vocab_size = len(t.word_index) + 1
essay_train = t.texts_to_sequences(train_doc)
test_doc = X_test['essay']
essay_test = t.texts_to_sequences(test_doc)
cv_doc = X_cv['essay']
essay_cv = t.texts_to_sequences(cv_doc)


from keras.preprocessing.sequence import pad_sequences
max_review_length = 250
essay_train = pad_sequences(essay_train, maxlen=max_review_length) 
essay_test = pad_sequences(essay_test, maxlen=max_review_length)
essay_cv = pad_sequences(essay_cv, maxlen=max_review_length)

In [22]:
essay_train[10]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    7, 1489,   70, 2136, 1473,    4,    1,    5,
       1489,  980, 1520,    4,    1,  576, 1699,  118,   12,  903, 2314,
        275,  316,  254, 2314,  275,  175,  427,    3,   30,  146,  183,
         37,   39,   88,    2,   49,  112,   37,  273,    3,  485,  414,
       4606,    1,  601,   64,  150,   83,   12,   22,   27, 1474,  302,
          4,    1,   19,   27,  471,  920,  571,  641, 2612,    4,    3,
       2369,   27,  207,    1,   57,    2, 3792,  363,  709,    9,  412,
         27,  597, 2624,   50,    3,    4,  146,   58,    6,   82,    1,
        358,   27,  183,  101,  597,  534,   26,    1,   69,  270,  183,
       1931,  980, 1520,  449, 2080, 2234,   26,   

### Tokenizing categorical data

#### 1. School_state

In [49]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# define documents
state_train_doc = X_train['school_state']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(state_train_doc)
vocab_size = len(t.word_index) + 1
state_train = t.texts_to_sequences(state_train_doc)
state_test_doc = X_test['school_state']
state_test = t.texts_to_sequences(state_test_doc)
state_cv_doc = X_cv['school_state']
state_cv = t.texts_to_sequences(state_cv_doc)

max_review_length = 1
state_train = pad_sequences(state_train, maxlen=max_review_length)
state_test = pad_sequences(state_test, maxlen=max_review_length)
state_cv = pad_sequences(state_cv, maxlen=max_review_length)

print(state_train.shape)

(53531, 1)


#### 2.Teacher_prefix

In [50]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# define documents
prefix_train_doc = X_train['teacher_prefix']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(prefix_train_doc)
prefix_size = len(t.word_index) + 1
prefix_train = t.texts_to_sequences(prefix_train_doc)
prefix_test_doc = X_test['teacher_prefix']
prefix_test = t.texts_to_sequences(prefix_test_doc)
prefix_cv_doc = X_cv['teacher_prefix']
prefix_cv = t.texts_to_sequences(prefix_cv_doc)

max_review_length = 1
prefix_train = pad_sequences(prefix_train, maxlen=max_review_length)
prefix_test = pad_sequences(prefix_test, maxlen=max_review_length)
prefix_cv = pad_sequences(prefix_cv, maxlen=max_review_length)

print(prefix_train.shape)

(53531, 1)


#### 3. Project_grade_category

In [51]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# define documents
grade_train_doc = X_train['project_grade_category']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(grade_train_doc)
vocab_size = len(t.word_index) + 1
grade_train = t.texts_to_sequences(grade_train_doc)
grade_test_doc = X_test['project_grade_category']
grade_test = t.texts_to_sequences(grade_test_doc)
grade_cv_doc = X_cv['project_grade_category']
grade_cv = t.texts_to_sequences(grade_cv_doc)

max_review_length = 1
grade_train = pad_sequences(grade_train, maxlen=max_review_length)
grade_test = pad_sequences(grade_test, maxlen=max_review_length)
grade_cv = pad_sequences(grade_cv, maxlen=max_review_length)

print(grade_train.shape)

(53531, 1)


#### 4. Clean_categories

In [52]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# define documents
clean_cat_train_doc = X_train['clean_categories']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(clean_cat_train_doc)
vocab_size = len(t.word_index) + 1
clean_cat_train = t.texts_to_sequences(clean_cat_train_doc)
clean_cat_test_doc = X_test['clean_categories']
clean_cat_test = t.texts_to_sequences(clean_cat_test_doc)
clean_cat_cv_doc = X_cv['clean_categories']
clean_cat_cv = t.texts_to_sequences(clean_cat_cv_doc)

max_review_length = 1
clean_cat_train = pad_sequences(clean_cat_train, maxlen=max_review_length)
clean_cat_test = pad_sequences(clean_cat_test, maxlen=max_review_length)
clean_cat_cv = pad_sequences(clean_cat_cv, maxlen=max_review_length)

print(clean_cat_train.shape)

(53531, 1)


#### 5. Clean_subcategories

In [53]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# define documents
clean_subcat_train_doc = X_train['clean_subcategories']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(clean_subcat_train_doc)
vocab_size = len(t.word_index) + 1
clean_subcat_train = t.texts_to_sequences(clean_subcat_train_doc)
clean_subcat_test_doc = X_test['clean_subcategories']
clean_subcat_test = t.texts_to_sequences(clean_subcat_test_doc)
clean_subcat_cv_doc = X_cv['clean_subcategories']
clean_subcat_cv = t.texts_to_sequences(clean_subcat_cv_doc)

max_review_length = 1
clean_subcat_train = pad_sequences(clean_subcat_train, maxlen=max_review_length)
clean_subcat_test = pad_sequences(clean_subcat_test, maxlen=max_review_length)
clean_subcat_cv = pad_sequences(clean_subcat_cv, maxlen=max_review_length)

print(clean_subcat_train.shape)

(53531, 1)


### Function for AUC Score

In [20]:
#https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
#https://developpaper.com/question/how-to-apply-the-custom-operation-of-py_func-in-tensorflow-to-keras/
def auc(y_true, y_pred) :
    score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'),
                        [y_true, y_pred],
                        'float32',
                        stateful=True,
                        name='sklearnAUC')
    return score

# --------------------------------------------  Model 1  ----------------------------------------------------

<img src='https://i.imgur.com/w395Yk9.png'>
ref: https://i.imgur.com/w395Yk9.png

In [29]:
from tensorflow.keras import optimizers
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.layers import BatchNormalization
from time import time
from tensorflow.python.keras.callbacks import TensorBoard

tf.keras.backend.clear_session()

#Essay input --> 1
essay = Input(shape=(250,), name="essay")
x1 = Embedding(input_dim=42533,output_dim=300,trainable=False,weights=[embedding_matrix(features[0],300)])(essay)
x1 = CuDNNLSTM(128,return_sequences=True)(x1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = Flatten()(x1)


#State input --> 2
state = Input(shape=(1,), name="state")
x2 = Embedding(input_dim=52,output_dim=2)(state)
x2 = Flatten()(x2)

#Teacher prefix input --> 3
prefix = Input(shape=(1,), name="prefix")
x3 = Embedding(input_dim=5,output_dim=2)(prefix)
x3 = Flatten()(x3)

#Grade category input --> 4
grade = Input(shape=(1,), name="grade")
x4 = Embedding(input_dim=50,output_dim=2)(grade)
x4 = Flatten()(x4)

#Subject category input --> 5
subj_cat = Input(shape=(1,), name="subject_category")
x5 = Embedding(input_dim=385,output_dim=50)(subj_cat)
x5 = Flatten()(x5)

#Subject subcategory input --> 6
subj_subcat = Input(shape=(1,), name="subject_sub_category")
x6 = Embedding(input_dim=6,output_dim=5)(subj_subcat)
x6 = Flatten()(x6)

#Numerical input -->7
num = Input(shape=(1,), name="numerical")
x7 = (Dense(32, activation='relu',kernel_initializer=he_normal()))(num)



concat = concatenate([x1,x2,x3,x4,x5,x6,x7])

x = (Dense(128, activation='relu',kernel_initializer=he_normal()))(concat)
x = Dropout(0.3)(x)
x = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = (Dense(32, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
output = (Dense(2, activation='softmax'))(x)
 

model = Model([essay,state,prefix,grade,subj_cat,subj_subcat,num], output)

#https://www.youtube.com/watch?v=2U6Jl7oqRkM
#Instantiating tensorboard for model visualization
#To visualize, run -  tensorboard --log_dir=logs/{} in command prompt
tensorboard = TensorBoard(log_dir="logs/".format(time))

model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=[auc])

print(model.summary())

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    
Model: "model"
__________________________________________________________________________________________________
Layer (type)  

In [30]:
model.fit([essay_train,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']], y_train, epochs=15, verbose=1, batch_size=300, validation_data=([essay_cv,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']], y_cv), callbacks=[tensorboard])

Train on 53531 samples, validate on 22942 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x224d0c665c0>

#### Saving model weights

In [31]:
#https://machinelearningmastery.com/save-load-keras-deep-learning-models/
model.save_weights("model1.h5")

#### Model visualization

In [32]:
print("Train AUC",roc_auc_score(y_train,(model.predict([essay_train,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']]))))
print("-"*50)
print("Cv AUC",roc_auc_score(y_cv,model.predict([essay_cv,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']])))
print("-"*50)
print("Test AUC",roc_auc_score(y_test,model.predict([essay_test,state_test,grade_test,prefix_test,clean_cat_test,clean_subcat_test,X_test['num']])))

Train AUC 0.889562317498128
--------------------------------------------------
Cv AUC 0.6936691179680007
--------------------------------------------------
Test AUC 0.7058652426489026


<img src="model1.png">

In [35]:
'''# Load TENSORBOARD
%load_ext tensorboard
# Start TENSORBOARD
%tensorboard --logdir logs'''

'# Load TENSORBOARD\n%load_ext tensorboard\n# Start TENSORBOARD\n%tensorboard --logdir logs'

# --------------------------------------------  Model 2  ----------------------------------------------------

In [21]:
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)

(53531, 7) (53531, 2)
(22942, 7) (22942, 2)
(32775, 7) (32775, 2)


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
tfidf = TfidfVectorizer()
data_text = tfidf.fit_transform(X_train['essay'])
plt.boxplot(tfidf.idf_)
plt.ylabel("IDF score")

Text(0, 0.5, 'IDF score')

In [23]:
print("25 percentile (idf):", np.percentile(tfidf.idf_,[25]))
print("50 percentile (idf):",np.percentile(tfidf.idf_,[50]))
print("75 percentile (idf):",np.percentile(tfidf.idf_,[75]))
print("90 percentile (idf):",np.percentile(tfidf.idf_,[90]))

25 percentile (idf): [9.17998468]
50 percentile (idf): [10.50174052]
75 percentile (idf): [11.1948877]
90 percentile (idf): [11.1948877]


In [24]:
feat_idf_val = zip(tfidf.get_feature_names(),tfidf.idf_)

In [25]:
feat = []
for f,val in feat_idf_val:
    if val>=2 and val<=10:
        feat.append(f)
    else:
        continue   

### Considering only those features with idf value between 25th and 75th percentile in 'project_essay'

In [None]:
train_essay = []
for text in X_train['essay']:
    sent = " "
    words = text.split()
    for word in words:
        if word in feat:
            sent = " "+word
        else:
            continue
    train_essay.append(sent)
X_train['essay'] = train_essay  

#Featurizing cv essay
cv_essay = []
for text in X_cv['essay']:
    sent = " "
    words = text.split()
    for word in words:
        if word in feat:
            sent = " "+word
        else:
            continue
    cv_essay.append(sent)
X_cv['essay'] = cv_essay  


#Featurizing test essay
test_essay = []
for text in X_test['essay']:
    sent = " "
    words = text.split()
    for word in words:
        if word in feat:
            sent = " "+word
        else:
            continue
    test_essay.append(sent)
X_test['essay'] = test_essay  

#### 1. Essay

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(lowercase= False)
features = []
essay_train = count.fit_transform(X_train['essay'])
freqs = essay_train.sum(axis=0).A1
index = freqs.argsort()
words = count.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)

from keras.preprocessing.text import Tokenizer
# define documents
train_doc = X_train['essay']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(train_doc)
vocab_size = len(t.word_index) + 1
essay_train = t.texts_to_sequences(train_doc)
test_doc = X_test['essay']
essay_test = t.texts_to_sequences(test_doc)
cv_doc = X_cv['essay']
essay_cv = t.texts_to_sequences(cv_doc)


from keras.preprocessing.sequence import pad_sequences
max_review_length = 250
essay_train = pad_sequences(essay_train, maxlen=max_review_length) 
essay_test = pad_sequences(essay_test, maxlen=max_review_length)
essay_cv = pad_sequences(essay_cv, maxlen=max_review_length)

In [None]:
from tensorflow.keras import optimizers
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.layers import BatchNormalization
from time import time
from tensorflow.python.keras.callbacks import TensorBoard

tf.keras.backend.clear_session()

#Essay input --> 1
essay = Input(shape=(250,), name="essay")
x1 = Embedding(input_dim=4908,output_dim=300,trainable=False,weights=[embedding_matrix(features[0],300)])(essay)
x1 = CuDNNLSTM(128,return_sequences=True)(x1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = Flatten()(x1)


#State input --> 2
state = Input(shape=(1,), name="state")
x2 = Embedding(input_dim=52,output_dim=2)(state)
x2 = Flatten()(x2)

#Teacher prefix input --> 3
prefix = Input(shape=(1,), name="prefix")
x3 = Embedding(input_dim=5,output_dim=2)(prefix)
x3 = Flatten()(x3)

#Grade category input --> 4
grade = Input(shape=(1,), name="grade")
x4 = Embedding(input_dim=50,output_dim=2)(grade)
x4 = Flatten()(x4)

#Subject category input --> 5
subj_cat = Input(shape=(1,), name="subject_category")
x5 = Embedding(input_dim=385,output_dim=50)(subj_cat)
x5 = Flatten()(x5)

#Subject subcategory input --> 6
subj_subcat = Input(shape=(1,), name="subject_sub_category")
x6 = Embedding(input_dim=6,output_dim=5)(subj_subcat)
x6 = Flatten()(x6)

#Numerical input -->7
num = Input(shape=(1,), name="numerical")
x7 = (Dense(32, activation='relu',kernel_initializer=he_normal()))(num)



concat = concatenate([x1,x2,x3,x4,x5,x6,x7])

x = (Dense(128, activation='relu',kernel_initializer=he_normal()))(concat)
x = Dropout(0.3)(x)
x = (Dense(64, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = (Dense(32, activation='relu',kernel_initializer=he_normal()))(x)
x = Dropout(0.3)(x)
output = (Dense(2, activation='softmax'))(x)
 

model = Model([essay,state,prefix,grade,subj_cat,subj_subcat,num], output)

#https://www.youtube.com/watch?v=2U6Jl7oqRkM
#Instantiating tensorboard for model visualization
#To visualize, run -  tensorboard --log_dir=logs/{} in command prompt
tensorboard = TensorBoard(log_dir="logs/".format(time))

model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=[auc])

print(model.summary())

In [None]:
model.fit([essay_train,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']], y_train, epochs=20, verbose=1, batch_size=300, validation_data=([essay_cv,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']], y_cv),callbacks=[tensorboard])

Train on 53531 samples, validate on 22942 samples
Epoch 1/20


#### Saving model weights

In [48]:
#https://machinelearningmastery.com/save-load-keras-deep-learning-models/

model.save_weights("model2.h5")

#### Model visualization

In [49]:
print("Train AUC",roc_auc_score(y_train,(model.predict([essay_train,state_train,grade_train,prefix_train,clean_cat_train,clean_subcat_train,X_train['num']]))))
print("-"*50)
print("Cv AUC",roc_auc_score(y_cv,model.predict([essay_cv,state_cv,grade_cv,prefix_cv,clean_cat_cv,clean_subcat_cv,X_cv['num']])))
print("-"*50)
print("Test AUC",roc_auc_score(y_test,model.predict([essay_test,state_test,grade_test,prefix_test,clean_cat_test,clean_subcat_test,X_test['num']])))

Train AUC 0.5289546532322853
--------------------------------------------------
Cv AUC 0.5347986063722183
--------------------------------------------------
Test AUC 0.5250754856033888


In [51]:
'''# Load TENSORBOARD
%load_ext tensorboard
# Start TENSORBOARD
%tensorboard --logdir logs'''

'# Load TENSORBOARD\n%load_ext tensorboard\n# Start TENSORBOARD\n%tensorboard --logdir logs'

<img src="model2.png">

# --------------------------------------------  Model 3  ----------------------------------------------------

In [34]:
#splitting data
X_train,X_test,y_train,y_test = train_test_split(X, y , stratify = y, train_size = 0.7)
X_train,X_cv,y_train,y_cv = train_test_split(X_train,y_train,stratify = y_train,train_size = 0.7)

In [35]:
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)

(53531, 7) (53531,)
(22942, 7) (22942,)
(32775, 7) (32775,)


In [36]:
#converting class labels to categorical variables
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_cv = to_categorical(y_cv)

In [37]:
#splitting data
X_train,X_test,y_train,y_test = train_test_split(X, y , stratify = y, train_size = 0.7)
X_train,X_cv,y_train,y_cv = train_test_split(X_train,y_train,stratify = y_train,train_size = 0.7)

print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)

#converting class labels to categorical variables
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_cv = to_categorical(y_cv)

(53531, 7) (53531,)
(22942, 7) (22942,)
(32775, 7) (32775,)


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(lowercase= False)
features = []
essay_train = count.fit_transform(X_train['essay'])
freqs = essay_train.sum(axis=0).A1
index = freqs.argsort()
words = count.get_feature_names()

word_rank = dict()
rank = 1
for i in index[::-1]:
    k = words[i]
    word_rank[k] = rank
    rank+=1
features.append(word_rank)

from keras.preprocessing.text import Tokenizer
# define documents
train_doc = X_train['essay']
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(train_doc)
vocab_size = len(t.word_index) + 1
essay_train = t.texts_to_sequences(train_doc)
test_doc = X_test['essay']
essay_test = t.texts_to_sequences(test_doc)
cv_doc = X_cv['essay']
essay_cv = t.texts_to_sequences(cv_doc)


from keras.preprocessing.sequence import pad_sequences
max_review_length = 250
essay_train = pad_sequences(essay_train, maxlen=max_review_length) 
essay_test = pad_sequences(essay_test, maxlen=max_review_length)
essay_cv = pad_sequences(essay_cv, maxlen=max_review_length)

In [43]:
print(essay_train.shape)
essay_train[0]

(53531, 250)


array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,  106,   76,   91,  135,   52,  134,    3,  172,    1,  614,
        291,  129,    6,  256,   12,  129,  204,    2,   34,   73, 6738,
        708,  350,  369,   43,  869,  258,    9,  469,  474,   10,  111,
        110,  474,  101,  469,  745,  294,   74,    9, 1149,  672,  700,
        749,   61,    2,  151,   21,  763,  173,   10,   30,  269,  118,
          5,  127, 1267,  334,   67,  207,  160,   27,    2,  137,   21,
         73,  382,   74,  192,   28, 2872,    3,  7

In [44]:
token = CountVectorizer()

school_state_train = (token.fit_transform(X_train['school_state'])).toarray()
school_state_test = (token.transform(X_test['school_state'])).toarray()
school_state_cv = (token.transform(X_cv['school_state'])).toarray()

print(school_state_train.shape)

(53531, 51)


In [45]:
prefix_train = token.fit_transform(X_train['teacher_prefix']).toarray()
prefix_cv = token.transform(X_cv['teacher_prefix']).toarray()
prefix_test = token.transform(X_test['teacher_prefix']).toarray()
print(prefix_train.shape)

(53531, 5)


In [46]:
grade_train = token.fit_transform(X_train['project_grade_category']).toarray()
grade_cv = token.transform(X_cv['project_grade_category']).toarray()
grade_test = token.transform(X_test['project_grade_category']).toarray()
print(grade_train.shape)

(53531, 4)


In [47]:
cat_train = token.fit_transform(X_train['clean_categories']).toarray()
cat_cv = token.transform(X_cv['clean_categories']).toarray()
cat_test = token.transform(X_test['clean_categories']).toarray()
print(cat_train.shape)

(53531, 9)


In [48]:
subcat_train = token.fit_transform(X_train['clean_subcategories']).toarray()
subcat_cv = token.transform(X_cv['clean_subcategories']).toarray()
subcat_test = token.transform(X_test['clean_subcategories']).toarray()
print(subcat_train.shape)

(53531, 30)


In [49]:
train_num = X_train['num'].values.reshape(-1,1)
cv_num = X_cv['num'].values.reshape(-1,1)
test_num = X_test['num'].values.reshape(-1,1)
print(train_num.shape)
print(cv_num.shape)
print(test_num.shape)

(53531, 1)
(22942, 1)
(32775, 1)


In [50]:
cat_num_train_feat = np.hstack((school_state_train,prefix_train,grade_train,cat_train,subcat_train,train_num))
cat_num_cv_feat = np.hstack((school_state_cv,prefix_cv,grade_cv,cat_cv,subcat_cv,cv_num))
cat_num_test_feat = np.hstack((school_state_test,prefix_test,grade_test,cat_test,subcat_test,test_num))
print(cat_num_train_feat.shape)
print(cat_num_cv_feat.shape)
print(cat_num_test_feat.shape)

(53531, 100)
(22942, 100)
(32775, 100)


In [51]:
cat_num_train_feat = np.resize(cat_num_train_feat, new_shape=(53531,100,1))
cat_num_cv_feat = np.resize(cat_num_cv_feat, new_shape=(22942,100,1))
cat_num_test_feat = np.resize(cat_num_test_feat, new_shape=(32775,100,1))

<img src='https://i.imgur.com/fkQ8nGo.png'>
ref: https://i.imgur.com/fkQ8nGo.png

In [69]:
from tensorflow.keras import optimizers
import os
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.layers import BatchNormalization
from time import time
from tensorflow.python.keras.callbacks import TensorBoard
tf.keras.backend.clear_session()

# input 1
essay = Input(batch_shape=(None,250), name="essay_input")
x1 = Embedding(input_dim=42754,output_dim = 300,weights=[embedding_matrix(features[0],300)],trainable = False)(essay)
x1 = SpatialDropout1D(0.3)(x1)
x1 = CuDNNLSTM(100,return_sequences=True)(x1)
x1 = Flatten()(x1)

# input 2
other = Input(shape=(100,1),name="other_input")
x2 = Conv1D(filters=64,kernel_size=3,strides=1)(other)
x2 = BatchNormalization()(x2)
x2 = Dropout(0.3)(x2)
x2 = Conv1D(filters=128,kernel_size=3,strides=1)(x2)
x2 = Flatten()(x2)


concat = concatenate([x1,x2])


x = Dense(64,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(concat)
x = Dropout(0.3)(x)
x = Dense(128,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = Dense(128,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = Dense(516,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(x)
x = Dropout(0.3)(x)
output = (Dense(2, activation='softmax'))(x)
model = Model([essay,other], output)

#To visualize, run -  tensorboard --log_dir=logs/ in command prompt

tensorboard = TensorBoard(log_dir="logs/".format(time))
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=[auc])
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
other_input (InputLayer)        [(None, 100, 1)]     0                                            
__________________________________________________________________________________________________
essay_input (InputLayer)        [(None, 250)]        0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 98, 64)       256         other_input[0][0]                
__________________________________________________________________________________________________
embedding (Embedding)           (None, 250, 300)     12826200    essay_input[0][0]                
______________________________________________________________________________________________

In [70]:
model.fit([essay_train,cat_num_train_feat], y_train, epochs=20, verbose=1, batch_size=500, validation_data=([essay_cv,cat_num_cv_feat], y_cv), callbacks=[tensorboard])

Train on 53531 samples, validate on 22942 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1e97160d898>

#### Saving model weights

In [71]:
#https://machinelearningmastery.com/save-load-keras-deep-learning-models/
# serialize model to JSON
model.save_weights("model3.h5")

#### Model visualization

In [72]:
print("Train AUC",roc_auc_score(y_train,(model.predict([essay_train,cat_num_train_feat]))))
print("-"*50)
print("Cv AUC",roc_auc_score(y_cv,model.predict([essay_cv,cat_num_cv_feat])))
print("-"*50)
print("Test AUC",roc_auc_score(y_test,model.predict([essay_test,cat_num_test_feat])))

Train AUC 0.8657763867195336
--------------------------------------------------
Cv AUC 0.7141414415626062
--------------------------------------------------
Test AUC 0.7175532621102763


In [79]:
"""# Load TENSORBOARD
%reload_ext tensorboard
# Start TENSORBOARD
%tensorboard --logdir logs --port=8080"""

'# Load TENSORBOARD\n%reload_ext tensorboard\n# Start TENSORBOARD\n%tensorboard --logdir logs --port=8080'

<img src="model_3.png">

In [77]:
from prettytable import PrettyTable
    
x = PrettyTable(["Model", "Train AUC", "Cv AUC", "Test AUC"])

x.add_row(["Model 1", 0.88,0.69,0.70])
x.add_row(["Model 2", 0.52,0.53,0.52])
x.add_row(["Model 3", 0.86,0.71,0.71])

print(x.get_string(title="Model results"))

+---------+-----------+--------+----------+
|  Model  | Train AUC | Cv AUC | Test AUC |
+---------+-----------+--------+----------+
| Model 1 |    0.88   |  0.69  |   0.7    |
| Model 2 |    0.52   |  0.53  |   0.52   |
| Model 3 |    0.86   |  0.71  |   0.71   |
+---------+-----------+--------+----------+
