<b> Introduction </b>

In [38]:
#importing all the required lib
import pandas as pd
import numpy as np 
import os
import math
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import SpatialDropout1D, LSTM, BatchNormalization,concatenate,Flatten,Embedding,Dense,Dropout,MaxPooling2D,Reshape,CuDNNLSTM
from keras.models import Sequential
from keras import Model,Input
from keras.layers.convolutional import Conv2D,Conv1D
import keras.backend as k
from sklearn.metrics import roc_auc_score
import tensorflow as tf
import keras
from sklearn.utils import compute_class_weight
from keras.initializers import he_normal,glorot_normal
from keras.regularizers import l1,l2
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint,LearningRateScheduler
from time import time
from tensorflow.python.keras.callbacks import TensorBoard
from IPython.display import SVG, display
import pickle 
import warnings
warnings.filterwarnings("ignore")

In [2]:
dbfile = open('glove_vectors.pickle', 'rb')      
db = pickle.load(dbfile) 

In [3]:
db['mallinson'].shape

(300,)

In [4]:
#REading the dataset
project_data = pd.read_csv('processed_train_data.csv')
project_data.shape

(109248, 21)

In [5]:
#merging teacher number of previouly posted projects, presence of the numerical digits, price and quantity into a single feature
project_data.drop(['Unnamed: 0'], axis =1 , inplace = True)
class_label = project_data['project_is_approved']
project_data['remaining_input'] = project_data['teacher_number_of_previously_posted_projects']  +\
                                    project_data['presence_of_the_numerical_digits']  + \
                                    project_data['price'] + project_data['quantity']

In [6]:
project_data['total_txt'] = project_data['project_title'] + ' ' + project_data['essay'] + ' ' + project_data['project_resource_summary']

In [7]:
project_data.replace(to_replace=np.NaN, value= str('nan'),inplace=True)

In [8]:
col = project_data.columns
col

Index(['id', 'teacher_id', 'teacher_prefix', 'school_state',
       'project_submitted_datetime', 'project_grade_category',
       'project_subject_categories', 'project_subject_subcategories',
       'project_title', 'project_essay_1', 'project_essay_2',
       'project_essay_3', 'project_essay_4', 'project_resource_summary',
       'teacher_number_of_previously_posted_projects', 'project_is_approved',
       'essay', 'price', 'quantity', 'presence_of_the_numerical_digits',
       'remaining_input', 'total_txt'],
      dtype='object')

In [9]:
col = ['id','teacher_id','project_submitted_datetime','project_title','project_essay_1', 'project_essay_2',
       'project_essay_3', 'project_essay_4','project_resource_summary',
       'teacher_number_of_previously_posted_projects', 'project_is_approved','price', 'quantity',
        'presence_of_the_numerical_digits','essay']

project_data.drop(labels=col,axis =1, inplace=True)

In [10]:
col = project_data.columns

In [11]:
col = ['teacher_prefix', 'school_state', 'project_grade_category',
       'project_subject_categories', 'project_subject_subcategories','total_txt',
       'remaining_input']
project_data = project_data[col]

In [12]:
def word_ranking(dataframe):
    col_names = dataframe.columns
    features = []
    #performing train test split
    train,test,y_train,y_test = train_test_split(dataframe, class_label , stratify = class_label, train_size = 0.7)

    train,cv,y_train,y_cv = train_test_split(train,y_train,stratify = y_train,train_size = 0.8)
    for col in col_names[:6]:
        print(col)
        bag_of_words = CountVectorizer(lowercase= False)
        bow_words = bag_of_words.fit_transform(train[col])
        print(bow_words.shape)
        
        #Lets now store the document term matrix in a dictionary.
        freqs = bow_words.sum(axis=0).A1
        index = freqs.argsort()
        words = bag_of_words.get_feature_names()
        
        

        # Assigning Rank to each word based on its freq of occurance. Word with highest freq is assigned rank 1 
        word_rank = dict()
        rank = 1
        for i in index[::-1]:
            k = words[i]
            word_rank[k] = rank
            rank+=1
        features.append(word_rank)

        #Every word in each review is replaced by its rank
        rank = [] # list of all the review with words replaced with rank
        for sent in train[col].values:
            txt_row = []
            for word in sent.split():
                if word in word_rank.keys():
                    txt_row.append(word_rank[word])
                else:
                    pass
            rank.append(txt_row)
        
        train[col] = rank
        
        rank = [] # list of all the review with words replaced with rank
        for sent in test[col].values:
            txt_row = []
            for word in sent.split():
                if word in word_rank.keys():
                    txt_row.append(word_rank[word])
                else:
                    pass
            rank.append(txt_row)
        
        test[col] = rank
        
        rank = [] # list of all the review with words replaced with rank
        for sent in cv[col].values:
            txt_row = []
            for word in sent.split():
                if word in word_rank.keys():
                    txt_row.append(word_rank[word])
                else:
                    pass
            rank.append(txt_row)
        
        cv[col] = rank
    return train,test,cv,y_train,y_test,y_cv,features
            

In [13]:
train,test,cv,y_train,y_test,y_cv,feature_names = word_ranking(project_data)

teacher_prefix
(61178, 5)
school_state
(61178, 51)
project_grade_category
(61178, 4)
project_subject_categories
(61178, 51)
project_subject_subcategories
(61178, 387)
total_txt
(61178, 56905)


In [14]:
print("Shape of the Train dataset: ", train.shape[0])
print("Shape of the Test dataset: ", test.shape[0])
print("Shape of the cv dataset:", cv.shape[0])

Shape of the Train dataset:  61178
Shape of the Test dataset:  32775
Shape of the cv dataset: 15295


In [15]:
#converting class labels to categorical variables
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_cv = to_categorical(y_cv)

In [16]:
class_wght = compute_class_weight("balanced", classes= np.unique(class_label),y=class_label)

In [17]:
class_wght

array([3.30214001, 0.58921753])

In [18]:
feature_names[4]

{'literacy': 1,
 'literacy_mathematics': 2,
 'literature_writing_mathematics': 3,
 'literacy_literature_writing': 4,
 'mathematics': 5,
 'literature_writing': 6,
 'specialneeds': 7,
 'health_wellness': 8,
 'appliedsciences_mathematics': 9,
 'appliedsciences': 10,
 'literacy_specialneeds': 11,
 'visualarts': 12,
 'gym_fitness_health_wellness': 13,
 'esl_literacy': 14,
 'music': 15,
 'warmth_care_hunger': 16,
 'literature_writing_specialneeds': 17,
 'health_wellness_specialneeds': 18,
 'mathematics_specialneeds': 19,
 'gym_fitness': 20,
 'environmentalscience': 21,
 'teamsports': 22,
 'music_performingarts': 23,
 'appliedsciences_environmentalscience': 24,
 'environmentalscience_health_lifescience': 25,
 'earlydevelopment': 26,
 'other': 27,
 'environmentalscience_mathematics': 28,
 'health_lifescience': 29,
 'health_wellness_nutritioneducation': 30,
 'earlydevelopment_specialneeds': 31,
 'literature_writing_visualarts': 32,
 'earlydevelopment_literacy': 33,
 'esl_literature_writing': 34

In [19]:
#Creating a matrix with rows as words and columns with 50 dim vectors for each word
def embedding_mat(word_index,embedding_dim = 300):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = db.get(word)
        if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Tokenizing the Text part

In [20]:


max_review_length = 250
X_train = pad_sequences(train['total_txt'], maxlen=max_review_length)  #padding zeros at the begining of each review to make max len as 200
X_test = pad_sequences(test['total_txt'], maxlen=max_review_length)
X_cv = pad_sequences(cv['total_txt'], maxlen=max_review_length)
print(X_train.shape)
print(X_train[256])

(61178, 250)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0  160   73
  692   45 1490 1716  401 1164   23    2   19 1054   91  159    5  241
  118  167  692  217   18 1269 3701 3627    7  435   17 2333  846  217
    2  121  351  643  936    1  136   66   89    1  705  174  559    8
  134  459  670   29   41   34   60  134   50    7  203  201   82  132
  692   23    5  217   91    7  209  168  324  157 1816  170  682  403
   41   30  242  619   14    1  682  403  736    1  710   20  170   48
   73  788  279 1526 1470  864  847  721  160   73  151  213 149

# Tokenizing the school state

In [21]:

max_review_length = 1
X_train_school_state = pad_sequences(train['school_state'], maxlen=max_review_length)  #padding zeros at the begining of each review to make max len as 200
X_test_school_state = pad_sequences(test['school_state'], maxlen=max_review_length)
X_cv_school_state = pad_sequences(cv['school_state'], maxlen=max_review_length)
print(X_test_school_state.shape)
print(X_test_school_state[0])

(32775, 1)
[1]


# Tokenizing the project_grade_category

In [22]:
max_review_length = 1
X_train_project_grade = pad_sequences(train['project_grade_category'], maxlen=max_review_length)  #padding zeros at the begining of each review to make max len as 200
X_test_project_grade = pad_sequences(test['project_grade_category'], maxlen=max_review_length)
X_cv_project_grade = pad_sequences(cv['project_grade_category'], maxlen=max_review_length)
print(X_train_project_grade.shape)
print(X_train_project_grade[0])

(61178, 1)
[4]


# Tokenizing the project categories

In [23]:
max_review_length = 1
X_train_project_cat = pad_sequences(train['project_subject_categories'], maxlen=max_review_length)  #padding zeros at the begining of each review to make max len as 200
X_test_project_cat = pad_sequences(test['project_subject_categories'], maxlen=max_review_length)
X_cv_project_cat = pad_sequences(cv['project_subject_categories'], maxlen=max_review_length)
print(X_train_project_cat.shape)
print(X_train_project_cat[0])

(61178, 1)
[2]


# Tokenizing the project subcategories

In [24]:
max_review_length = 1
X_train_project_subcat = pad_sequences(train['project_subject_subcategories'], maxlen=max_review_length)  #padding zeros at the begining of each review to make max len as 200
X_test_project_subcat = pad_sequences(test['project_subject_subcategories'], maxlen=max_review_length)
X_cv_project_subcat = pad_sequences(cv['project_subject_subcategories'], maxlen=max_review_length)
print(X_train_project_subcat.shape)
print(X_train_project_subcat[0])

(61178, 1)
[5]


# Tokenizing the teacher prefix

In [25]:
max_review_length = 1
X_train_teacher_prefix = pad_sequences(train['teacher_prefix'], maxlen=max_review_length)  #padding zeros at the begining of each review to make max len as 200
X_test_teacher_prefix = pad_sequences(test['teacher_prefix'], maxlen=max_review_length)
X_cv_teacher_prefix = pad_sequences(cv['teacher_prefix'], maxlen=max_review_length)
print(X_train_teacher_prefix.shape)
print(X_test_teacher_prefix[0])

(61178, 1)
[1]


In [26]:
train.head()


Unnamed: 0,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,total_txt,remaining_input
76202,[2],[2],[4],[2],[5],"[5, 1137, 284, 2520, 1, 2120, 487, 2038, 1909,...",135.99
76133,[2],[1],[2],[1],[34],"[187, 310, 52, 1, 407, 93, 143, 2, 41, 32, 212...",97.44
59232,[1],[27],[3],[2],[10],"[2965, 3162, 128, 1, 419, 5, 41, 19, 9284, 138...",109.15
68658,[1],[1],[2],[3],[3],"[188, 46, 4, 124, 1586, 763, 2, 1, 1586, 300, ...",206.0
55783,[1],[6],[1],[1],[1],"[178, 2699, 2073, 68, 24, 433, 315, 24, 2699, ...",241.99


# Deep Learning Models

 ### Model 1

In [42]:
#AUC score
def auc( y_true, y_pred ) :
    score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'),
                        [y_true, y_pred],
                        'float32',
                        stateful=True,
                        name='sklearnAUC' )
    return score


def step_decay(epoch):
    initial_lrate = 0.0001
    drop = 1e-6
    epochs_drop = 1
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate

In [47]:
#input 1
input1 = Input(shape=(250,))
x1 = Embedding(input_dim=56906,output_dim= 300,weights=[embedding_mat(feature_names[5])],trainable=False)(input1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = CuDNNLSTM(128,return_sequences=True)(x1)
x1 = Flatten()(x1)

#input 2
input2 = Input(shape=(1,))
x2 = Embedding(input_dim= 52,output_dim= 2)(input2)
#x2 = SpatialDropout1D(0.3)(x2)
x2 = Flatten()(x2)

#input 3
input3 = Input(shape=(1,))
x3 = Embedding(input_dim= 5,output_dim= 2)(input3)
#x3 = SpatialDropout1D(0.3)(x3)
x3 = Flatten()(x3)

#input 4
input4 = Input(shape=(1,))
x4 = Embedding(input_dim=50,output_dim= 2)(input4)
#x4 = SpatialDropout1D(0.3)(x4)
x4 = Flatten()(x4)

#input 5
input5 = Input(shape=(1,))
x5 = Embedding(input_dim= 385,output_dim= 50)(input5)
#x5 = SpatialDropout1D(0.3)(x5)
x5 = Flatten()(x5)

#input 6
input6 = Input(shape=(1,))
x6 = Embedding(input_dim= 6,output_dim= 5)(input6)
#x6 = SpatialDropout1D(0.3)(x6)
x6 = Flatten()(x6)

#input 7
input7 = Input(shape=(1,))
x7 = Dense(16,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(input7)
#x7 = Flatten()(x7)
#merging all the inputs 
concat = concatenate([x1,x2,x3,x4,x5,x6,x7])
#x = BatchNormalization()(concat)

x = Dense(128,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(concat)
x = Dropout(0.5)(x)
x = Dense(64,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(x)
x = Dropout(0.5)(x)
x = BatchNormalization()(x)
x = Dense(32,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(x)
x = Dropout(0.5)(x)
output = Dense(2, activation = 'softmax')(x)
 
# create model with seven inputs
model = Model([input1,input2,input3,input4,input5,input6,input7], output)
tensorboard = TensorBoard(log_dir='logs/{}'.format(time()))
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0006,decay = 1e-4),metrics=[auc])
#lrate = LearningRateScheduler(step_decay)
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_51 (InputLayer)           (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_44 (Embedding)        (None, 250, 300)     17071800    input_51[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_8 (SpatialDro (None, 250, 300)     0           embedding_44[0][0]               
__________________________________________________________________________________________________
input_52 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_53 (

In [48]:
#model fitting
#https://machinelearningmastery.com/check-point-deep-learning-models-keras/
filepath="weights_copy.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_auc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint,tensorboard]
model.fit([X_train,X_train_school_state,X_train_project_grade,X_train_project_cat,X_train_project_subcat,
           X_train_teacher_prefix,train['remaining_input']], y_train, nb_epoch=20,verbose=1,batch_size=256,
          validation_data=([X_cv,X_cv_school_state,X_cv_project_grade,X_cv_project_cat,X_cv_project_subcat,
           X_cv_teacher_prefix,cv['remaining_input']]  , y_cv),callbacks =callbacks_list,class_weight = class_wght )

Train on 61178 samples, validate on 15295 samples
Epoch 1/20

Epoch 00001: val_auc improved from -inf to 0.59256, saving model to weights_copy.best.hdf5
Epoch 2/20

Epoch 00002: val_auc improved from 0.59256 to 0.65700, saving model to weights_copy.best.hdf5
Epoch 3/20

Epoch 00003: val_auc improved from 0.65700 to 0.65752, saving model to weights_copy.best.hdf5
Epoch 4/20

Epoch 00004: val_auc improved from 0.65752 to 0.70942, saving model to weights_copy.best.hdf5
Epoch 5/20

Epoch 00005: val_auc improved from 0.70942 to 0.71825, saving model to weights_copy.best.hdf5
Epoch 6/20

Epoch 00006: val_auc improved from 0.71825 to 0.73111, saving model to weights_copy.best.hdf5
Epoch 7/20

Epoch 00007: val_auc improved from 0.73111 to 0.73698, saving model to weights_copy.best.hdf5
Epoch 8/20

Epoch 00008: val_auc improved from 0.73698 to 0.74265, saving model to weights_copy.best.hdf5
Epoch 9/20

Epoch 00009: val_auc improved from 0.74265 to 0.74579, saving model to weights_copy.best.hdf5

<keras.callbacks.History at 0x244065fd898>

In [49]:
#input 1
input1 = Input(shape=(250,))
x1 = Embedding(input_dim=56906,output_dim= 300,weights=[embedding_mat(feature_names[5])],trainable=False)(input1)
x1 = SpatialDropout1D(0.3)(x1)
x1 = CuDNNLSTM(128,return_sequences=True)(x1)
x1 = Flatten()(x1)

#input 2
input2 = Input(shape=(1,))
x2 = Embedding(input_dim= 52,output_dim= 2)(input2)
#x2 = SpatialDropout1D(0.3)(x2)
x2 = Flatten()(x2)

#input 3
input3 = Input(shape=(1,))
x3 = Embedding(input_dim= 5,output_dim= 2)(input3)
#x3 = SpatialDropout1D(0.3)(x3)
x3 = Flatten()(x3)

#input 4
input4 = Input(shape=(1,))
x4 = Embedding(input_dim=50,output_dim= 2)(input4)
#x4 = SpatialDropout1D(0.3)(x4)
x4 = Flatten()(x4)

#input 5
input5 = Input(shape=(1,))
x5 = Embedding(input_dim= 385,output_dim= 50)(input5)
#x5 = SpatialDropout1D(0.3)(x5)
x5 = Flatten()(x5)

#input 6
input6 = Input(shape=(1,))
x6 = Embedding(input_dim= 6,output_dim= 5)(input6)
#x6 = SpatialDropout1D(0.3)(x6)
x6 = Flatten()(x6)

#input 7
input7 = Input(shape=(1,))
x7 = Dense(16,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(input7)
#x7 = Flatten()(x7)
#merging all the inputs 
concat = concatenate([x1,x2,x3,x4,x5,x6,x7])
#x = BatchNormalization()(concat)

x = Dense(128,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(concat)
x = Dropout(0.5)(x)
x = Dense(64,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(x)
x = Dropout(0.5)(x)
x = BatchNormalization()(x)
x = Dense(32,activation='relu',kernel_initializer=he_normal(),kernel_regularizer=l2(0.0001))(x)
x = Dropout(0.5)(x)
output = Dense(2, activation = 'softmax')(x)
 
# create model with seven inputs
model = Model([input1,input2,input3,input4,input5,input6,input7], output)
tensorboard = TensorBoard(log_dir='logs/{}'.format(time()))
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.0006,decay = 1e-4),metrics=[auc])
model.load_weights("weights_copy.best.hdf5")

In [50]:
print("Auc for test data: %0.3f"%roc_auc_score(y_test,model.predict([X_test,X_test_school_state,X_test_project_grade,X_test_project_cat,X_test_project_subcat,
           X_test_teacher_prefix,test['remaining_input']])))
print("Auc for CV data: %0.3f"%roc_auc_score(y_cv,model.predict([X_cv,X_cv_school_state,X_cv_project_grade,X_cv_project_cat,X_cv_project_subcat,
           X_cv_teacher_prefix,cv['remaining_input']])))
print("Auc for train data: %0.3f"%roc_auc_score(y_train,model.predict([X_train,X_train_school_state,X_train_project_grade,X_train_project_cat,X_train_project_subcat,
           X_train_teacher_prefix,train['remaining_input']])))

Auc for test data: 0.759
Auc for CV data: 0.761
Auc for train data: 0.802


<img src ='model_1_epoch_auc_loss.jpg' >
<img src = 'model_1_epoch_val_auc_loss.jpg'>

# END