# Objective: Predict whether a teacher's project proposal gets approved

 The competition dataset contains information from teachers' project applications to DonorsChoose.org including teacher attributes, school attributes, and the project proposals including application essays. Your objective is to predict whether or not a DonorsChoose.org project proposal submitted by a teacher will be approved.

File descriptions
train.csv - the training set
test.csv - the test set (we use just the training set and divide it into training and validation)


### Get GloVe embeddings from the Stanford NLP website

In [0]:
!wget  http://nlp.stanford.edu/data/glove.6B.zip
  
!unzip glove.6B.zip

--2019-03-17 09:19:09--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-03-17 09:19:09--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-03-17 09:19:42 (25.1 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [28]:
# List the working directory to confirm the files needed below (GloVe vectors, train.csv) are present.
!ls

glove.6B.100d.txt  sample_data	train.csv


In [0]:
import os
import sys
import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant

In [0]:
# --- Preprocessing / model hyperparameters -------------------------------
MAX_SEQUENCE_LENGTH = 1000   # length every token sequence is padded/truncated to
MAX_NUM_WORDS = 20000        # vocabulary cap handed to the Keras Tokenizer
EMBEDDING_DIM = 100          # must match the GloVe file that is loaded (glove.6B.100d)
VALIDATION_SPLIT = 0.2       # fraction of samples held out for validation

# --- Filesystem locations (all derived from BASE_DIR) ---------------------
# An empty BASE_DIR resolves every path relative to the current working directory.
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, '')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '.')

### Create a mapping from each word to its embedding vector

In [31]:

print('Indexing word vectors.')

# Maps each GloVe token to its embedding as a float32 NumPy array.
embeddings_index = {}
# The GloVe vectors are distributed as UTF-8 text; name the encoding explicitly so
# parsing does not depend on the platform's default locale encoding.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <c1> <c2> ...": the first field is the word,
        # the remaining fields are its embedding coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 182431 word vectors.


## Load the text samples and their labels. We use only the training data here and split it 80/20 into training and validation sets to measure generalization.

In [33]:
import pandas as pd

# Read the training CSV with the pure-Python parser; rows that cannot be parsed
# are skipped (and reported) instead of aborting the load.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 and removed in 2.0
# (replacement: on_bad_lines='skip'); keep as-is only while the runtime pandas is older.
data = pd.read_csv(
    './train.csv',
    encoding='utf8',
    engine='python',
    error_bad_lines=False,
)


Skipping line 7391: unexpected end of data


In [35]:
# Inspect the column names of the loaded DataFrame.
data.columns

Index(['id', 'teacher_id', 'teacher_prefix', 'school_state',
       'project_submitted_datetime', 'project_grade_category',
       'project_subject_categories', 'project_subject_subcategories',
       'project_title', 'project_essay_1', 'project_essay_2',
       'project_essay_3', 'project_essay_4', 'project_resource_summary',
       'teacher_number_of_previously_posted_projects', 'project_is_approved'],
      dtype='object')

In [36]:
# Model input: only the first essay field of each application is used.
texts = data['project_essay_1'].tolist()
# Target: the approval flag from the project_is_approved column, one entry per essay.
labels = data['project_is_approved'].tolist()

print('Found %s texts.' % len(texts))

Found 7389 texts.


## Tokenize the words 

In [37]:
# Fit a word-level tokenizer on the essays and encode each essay as a list of
# integer word indices (still variable length at this point; padding happens later).
# num_words caps the vocabulary used by texts_to_sequences at MAX_NUM_WORDS.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 13527 unique tokens.


## Pad the sequences so that they all have the same length

In [0]:
# Pad or truncate every token sequence to exactly MAX_SEQUENCE_LENGTH entries.
# NOTE: this rebinds `data` from the raw DataFrame to the padded integer matrix,
# so cells that read DataFrame columns cannot be re-run after this one.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

## Check the data and label tensors

In [55]:
# One-hot encode the binary approval labels.
# The ndim guard keeps this cell idempotent: on a re-run `labels` is already the
# encoded (n_samples, 2) matrix and must not be encoded a second time.
# (The former bare `set(labels)` expression was dropped: its value was discarded.)
labels = np.asarray(labels)
if labels.ndim == 1:
    # num_classes=2 pins the width to the 2-unit softmax output even if one
    # class happens to be absent from the loaded rows.
    labels = to_categorical(labels, num_classes=2)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (7389, 1000)
Shape of label tensor: (7389, 2)


In [57]:
# Display the one-hot label matrix (column k is 1 for samples whose project_is_approved value is k).
labels

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

### Shuffle the data before splitting it into a training set and a validation set


In [0]:
# Shuffle samples and labels with the SAME permutation so every essay keeps its label.
# np.random.shuffle works in place and returns None, so assigning its result (as the
# previous code did) left `indices` as None and made the indexing below raise TypeError.
# A fixed seed makes the train/validation split reproducible across runs.
indices = np.random.RandomState(42).permutation(data.shape[0])
data = data[indices]
# np.asarray lets this work whether `labels` is still a list or already an array.
labels = np.asarray(labels)[indices]
# Number of rows (taken from the end) reserved for validation.
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])


## Make a train/validation split

In [0]:
# Hold out the last num_validation_samples rows for validation.
# An explicit boundary is used instead of negative slicing: with
# num_validation_samples == 0, data[:-0] would be EMPTY and data[-0:] would be
# the whole array, silently swapping the two sets.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]


### Embedding layer set to non-trainable mode

In [60]:
print('Preparing embedding matrix.')

# Build the lookup table: row i receives the GloVe vector of the word whose
# tokenizer index is i. Rows for words that GloVe does not know stay all-zero.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Only indices up to MAX_NUM_WORDS have a row in the matrix.
    if i <= MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Wrap the matrix in a frozen Embedding layer: trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

Preparing embedding matrix.


## Define the model and compile

In [61]:
# Define a 1D convnet with global max pooling on top of the frozen embeddings.
# Input: one row of MAX_SEQUENCE_LENGTH integer word indices per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Look up the embedding vector for every word index in the sequence.
embedded_sequences = embedding_layer(sequence_input)
# Three Conv1D layers (128 filters, kernel size 5, ReLU); the first two are each
# followed by max pooling with pool size 5.
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# Collapse the remaining time axis by taking the maximum of each filter.
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# Two-way softmax, matching the two-column one-hot labels.
preds = Dense(2, activation='softmax')(x)


Instructions for updating:
Colocations handled automatically by placer.


In [0]:
# Assemble the functional model from the input tensor to the softmax output.
model = Model(sequence_input, preds)

# Categorical cross-entropy pairs with the one-hot labels and the softmax output;
# accuracy is reported under the key 'acc'.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])


In [75]:
# Sanity-check the dimensions of the padded input matrix.
data.shape

array([[   0,    0,    0, ...,    1,  373,  167],
       [   0,    0,    0, ...,   11, 1204,   38],
       [   0,    0,    0, ...,    8,  152,   49],
       ...,
       [   0,    0,    0, ..., 8402,    3,   85],
       [   0,    0,    0, ...,   38,   29,  289],
       [   0,    0,    0, ...,   53,   23,  202]], dtype=int32)

In [0]:

# Per-class loss weights passed to model.fit: samples of class 0
# (project_is_approved == 0) count five times as much as samples of class 1.
class_weight = {0: 5.0, 1: 1.0}

### Fit the model on the training set and evaluate it on the validation set

In [72]:
# Train for 10 epochs with the class weights applied to the loss;
# validation metrics are computed on (x_val, y_val) after every epoch.
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          class_weight = class_weight,
          validation_data=(x_val, y_val))

Train on 5912 samples, validate on 1477 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcb9e00ff60>

In [76]:
# Train for 10 more epochs, this time WITHOUT class weights.
# NOTE: this reuses the same `model` object, so it resumes from the weights left by
# the previous fit call rather than starting from a freshly initialised network.
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

Train on 5912 samples, validate on 1477 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcb9e00f5c0>