In [10]:
import pandas as pd
from collections import Counter

# Set GPU flag to false if running on CPU
GPU = True

## TorchText

In [11]:
import torch
from torchtext import data
from torchtext.vocab import GloVe

TEXT = data.Field(include_lengths=True)
ID = data.Field(sequential=False)
LABEL = data.Field(sequential=False)

def sort_key(ex):
    return len(ex.project_title)

train, val, test = data.TabularDataset.splits(
        path='.', train='train.csv',
        validation='val.csv', test='dev.csv', format='csv', skip_header=True,
        fields=[('id', ID), ('project_title', TEXT),('project_resource_summary', None), 
                ('project_essay_1', TEXT), ('project_essay_2', TEXT), ('project_is_approved', LABEL)])

#vocab is shared across all the text fields
#CAUTION: GloVe will download all embeddings locally (862 MB).  If not interested, remove "vectors"
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
ID.build_vocab(train)
LABEL.build_vocab(train)

#change device to 0 for GPU
train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), sort_key= sort_key, repeat=False,
        batch_size=(64), device=-1 if GPU else 0)

## Model

- Naive LSTM/BiLSTM classifier

In [14]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
import torch.optim as optim

class LSTMClassifier(nn.Module):

    def __init__(self, vocab, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(vocab.vectors)        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,bidirectional=True)
        self.hidden2label = nn.Linear(2*hidden_dim, label_size)

    def init_hidden(self,batch_size):
        if(GPU):
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim).cuda())
            c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim).cuda())
        
        else:
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            
        return (h0, c0)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        y  = self.hidden2label(lstm_out[-1])
        return y

from sklearn.metrics import accuracy_score, f1_score
def test_result(net,iter_obj):
    pred = []
    actual = []
    for batch in iter_obj:
        input,label = batch.project_title[0], batch.project_is_approved-1
        if(GPU): input = input.cuda()
        net.hidden = net.init_hidden(input.shape[1])
        scores = net(input)
        pred.extend(scores.cpu().data.numpy().argmax(axis=1))
        actual.extend(label.data.cpu().numpy().tolist())
    return round(accuracy_score(actual,pred),2)

### Train a Naive BiLSTM model
- Input: project_title

In [15]:
# %%time
import time
NUM_EPOCHS = 100
model = LSTMClassifier(vocab = TEXT.vocab, embedding_dim=300, vocab_size=len(TEXT.vocab), 
                       hidden_dim=50, batch_size=64, label_size=2)



if(GPU): model.cuda()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

loss_l = []
for i in range(NUM_EPOCHS):
    start = time.time()
    for batch in train_iter:
        model.zero_grad()       
        input,label = batch.project_title[0], batch.project_is_approved-1
        if(GPU): input,label = input.cuda(),label.cuda()
        model.hidden = model.init_hidden(input.shape[1])

        
        scores = model(input)
        loss = loss_function(scores, label)
        loss.backward()
        optimizer.step()
    loss_l.append(loss.cpu().data.numpy())
    if((i+1)%10==0):
        print(f"End of {i+1} epochs")
        print(f"Train accuracy: {test_result(model,train_iter)}, Validation accuracy: {test_result(model,val_iter)}")

End of 10 epochs
Train accuracy: 0.9, Validation accuracy: 0.53
End of 20 epochs
Train accuracy: 0.96, Validation accuracy: 0.53
End of 30 epochs
Train accuracy: 0.97, Validation accuracy: 0.53
End of 40 epochs
Train accuracy: 0.97, Validation accuracy: 0.54
End of 50 epochs
Train accuracy: 0.97, Validation accuracy: 0.53
End of 60 epochs
Train accuracy: 0.97, Validation accuracy: 0.53
End of 70 epochs
Train accuracy: 0.98, Validation accuracy: 0.53
End of 80 epochs
Train accuracy: 0.98, Validation accuracy: 0.53
End of 90 epochs
Train accuracy: 0.98, Validation accuracy: 0.53
End of 100 epochs
Train accuracy: 0.98, Validation accuracy: 0.53
