In [1]:
import sys
import csv
import torch.nn as nn
import torch.optim as optim
# NOTE(review): machine-specific absolute path; presumably the checkout that
# contains the `gen` package imported below — confirm and adjust per environment.
sys.path.extend(['/Users/zeerakw/Documents/PhD/projects/active/Generalisable_abuse'])

from gen.shared.data import GeneralDataset
from gen.shared.batching import Batch, BatchExtractor
from gen.shared.base import Field
from gen.neural import RNNClassifier
from gen.shared.clean import Cleaner
from gen.shared.train import run_model
from sklearn.metrics import accuracy_score

Here we define the fields, which correspond to the individual fields in the CSV or JSON file. The file format (JSON or CSV) is selected through the `ftype` argument of the `GeneralDataset` class. In the field objects, the `cname` attribute should correspond to the JSON field attributes.

NOTE: Only works with top level JSON keys.

In [2]:
# One Field object per role. The `ix` values (5 for the label, 6 for the text)
# match the positions these fields occupy in `davidson_fields` below.
text_field = Field('text', cname = 'text', ix = 6, train = True, label = False, ignore = False)
label_field = Field('label', cname = 'label', ix = 5, train = False, label = True, ignore = False)
ignore_field = Field('ignore', cname = 'ignore', train = False, label = False, ignore = True)

# Five ignored leading fields, followed by the label and then the text.
# The same `ignore_field` object is reused for every ignored position.
davidson_fields = [ignore_field] * 5 + [label_field, text_field]

# NOTE(review): `data_dir` starts with '~'; presumably GeneralDataset expands
# it — confirm. It also differs from the path added to sys.path in the first cell.
dataset = GeneralDataset(
    data_dir = '~/PhD/projects/active/Generalisable_abuse/data/',
    ftype = 'csv',
    fields = davidson_fields,
    train = 'davidson_train.csv',
    dev = None,
    test = None,
    train_labels = None,
    tokenizer = lambda x: x.split(),
    lower = True,
    preprocessor = None,
    transformations = None,
    label_processor = None,
    sep = ',',
    name = 'Davidson et al.',
)
dataset.load('train')

loading Davidson et al. (train): 887it [00:00, 23182.19it/s]


We then split our dataset, and build our vocabulary and labels on only our training set. 
We then encode the documents into one-hot tensors and process the labels.

In [3]:
# Split the loaded data using the ratios 0.8 / 0.1 / 0.1 into train, dev and test.
# NOTE(review): the recorded output of this cell shows one encoding pass over
# 0 items (presumably the dev split) — verify that `split` returns a non-empty dev set.
train, dev, test = dataset.split(dataset.data, [0.8, 0.1, 0.1])
# Token and label vocabularies are built from the training split only.
dataset.build_token_vocab(train)
dataset.build_label_vocab(train)
dataset.process_labels(train)

dataset.process_labels(dev)

# Each split is replaced by its encoded form (requested with onehot = True).
train = dataset.encode(train, onehot = True)
dev = dataset.encode(dev, onehot = True)
test = dataset.encode(test, onehot = True)
# NOTE(review): unlike train and dev, the test labels are processed *after*
# encoding — confirm that `process_labels` is insensitive to this ordering.
dataset.process_labels(test)

Building vocabulary: 100%|██████████| 709/709 [00:00<00:00, 40202.26it/s]
Encoding data: 100%|██████████| 709/709 [00:01<00:00, 380.80it/s]
Encoding data: 0it [00:00, ?it/s]
Encoding data: 100%|██████████| 88/88 [00:00<00:00, 265.95it/s]


We finally batch the dataset and access the encoded data and processed labels.

In [4]:
def _batch_and_extract(data, batch_size):
    """Batch `data` and wrap the result in a BatchExtractor.

    Returns the Batch object together with a BatchExtractor built with the
    'encoded' and 'label' attribute names.
    """
    batcher = Batch(batch_size, data)
    batcher.create_batches()
    return batcher, BatchExtractor('encoded', 'label', batcher)


# Train and dev use batches of 64; the test split is requested as a single
# batch whose size is len(test).
# `batched` is rebound on every call and ends up referring to the test Batch.
batched, batched_train = _batch_and_extract(train, 64)
batched, batched_dev = _batch_and_extract(dev, 64)
batched, batched_test = _batch_and_extract(test, len(test))

Finally, we initialise the model, optimizer and loss function, then train the model and evaluate it.

In [6]:
# Hidden size used to build the model AND reported to run_model, so that the
# two can no longer disagree (previously 128 for the model vs. 300 in model_args).
HIDDEN_DIM = 128

# NOTE(review): output_dim is hard-coded to 3 here while model_args reports
# dataset.label_count(), and the input size uses len(dataset.stoi) while
# model_args reports dataset.vocab_size() — confirm these always agree.
model = RNNClassifier(len(dataset.stoi), hidden_dim = HIDDEN_DIM, output_dim = 3, batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
# NLLLoss expects log-probabilities as input; presumably RNNClassifier ends in
# a log-softmax — TODO confirm.
loss = nn.NLLLoss()
print(type(optimizer))
print(type(loss))

# Keep a named handle on the results file so it can be flushed/closed; the csv
# writer does not expose the file object it wraps. newline = '' is what the csv
# module requires for files handed to csv.writer.
log_file = open('tmp', 'a', encoding = 'utf-8', newline = '')
writer = csv.writer(log_file, delimiter = '\t')

# NOTE(review): 'embedding_dim' is passed on although the model above is not
# given an embedding size — confirm how run_model uses this value.
model_args = {'model': model, 'optimizer': optimizer, 'loss_func': loss, 'batches': batched_train,
              'dev_batches': batched_dev, 'epochs': 5, 'metrics': {'accuracy': accuracy_score}, 'batch_first': True, 
              'embedding_dim': 300, 'hidden_dim': HIDDEN_DIM, 'input_dim': dataset.vocab_size(), 
              'output_dim': dataset.label_count(), 'data_name': dataset.name}
info = [] # Should be the model parameters to be written out
try:
    run_model('pytorch', train = True, writer = writer, model_info = info, head_len = 35, **model_args)
finally:
    # Push buffered rows to disk even if training fails part-way, so the
    # results file reflects everything written so far. The file stays open
    # because `writer` is used again later in the notebook.
    log_file.flush()

  0%|          | 0/5 [00:00<?, ?it/s]

<class 'torch.optim.adam.Adam'>
<class 'torch.nn.modules.loss.NLLLoss'>


100%|██████████| 5/5 [00:10<00:00,  2.20s/it]


In [7]:
# Arguments for the evaluation pass: the trained model and loss are run over
# the single test batch, scored with accuracy, for one epoch.
eval_args = dict(
    model = model,
    loss_func = loss,
    iterator = batched_test,
    metrics = {'accuracy': accuracy_score},
    epochs = 1,
    data_name = dataset.name,
)
# train = False selects evaluation; results go through the same `writer`.
run_model('pytorch', train = False, writer = writer, model_info = [], head_len = 35, **eval_args)

In [None]:
# Print every row of the tab-separated results file 'tmp'.
# The file is opened with the same encoding used when writing it, and with
# newline = '' as the csv module requires. The `with` block closes the file;
# csv.reader objects have no close() method of their own.
# Note: rows written through `writer` only show up here once the file object
# behind that writer has been flushed or closed.
with open('tmp', 'r', encoding = 'utf-8', newline = '') as results_file:
    reader = csv.reader(results_file, delimiter = '\t')

    for line in reader:
        print(line)