In [1]:
import os, sys, time, random, torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm
import hyp
import models
import state_data as aug
import preprocess_data as prep

def store_result(store_epoch_acc_val, store_epoch_loss_val, store_qwk_epoch_loss_val, HIDDEN, ONE_HOT, DATA_AUG, training_dir):
    most_acc = max(store_epoch_acc_val)
    min_loss = min(store_epoch_loss_val)
    qwk_max_loss = max(store_qwk_epoch_loss_val)
    print("\nHighest accuracy of {} occured at {}...\nMinimum loss occured at {}... \nMaximum QWK metric of {} occured at {}".format(
        most_acc, store_epoch_acc_val.index(most_acc)+1, 
        store_epoch_loss_val.index(min_loss)+1, 
        qwk_max_loss, store_qwk_epoch_loss_val.index(qwk_max_loss)+1))
    with open(training_dir+"/HYP.txt","w+") as f:
        f.write("EPOCH = {} \n".format(hyp.EPOCHS))
        f.write("LR = {} \n".format(hyp.LR))
        f.write("HIDDEN_LAYERS = {} \n".format(HIDDEN))
        f.write("ONE_HOT = {} \n".format(ONE_HOT))
        f.write("DATA_AUG = {} \n".format(DATA_AUG))
        f.write("Highest accuracy of {} occured at {}...\nMinimum loss of {} occured at {}... \nMaximum QWK metric of {} occured at {}".format(
        most_acc, store_epoch_acc_val.index(most_acc)+1, 
        min_loss, store_epoch_loss_val.index(min_loss)+1, 
        qwk_max_loss, store_qwk_epoch_loss_val.index(qwk_max_loss)+1))
    checkpoints = os.listdir(training_dir)
    for checkpoint in checkpoints:
        if "checkpoint" in checkpoint:
            checkpoint_num = int(checkpoint[checkpoint.index("_")+1:checkpoint.index(".")])
            if checkpoint_num not in [store_qwk_epoch_loss_val.index(qwk_max_loss)+1,
                                      store_epoch_loss_val.index(min_loss)+1,
                                      store_epoch_acc_val.index(most_acc)+1]:
                os.remove(training_dir+"/"+checkpoint)

def train(model, HIDDEN, ONE_HOT, DATA_AUG, data_train_loader, data_val_loader):
    print("Training...")
    training_dir = './training_{}+{}_{}_{}_{}'.format(ONE_HOT, DATA_AUG, len(HIDDEN), max(HIDDEN), time.time())
    os.mkdir(training_dir)
    os.mkdir(training_dir+'/misclassified')
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=hyp.LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    
    qwk_loss = cohen_kappa_score
    ce_loss = nn.CrossEntropyLoss().cuda()
    epoch = 0
    store_epoch_loss = []
    store_qwk_epoch_loss = []
    store_epoch_loss_val = []
    store_qwk_epoch_loss_val = []
    store_epoch_acc_val = []
    try:
        for e in tqdm(range(hyp.EPOCHS)):
            #scheduler.step()
            epoch = e + 1
            epoch_loss = 0
            qwk_epoch_loss = 0
            store_batch_loss = []
            store_qwk_batch_loss = []
            
            for batch_num, (X, y) in enumerate(data_train_loader):
                optimizer.zero_grad()
                prediction = model.forward(X.cuda())
                batch_loss = ce_loss(prediction, y)
                batch_loss.backward()
                qwk_batch_loss = qwk_loss(y.clone().detach().cpu().numpy(), 
                                          np.argmax(prediction.clone().detach().cpu().numpy(), axis=1), 
                                          weights="quadratic")
                optimizer.step()
                store_batch_loss.append(batch_loss.clone().cpu())
                store_qwk_batch_loss.append(qwk_batch_loss)
                epoch_loss = torch.FloatTensor(store_batch_loss).mean()
                qwk_epoch_loss = torch.FloatTensor(store_qwk_batch_loss).mean()
                
            store_epoch_loss.append(epoch_loss)
            store_qwk_epoch_loss.append(qwk_epoch_loss)
            torch.save(model.state_dict(), "{}/checkpoint_{}.pth".format(training_dir, epoch))

            model.eval()
            epoch_loss_val = 0
            qwk_epoch_loss_val = 0
            epoch_acc_val = 0
            store_batch_loss_val = []
            store_qwk_batch_loss_val = []
            store_batch_acc_val = []
            misclassified_images = []
            for batch_num, (X, y) in enumerate(data_val_loader):
                with torch.no_grad():
                    prediction = model.forward(X.cuda())
                batch_loss = ce_loss(prediction, y)
                qwk_batch_loss = qwk_loss(y.clone().detach().cpu().numpy(), 
                                          np.argmax(prediction.clone().detach().cpu().numpy(), axis=1), 
                                          weights="quadratic")
                misclassified = prediction.max(-1)[-1].squeeze().cpu() != y.cpu()
                misclassified_images.append(X[misclassified==1].cpu())
                batch_acc = misclassified.float().mean()
                store_batch_loss_val.append(batch_loss)
                store_qwk_batch_loss_val.append(qwk_batch_loss)
                store_batch_acc_val.append(batch_acc)
                epoch_loss_val = torch.FloatTensor(store_batch_loss_val).mean()
                qwk_epoch_loss_val = torch.FloatTensor(store_qwk_batch_loss_val).mean()
                epoch_acc_val = torch.FloatTensor(store_batch_acc_val).mean()
            store_epoch_loss_val.append(epoch_loss_val)
            store_qwk_epoch_loss_val.append(qwk_epoch_loss_val)
            store_epoch_acc_val.append(1-epoch_acc_val)
            plt.plot(store_epoch_loss_val[1:], label="Validation Loss")
            plt.plot(store_qwk_epoch_loss_val[1:], label="Validation Metric(QWK)")
            plt.plot(store_epoch_acc_val[1:], label="Validation Accuracy")
            plt.legend()
            plt.grid()
            plt.savefig("{}/Loss.png".format(training_dir))
            plt.close()
            if len(misclassified_images) > 0:
                misclassified_images = np.concatenate(misclassified_images,axis=0)
                validation_dir = training_dir+'/misclassified/checkpoint_{}'.format(epoch)
                os.mkdir(validation_dir)
            model.train()
        store_result(store_epoch_acc_val, store_epoch_loss_val, store_qwk_epoch_loss_val, HIDDEN, ONE_HOT, DATA_AUG, training_dir)

    except KeyboardInterrupt:
        store_result(store_epoch_acc_val, store_epoch_loss_val, store_qwk_epoch_loss_val, HIDDEN, ONE_HOT, DATA_AUG, training_dir)

if __name__ == "__main__":
    for ONE_HOT in [0,1]: # for MaturitySize, FurLength, Health
        for DATA_AUG in [0,1]: # for state data
            data_train_loader, data_val_loader = prep.preprocess_data(ONE_HOT, DATA_AUG)
            train(models.Model(hyp.HIDDEN_LIST[2], ONE_HOT, DATA_AUG).cuda(), hyp.HIDDEN_LIST[2], ONE_HOT, DATA_AUG, data_train_loader, data_val_loader)

  0%|          | 0/1000 [00:00<?, ?it/s]

Training...


  0%|          | 2/1000 [00:05<49:19,  2.97s/it]



Highest accuracy of 0.3899478316307068 occured at 2...
Minimum loss occured at 2... 
Maximum QWK metric of 0.33045652508735657 occured at 2


  0%|          | 0/1000 [00:00<?, ?it/s]

Training...


  0%|          | 2/1000 [00:05<47:47,  2.87s/it]



Highest accuracy of 0.2976999282836914 occured at 1...
Minimum loss occured at 1... 
Maximum QWK metric of 0.03216612711548805 occured at 2


  0%|          | 0/1000 [00:00<?, ?it/s]

Training...


  0%|          | 2/1000 [00:05<45:49,  2.76s/it]



Highest accuracy of 0.39671987295150757 occured at 2...
Minimum loss occured at 2... 
Maximum QWK metric of 0.33799639344215393 occured at 2


  0%|          | 0/1000 [00:00<?, ?it/s]

Training...


  0%|          | 2/1000 [00:05<45:08,  2.71s/it]


Highest accuracy of 0.2771621346473694 occured at 1...
Minimum loss occured at 2... 
Maximum QWK metric of 0.01687517948448658 occured at 1





In [15]:
import pandas as pd
import json
import os

dir = "data/train_sentiment"

for file in os.listdir(dir):
    petid = file[:file.index(".")]
    with open(dir+"/"+file) as f:
        data = json.load(f)
        print(data[documentSentiment])

{'sentences': [{'text': {'content': 'The dog has completed all vaccinations.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.2, 'score': 0.2}}, {'text': {'content': 'Healthy but not toilet trained.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.2, 'score': -0.2}}, {'text': {'content': 'We are looking for serious owners and will not charge an fee.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0, 'score': 0}}, {'text': {'content': 'Pick up by the owner.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0, 'score': 0}}], 'tokens': [], 'entities': [{'name': 'dog', 'type': 'OTHER', 'metadata': {}, 'salience': 0.5227934, 'mentions': [{'text': {'content': 'dog', 'beginOffset': -1}, 'type': 'COMMON'}]}, {'name': 'vaccinations', 'type': 'OTHER', 'metadata': {}, 'salience': 0.24842142, 'mentions': [{'text': {'content': 'vaccinations', 'beginOffset': -1}, 'type': 'COMMON'}]}, {'name': 'owners', 'type': 'PERSON', 'metadata': {}, 'salience': 0.08223261, 'mentions': [{'text': {'content': 'o

{'sentences': [{'text': {'content': 'Chewy is an adorable and cuddly 5mths old puppy.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.9, 'score': 0.9}}, {'text': {'content': 'fantastic companion to go for a jog, desire to be around with human beings and loves to drink water and chew bones.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.9, 'score': 0.9}}, {'text': {'content': 'it was given to me and my husband few mths ago, but unfortunately we are no longer allow to keep him chewy has been a good dog and good companion to us we hope to find him a good safe place (prefer owner with experience and a dog lover)', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.7, 'score': 0.7}}], 'tokens': [], 'entities': [{'name': 'puppy', 'type': 'OTHER', 'metadata': {}, 'salience': 0.2867762, 'mentions': [{'text': {'content': 'puppy', 'beginOffset': -1}, 'type': 'COMMON'}]}, {'name': 'husband', 'type': 'PERSON', 'metadata': {}, 'salience': 0.22441743, 'mentions': [{'text': {'content': 'husband'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'sentences': [{'text': {'content': 'All of them are born in my house.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0, 'score': 0}}, {'text': {'content': 'We brought in their grandma when we moved from Pandan Jaya.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0, 'score': 0}}, {'text': {'content': 'They never get out from my house and only mixed among themselves.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.3, 'score': -0.3}}, {'text': {'content': 'They eat whiskas.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0, 'score': 0}}, {'text': {'content': 'The female cat is going to deliver her kittens soon.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.1, 'score': 0.1}}, {'text': {'content': 'Two of my children including me are now suffering from bronchial asthma and the doctor advised not to keep anymore cats at home.', 'beginOffset': -1}, 'sentiment': {'magnitude': 0.1, 'score': -0.1}}, {'text': {'content': "Thus I'm now looking for the new owner for my pets.", 'beginO

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

