# In [None]:
import os, sys, time, random, torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm
import hyp
import models
import state_data as aug
import preprocess_data as prep

def store_result(store_epoch_acc_val, store_epoch_loss_val, store_qwk_epoch_loss_val, HIDDEN, ONE_HOT, DATA_AUG, training_dir):
    """Summarise a training run and delete all but the best checkpoints.

    Finds the epoch (1-based position in the lists) with the highest
    validation accuracy, the lowest validation loss and the highest
    validation QWK, prints them, writes them together with the
    hyper-parameters to ``<training_dir>/HYP.txt`` and removes every
    ``checkpoint_<epoch>.pth`` in ``training_dir`` whose epoch is none of
    those three.

    Args:
        store_epoch_acc_val: Per-epoch validation accuracy, one entry per
            finished epoch.
        store_epoch_loss_val: Per-epoch validation loss.
        store_qwk_epoch_loss_val: Per-epoch validation QWK score.
        HIDDEN: Hidden-layer description, written verbatim to ``HYP.txt``.
        ONE_HOT: One-hot flag, written verbatim to ``HYP.txt``.
        DATA_AUG: Data-augmentation flag, written verbatim to ``HYP.txt``.
        training_dir: Directory holding the checkpoints of this run.

    Returns:
        None. When any of the metric lists is empty (for example training was
        interrupted before the first epoch finished) nothing is written or
        deleted.
    """
    if not (store_epoch_acc_val and store_epoch_loss_val and store_qwk_epoch_loss_val):
        # max()/min() of an empty list would raise; there is nothing to rank yet.
        print("\nNo completed epochs to summarise; leaving {} untouched.".format(training_dir))
        return

    most_acc = max(store_epoch_acc_val)
    min_loss = min(store_epoch_loss_val)
    qwk_max_loss = max(store_qwk_epoch_loss_val)
    # list.index() returns the first occurrence, so ties resolve to the earliest epoch.
    acc_epoch = store_epoch_acc_val.index(most_acc) + 1
    loss_epoch = store_epoch_loss_val.index(min_loss) + 1
    qwk_epoch = store_qwk_epoch_loss_val.index(qwk_max_loss) + 1

    print("\nHighest accuracy of {} occured at {}...\nMinimum loss occured at {}... \nMaximum QWK metric of {} occured at {}".format(
        most_acc, acc_epoch,
        loss_epoch,
        qwk_max_loss, qwk_epoch))

    with open(os.path.join(training_dir, "HYP.txt"), "w") as f:
        f.write("EPOCH = {} \n".format(hyp.EPOCHS))
        f.write("LR = {} \n".format(hyp.LR))
        f.write("HIDDEN_LAYERS = {} \n".format(HIDDEN))
        f.write("ONE_HOT = {} \n".format(ONE_HOT))
        f.write("DATA_AUG = {} \n".format(DATA_AUG))
        f.write("Highest accuracy of {} occured at {}...\nMinimum loss of {} occured at {}... \nMaximum QWK metric of {} occured at {}".format(
            most_acc, acc_epoch,
            min_loss, loss_epoch,
            qwk_max_loss, qwk_epoch))

    best_epochs = {acc_epoch, loss_epoch, qwk_epoch}
    for checkpoint in os.listdir(training_dir):
        if "checkpoint" not in checkpoint:
            continue
        # File names look like "checkpoint_<epoch>.pth".
        checkpoint_num = int(checkpoint[checkpoint.index("_")+1:checkpoint.index(".")])
        if checkpoint_num not in best_epochs:
            os.remove(os.path.join(training_dir, checkpoint))

def train(model, HIDDEN, ONE_HOT, DATA_AUG, NUM_FOLDS, data_train_loader, data_val_loader):
    """Train ``model`` for ``hyp.EPOCHS`` epochs, validating after every epoch.

    A fresh ``./training_<unix time>_<ONE_HOT>+<DATA_AUG>+<len(HIDDEN)>+<max(HIDDEN)>``
    directory is created. After each epoch the model weights are saved there
    as ``checkpoint_<epoch>.pth``, the validation set is evaluated and
    ``Loss.png`` is redrawn. When training ends, or is stopped with Ctrl-C,
    ``store_result`` summarises the run and prunes the checkpoints.

    Args:
        model: Network to optimise; inputs and targets are moved to the GPU
            with ``.cuda()`` before use.
        HIDDEN: Hidden-layer sizes; used for the directory name and passed on
            to ``store_result``.
        ONE_HOT: One-hot flag; used for the directory name and the summary.
        DATA_AUG: Augmentation flag; used for the directory name and the summary.
        NUM_FOLDS: Accepted for interface compatibility; not used here.
        data_train_loader: Iterable of ``(X, y)`` training batches.
        data_val_loader: Iterable of ``(X, y)`` validation batches.
    """
    print("Training...")
    training_dir = './training_{}_{}+{}+{}+{}'.format(int(time.time()), ONE_HOT, DATA_AUG, len(HIDDEN), max(HIDDEN))
    os.mkdir(training_dir)
    os.mkdir(training_dir+'/misclassified')
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=hyp.LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

    qwk_loss = cohen_kappa_score
    ce_loss = nn.CrossEntropyLoss().cuda()
    store_epoch_loss = []
    store_qwk_epoch_loss = []
    store_epoch_loss_val = []
    store_qwk_epoch_loss_val = []
    store_epoch_acc_val = []
    try:
        for e in tqdm(range(hyp.EPOCHS)):
            epoch = e + 1

            # ---- training pass ----
            store_batch_loss = []
            store_qwk_batch_loss = []
            for X, y in data_train_loader:
                optimizer.zero_grad()
                # Call the module, not .forward(), so registered hooks run.
                prediction = model(X.cuda())
                # The loss lives on the GPU, so the targets must as well.
                batch_loss = ce_loss(prediction, y.cuda())
                batch_loss.backward()
                optimizer.step()
                qwk_batch_loss = qwk_loss(y.detach().cpu().numpy(),
                                          np.argmax(prediction.detach().cpu().numpy(), axis=1),
                                          weights="quadratic")
                # Keep plain floats: a stored loss tensor would keep a
                # reference to its autograd graph for the whole epoch.
                store_batch_loss.append(batch_loss.item())
                store_qwk_batch_loss.append(qwk_batch_loss)

            # Mean over batches, computed once per epoch; 0 if the loader was empty.
            epoch_loss = torch.FloatTensor(store_batch_loss).mean() if store_batch_loss else 0
            qwk_epoch_loss = torch.FloatTensor(store_qwk_batch_loss).mean() if store_qwk_batch_loss else 0
            store_epoch_loss.append(epoch_loss)
            store_qwk_epoch_loss.append(qwk_epoch_loss)
            torch.save(model.state_dict(), "{}/checkpoint_{}.pth".format(training_dir, epoch))

            # ---- validation pass ----
            model.eval()
            store_batch_loss_val = []
            store_qwk_batch_loss_val = []
            store_batch_err_val = []
            misclassified_images = []
            with torch.no_grad():
                for X, y in data_val_loader:
                    prediction = model(X.cuda())
                    batch_loss = ce_loss(prediction, y.cuda())
                    qwk_batch_loss = qwk_loss(y.detach().cpu().numpy(),
                                              np.argmax(prediction.cpu().numpy(), axis=1),
                                              weights="quadratic")
                    # True where the predicted class differs from the target.
                    misclassified = prediction.max(-1)[-1].squeeze().cpu() != y.cpu()
                    misclassified_images.append(X[misclassified==1].cpu())
                    store_batch_loss_val.append(batch_loss.item())
                    store_qwk_batch_loss_val.append(qwk_batch_loss)
                    # Fraction of wrong predictions in this batch (an error rate).
                    store_batch_err_val.append(misclassified.float().mean().item())

            epoch_loss_val = torch.FloatTensor(store_batch_loss_val).mean() if store_batch_loss_val else 0
            qwk_epoch_loss_val = torch.FloatTensor(store_qwk_batch_loss_val).mean() if store_qwk_batch_loss_val else 0
            epoch_err_val = torch.FloatTensor(store_batch_err_val).mean() if store_batch_err_val else 0
            store_epoch_loss_val.append(epoch_loss_val)
            store_qwk_epoch_loss_val.append(qwk_epoch_loss_val)
            # Accuracy is one minus the mean of the per-batch error rates.
            store_epoch_acc_val.append(1-epoch_err_val)

            # The first epoch is left out of the plot ([1:]).
            plt.plot(store_epoch_loss_val[1:], label="Validation Loss")
            plt.plot(store_qwk_epoch_loss_val[1:], label="Validation Metric(QWK)")
            plt.plot(store_epoch_acc_val[1:], label="Validation Accuracy")
            plt.legend()
            plt.grid()
            plt.savefig("{}/Loss.png".format(training_dir))
            plt.close()

            if misclassified_images:
                # NOTE(review): the misclassified samples are gathered and their
                # directory is created, but nothing is written into it yet.
                misclassified_images = np.concatenate(misclassified_images, axis=0)
                validation_dir = training_dir+'/misclassified/checkpoint_{}'.format(epoch)
                os.mkdir(validation_dir)
            model.train()
    except KeyboardInterrupt:
        # Ctrl-C ends training early; the finished epochs are still summarised below.
        pass
    store_result(store_epoch_acc_val, store_epoch_loss_val, store_qwk_epoch_loss_val, HIDDEN, ONE_HOT, DATA_AUG, training_dir)

if __name__ == "__main__":
    # Exhaustive sweep: every encoding / augmentation / architecture / fold
    # setting gets freshly prepared data and a freshly built model.
    for ONE_HOT in (0, 1):  # one-hot MaturitySize, FurLength, Health or not
        for DATA_AUG in (0, 1):  # augment with per-state data or not
            for HIDDEN_LIST in hyp.HIDDEN_LIST:
                for NUM_FOLDS in (5, 10):
                    data_train_loader, data_val_loader = prep.preprocess_data(ONE_HOT, DATA_AUG)
                    train(
                        model=models.Model(HIDDEN_LIST, ONE_HOT, DATA_AUG).cuda(),
                        HIDDEN=HIDDEN_LIST,
                        ONE_HOT=ONE_HOT,
                        DATA_AUG=DATA_AUG,
                        NUM_FOLDS=NUM_FOLDS,
                        data_train_loader=data_train_loader,
                        data_val_loader=data_val_loader,
                    )

# In [None]:
import pandas as pd
import json
import os
import numpy as np
import torch, random, models
import torch.utils.data as data
import state_data as aug

def soft_binary_columns(values):
    """Encode a column of 1/2 codes as two float columns.

    Code 1 becomes ``[0., 1.]``, code 2 becomes ``[1., 0.]`` and every other
    code stays at the neutral ``[0.5, 0.5]``.

    Args:
        values: 1-D sequence of numeric codes.

    Returns:
        ``numpy`` float array of shape ``(len(values), 2)``.
    """
    codes = np.asarray(values).astype(int)
    encoded = np.full((len(codes), 2), 0.5)
    encoded[codes == 1] = [0., 1.]
    encoded[codes == 2] = [1., 0.]
    return encoded


def one_hot_columns(codes, drop_first=True):
    """One-hot encode a 1-D sequence of non-negative integer codes.

    Args:
        codes: 1-D tensor (or sequence) of codes; values are truncated to int64.
        drop_first: When true, the column for code 0 is removed, so code 0
            encodes as an all-zero row.

    Returns:
        Float tensor with ``max(codes) + 1`` columns (one fewer when
        ``drop_first`` is set).
    """
    # scatter_ requires an int64 index, so convert before building it.
    codes = torch.as_tensor(codes).long()
    encoded = torch.zeros(len(codes), int(codes.max()) + 1).scatter_(1, codes.unsqueeze(1), 1.)
    return encoded[:, 1:] if drop_first else encoded


def load_sentiment(sentiment_dir, pet_ids):
    """Read one sentiment value per pet from a directory of JSON files.

    Each file is named ``<PetID>.<ext>`` and holds a ``documentSentiment``
    object with ``magnitude`` and ``score``; the stored value is their product.

    Args:
        sentiment_dir: Directory containing the JSON files.
        pet_ids: Sequence of pet ids, in row order.

    Returns:
        ``numpy`` float array of shape ``(len(pet_ids), 1)``; rows without a
        file stay 0.
    """
    rows_by_pet = {}
    for row, pet in enumerate(pet_ids):
        rows_by_pet.setdefault(pet, []).append(row)

    sentiment = np.zeros((len(pet_ids), 1))
    for file in os.listdir(sentiment_dir):
        petid = file.split(".", 1)[0]
        rows = rows_by_pet.get(petid)
        if not rows:
            continue
        with open(sentiment_dir + "/" + file) as f:
            document = json.load(f)
        mag = document["documentSentiment"]["magnitude"]
        score = document["documentSentiment"]["score"]
        sentiment[rows] = mag * score
    return sentiment


def shuffle_rows(rows):
    """Return ``rows`` with its first dimension in random order.

    ``random.shuffle`` must not be applied to a tensor directly: it swaps row
    *views*, which overwrites rows instead of exchanging them. A shuffled
    index list is used instead, drawn from Python's ``random`` module.
    """
    order = list(range(len(rows)))
    random.shuffle(order)
    return rows[torch.tensor(order, dtype=torch.long, device=rows.device)]


ONE_HOT = 0
DATA_AUG = 1
SENTIMENT_DIR = "data/train_sentiment"

# Guarded so the helpers above can be imported without reading the dataset or
# touching the GPU; inside a notebook __name__ is "__main__" and this runs.
if __name__ == "__main__":
    df = pd.read_csv('data/train.csv')
    nPetID = df['PetID']
    df = df.drop(['Name', 'RescuerID', 'PetID', 'Description'], axis=1)
    d = torch.FloatTensor(df.values)

    # Column index in `d` after the drop above (x = dropped):
    # Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,
    #    0,   x,  1,     2,     3,     4,     5,     6,     7,           8,
    # FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,RescuerID,
    #         9,        10,      11,        12,    13,      14, 15,   16,        x,
    # VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
    #       17,          x,    x,      18,           19

    nType = soft_binary_columns(df['Type'].values)
    nBreed1 = one_hot_columns(d[:, 2])
    nBreed2 = one_hot_columns(d[:, 3])
    nGender = soft_binary_columns(df['Gender'].values)
    nColor1 = one_hot_columns(d[:, 5])
    nColor2 = one_hot_columns(d[:, 6])
    nColor3 = one_hot_columns(d[:, 7])
    nVaccinated = soft_binary_columns(df['Vaccinated'].values)
    nDewormed = soft_binary_columns(df['Dewormed'].values)
    nSterilized = soft_binary_columns(df['Sterilized'].values)

    # State codes do not start near 0, so shift them to 0..range before
    # encoding and keep every column.
    idx = d[:, 16]
    nState = one_hot_columns(idx - idx.min(), drop_first=False)

    # Sentiment analysis: 0 when no textual description is available.
    # NOTE(review): nSentiment is not concatenated into the feature tensor
    # below; adding it changes the feature width, so confirm the input size
    # models.Model expects first.
    nSentiment = load_sentiment(SENTIMENT_DIR, nPetID)

    if DATA_AUG:
        # Replace the one-hot state by three numeric state attributes.
        state_data = {}
        for k, v in aug.state_gdp.items():
            state_data[k] = np.array([v, aug.state_population[k], aug.state_area[k]]).astype(float)

        nState = np.zeros((d.size(0), 3))
        for k, v in state_data.items():
            nState[df['State'].values.astype(int) == k] = v

    # NOTE(review): with ONE_HOT the raw Health column (13) is kept next to
    # its one-hot encoding, without ONE_HOT it is absent (slice starts at 14).
    # Left as is because it fixes the feature width; confirm the intent.
    if ONE_HOT:
        nMaturitySize = one_hot_columns(d[:, 8])
        nFurLength = one_hot_columns(d[:, 9])
        nHealth = one_hot_columns(d[:, 13])
        size_and_fur = [nMaturitySize, nFurLength]
        health_to_fee = [nHealth, d[:, 13:16]]
    else:
        size_and_fur = [d[:, 8:10]]
        health_to_fee = [d[:, 14:16]]

    feature_blocks = [
        nType,
        d[:, 1:2],
        nBreed1,
        nBreed2,
        nGender,
        nColor1,
        nColor2,
        nColor3,
        *size_and_fur,
        nVaccinated,
        nDewormed,
        nSterilized,
        *health_to_fee,
        nState,
        d[:, 17:],
    ]
    d = torch.cat([torch.as_tensor(block, dtype=torch.float32) for block in feature_blocks], dim=1).cuda()

    # First tenth of the shuffled rows is held out for validation.
    d = shuffle_rows(d)
    partition = {}
    validation = d[:len(d)//10]
    partition['validation'] = models.CSVDataset(validation)
    train_set = d[len(d)//10:]
    partition['train'] = models.CSVDataset(train_set)
    # These two loaders are the result of this cell; they stay bound at module level.
    data_train_loader = data.DataLoader(partition['train'], shuffle=True, batch_size=32)
    data_val_loader = data.DataLoader(partition['validation'], batch_size=32)