In [3]:
import torch.nn as nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np 
import torch.optim as optim
import torch 
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import gc
import torch.utils.data as D
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [4]:
class Lstm(nn.Module):
    def __init__(self, act_function, vocab_size, output_dim, embedding_dim, hidden_dim, n_layers, cuda, batch_size, drop_prob=0.5):
        super(Lstm, self).__init__()
        self.device = torch.device("cuda" if cuda else "cpu")
        self.act_function = act_function
        self.cuda = cuda
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        
        # initiate layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers)#, batch_first=True
        self.lin1 = nn.Linear(hidden_dim, hidden_dim) 
        self.lin2 = nn.Linear(hidden_dim, output_dim)

    def init_hidden(self, batch_size):
        #if self.cuda:
        #    return (torch.randn(self.n_layers, batch_size, self.hidden_dim).cuda(), torch.randn(self.n_layers, batch_size, self.hidden_dim).cuda())
        #else:
        #    return (torch.randn(self.n_layers, batch_size, self.hidden_dim), torch.randn(self.n_layers, batch_size, self.hidden_dim))
        return (torch.randn(self.n_layers, self.embedding_dim, self.hidden_dim), torch.randn(self.n_layers, self.embedding_dim, self.hidden_dim))
        
    def forward(self, x, hidden):
        print(x.shape)
        x = x.long() 
        x = self.embedding(x) 
        x, hidden = self.lstm(x, hidden)
        x = self.act_function(self.lin1(x[:,-1,:]))
        x = self.act_function(self.lin2(x))
        return x, hidden

In [5]:
class MyDataset(D.Dataset):
    def __init__(self, x_tensor, y_tensor):
        self.x = torch.from_numpy(x_tensor)
        self.y = torch.from_numpy(y_tensor)
        
    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)

In [6]:
def log(summary, file):
    log = open(file, "a")
    log.write(summary)
    log.close()
    print(summary)

In [19]:
def make_data(datasets, batch_size, debug): 
    print("loading datasets")
    datasets_list = []
    pos_datasets = []
    target = []

    # load all tweet datasets and merge them into one
    for f in datasets:
        datasets_list.append(pd.read_csv("../cleaned/" + f + "_clean.csv"))
        pos_datasets.append(pd.read_csv("../cleaned/" + f + "_pos.csv"))
    dataset = pd.concat(datasets_list, axis=0, ignore_index=True, sort=False)
    target = dataset["a"]
    pos_dataset = pd.concat(pos_datasets, axis=0, ignore_index=True, sort=False)
    print("jh", np.asarray(pos_dataset.values.tolist()).shape)
    # split data into test and training data and return
    train_x, test_x, train_y, test_y = train_test_split(pos_dataset, target, test_size=0.2)
    train_data = MyDataset(train_x.to_numpy(), train_y.to_numpy())
    test_data = MyDataset(test_x.to_numpy(), test_y.to_numpy())
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size)
    test_loader = DataLoader(dataset=test_data, batch_size=1)
    
    return train_loader, test_loader 

In [8]:
def train(train_laoder, net, epochs, criterion, print_every, save_name, cuda, lr, batch_size, clip):
    #open("../logs/" + save_name + "_train", "w").close()
    #optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.5)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    error_curve = []
    net.train()
    hidden = net.init_hidden(batch_size)
    for epoch in range(epochs): 
        for index, (inputs, targets) in enumerate(train_loader): 
            hidden = tuple([item.data for item in hidden])
            net.zero_grad()
            output, hidden = net(inputs, hidden)
            if(cuda):
                ouput = output.to("cuda")
                targets = targets.to("cuda")
            loss = criterion(output.float(), targets)
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()
            gc.collect()
        #if (epoch % print_every == 0):
        #    log("epoch {}/{} \n... loss: {}\n".format((epoch), epochs, loss.item()), 
        #        "../logs/" + save_name + "_train")
        #    torch.save(net.state_dict(), "nets/" + save_name + str(epoch) + ".pt")  
        #    error_curve.append([epoch, loss.item()])
    #log("\n" + str(error_curve), "../logs/" + save_name + "_train")
    #plt.clf()
    #plt.plot([item[0] for item in error_curve], [item[1] for item in error_curve])
    #plt.ylabel('loss')
    #plt.xlabel('epochs')
    #plt.savefig("../img/" + save_name+"_train_error.png")

    

In [13]:
# create variables 
print("creating variables")
tweet_dataset = ["norm_test"]
act_function = torch.sigmoid
criterion = nn.CrossEntropyLoss()
cuda = torch.cuda.is_available()
batch_size = 25
vocab_size = 19
embedding_dim_emotion = 179
embedding_dim_tweet = 85
num_layers = 5
hidden_dim = 256
output_dim = 4
epochs = 101
print_every = 100
lr = 0.1
clip = 5

creating variables


In [20]:
# tweet dataset debug training
net = Lstm(act_function, vocab_size, output_dim, embedding_dim_tweet, hidden_dim, num_layers, cuda, batch_size)
train_loader, test_loader = make_data(tweet_dataset, batch_size, debug=True)
train(train_loader, net, epochs, criterion, print_every, "rnn_debug", cuda, lr, batch_size, clip)

loading datasets
jh (5903, 81)
torch.Size([25, 81])


RuntimeError: index out of range: Tried to access index 92 out of table with 18 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:418