In [1]:
import torch.nn as nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np 
import torch.optim as optim
import torch 
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import gc
import torch.utils.data as D

In [36]:
class Lstm(nn.Module):
    def __init__(self, act_function, vocab_size, output_dim, embedding_dim, hidden_dim, n_layers, cuda, batch_size, drop_prob=0.5):
        super(Lstm, self).__init__()
        self.device = torch.device("cuda" if cuda else "cpu")
        self.act_function = act_function
        self.cuda = cuda
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        # initiate layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers)#, batch_first=True
        self.lin1 = nn.Linear(hidden_dim, hidden_dim) 
        self.lin2 = nn.Linear(hidden_dim, output_dim)

    def init_hidden(self, batch_size):
        #if self.cuda:
        #    return (torch.randn(self.n_layers, batch_size, self.hidden_dim).cuda(), torch.randn(self.n_layers, batch_size, self.hidden_dim).cuda())
        #else:
        #    return (torch.randn(self.n_layers, batch_size, self.hidden_dim), torch.randn(self.n_layers, batch_size, self.hidden_dim))
        return (torch.randn(self.n_layers, self.embedding_dim, self.hidden_dim), torch.randn(self.n_layers, self.embedding_dim, self.hidden_dim))
        
    def forward(self, x, hidden):
        x = x.long() 
        x = self.embedding(x) 
        x, hidden = self.lstm(x, hidden)
        x = self.act_function(self.lin1(x[:,-1,:]))
        x = self.act_function(self.lin2(x))
        return x, hidden

In [4]:
class MyDataset(D.Dataset):
    def __init__(self, x_tensor, y_tensor):
        self.x = torch.from_numpy(x_tensor)
        self.y = torch.from_numpy(y_tensor)
        
    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)

In [5]:
def log(summary, file):
    log = open(file, "a")
    log.write(summary)
    log.close()
    print(summary)

In [21]:
def make_data(datasets, batch_size, debug): 
    print("loading datasets")
    datasets_list = []
    pos_datasets = []
    target = []

    # load all tweet datasets and merge them into one
    for f in datasets:
        datasets_list.append(pd.read_csv("../" + f + "_clean.csv"))
        pos_datasets.append(pd.read_csv("../" + f + "_pos.csv"))
    dataset = pd.concat(datasets_list, axis=0, ignore_index=True, sort=False)
    target = dataset["affect"]
    pos_dataset = pd.concat(pos_datasets, axis=0, ignore_index=True, sort=False)

    # split data into test and training data and return
    train_x, test_x, train_y, test_y = train_test_split(pos_dataset, target, test_size=0.2)
    train_data = MyDataset(train_x.to_numpy(), train_y.to_numpy())
    test_data = MyDataset(test_x.to_numpy(), test_y.to_numpy())
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size)
    test_loader = DataLoader(dataset=test_data, batch_size=1)
    
    if debug: 
        pos_dataset = pos_dataset.iloc[:10]
        target = target[:10]
        train_x, test_x, train_y, test_y = train_test_split(pos_dataset, target, test_size=0.8)
        train_data = MyDataset(train_x.to_numpy(), train_y.to_numpy())
        train_loader = DataLoader(dataset=train_data, batch_size=batch_size)
        test_loader = DataLoader(dataset=train_data, batch_size=1)
        
    return train_loader, test_loader 

In [90]:
def train(train_laoder, net, epochs, criterion, print_every, save_name, cuda, lr, batch_size, clip):
    open(save_name + "_train_log", "w").close()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.5)
    error_curve = []
    net.train()
    hidden = net.init_hidden(batch_size)
    sum_loss = 0
    for epoch in range(epochs): 
        for index, (inputs, targets) in enumerate(train_loader): 
            hidden = tuple([item.data for item in hidden])
            net.zero_grad()
            # das lstm hat 5 schichten. hidden ist daher ein 5-Tupel. Da werden die netzwerkparameter (gewichte)
            # der letzten 5 eingaben gespeichert
            output, hidden = net(inputs, hidden)
            if(cuda):
                ouput = output.to("cuda")
                targets = targets.to("cuda")
            loss = criterion(output.float(), targets)
            
            if index % print_every == 0:
                log("batch: {}/{} in epoch {}/{} \n... loss: {}\n".
                    format((index+1), len(train_loader), (epoch+1), epochs, loss.item()), 
                    save_name + "_train_log")
            # retain_graph ist nötig, weil er sonst nach jedem durchgang im batch die rückpropagation löscht
            loss.backward(retain_graph=True)
            sum_loss += loss.data
            # prevents exploding gradient in lstm 
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()
            gc.collect()
        # save network after every epoch
        torch.save(net.state_dict(), save_name + ".pt")  
        # after every epoch save the error
        error_curve.append([epoch, loss.item()])
    log("\n" + str(error_curve), save_name + "_train_log")
    return error_curve

    print("Average train loss: ", (sum_loss/runs)) 

In [89]:
# create variables 
print("creating variables")
tweet_dataset = ["crowdflower"]
act_function = torch.sigmoid
criterion = nn.CrossEntropyLoss()
cuda = torch.cuda.is_available()
batch_size = 25
vocab_size = 19
embedding_dim_emotion = 179
embedding_dim_tweet = 85
num_layers = 5
hidden_dim = 256
output_dim = 4
epochs = 10
print_every = 100
lr = 0.1
clip = 5

# tweet dataset debug training
train_loader, test_loader = make_data(tweet_dataset, batch_size, debug=True)
net = Lstm(act_function, vocab_size, output_dim, embedding_dim_tweet, hidden_dim, num_layers, cuda, batch_size)
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
train(train_loader, net, epochs, criterion, print_every, "../logs/debug_rnn", cuda, lr, batch_size, clip)


creating variables
loading datasets
torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 1/10 
... loss: 1.3885366916656494

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 2/10 
... loss: 1.1394736766815186

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 3/10 
... loss: 0.966407299041748

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 4/10 
... loss: 0.8773331642150879

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 5/10 
... loss: 0.8293141722679138

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 6/10 
... loss: 0.8034317493438721

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 7/10 
... loss: 0.7889198064804077

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 8/10 
... loss: 0.7802171111106873

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
batch: 1/1 in epoch 9/10 
... loss: 0.7746008038520813

torch.Size([2, 4])
5
torch.Size([5, 85, 256])
b

[[0, 1.3885366916656494],
 [1, 1.1394736766815186],
 [2, 0.966407299041748],
 [3, 0.8773331642150879],
 [4, 0.8293141722679138],
 [5, 0.8034317493438721],
 [6, 0.7889198064804077],
 [7, 0.7802171111106873],
 [8, 0.7746008038520813],
 [9, 0.7707176208496094]]

In [79]:
# fills all nan values with 18
def make_data2(f): 
    dataset = pd.read_csv("../" + f + "_pos.csv")
    #dataset = dataset.fillna(18)
    #dataset.to_csv("../" + f + "_pos.csv", sep=",", float_format='%.3f')
    print(len(dataset.columns))
make_data2("crowdflower")

85
