In [77]:
import torch.nn as nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np 
import torch.optim as optim
import torch 
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import gc
import torch.utils.data as D
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from gensim.models import FastText
from sklearn.metrics import classification_report
from joblib import dump, load
from nltk.tokenize import sent_tokenize, word_tokenize 

In [56]:
class Lstm(nn.Module):
    """Sequence classifier: stacked LSTM followed by two linear layers.

    Inputs are batch-first, i.e. shaped (batch, seq_len, input_dim). Only the
    LSTM output of the last time step is fed to the linear head.

    Args:
        input_dim: size of the feature vector at each time step.
        output_dim: number of output units (classes).
        hidden_dim: size of the LSTM hidden state and of the first linear layer.
        n_layers: number of stacked LSTM layers.
        drop_prob: accepted for interface compatibility; currently unused.
    """

    def __init__(self, input_dim, output_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(Lstm, self).__init__()
        self.act_function = nn.ReLU()
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # batch_first=True is required: forward() picks the last time step with
        # x[:, -1, :], which only means "last step" for (batch, seq, feature).
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.n_layers, batch_first=True)
        self.lin1 = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.lin2 = nn.Linear(self.hidden_dim, output_dim)

    def init_hidden(self, batch_size, cuda):
        """Create a random initial (hidden_state, cell_state) tuple.

        Both tensors are shaped (n_layers, batch_size, hidden_dim), which is
        what nn.LSTM expects regardless of batch_first.

        Args:
            batch_size: number of sequences in the batch the state is for.
            cuda: if truthy, both tensors are moved to the GPU.
        """
        shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden_state = torch.randn(*shape)
        cell_state = torch.randn(*shape)
        if cuda:
            # Move each tensor individually; a tuple has no .cuda() method.
            hidden_state, cell_state = hidden_state.cuda(), cell_state.cuda()
        return (hidden_state, cell_state)

    def forward(self, x, hidden):
        """Run the batch through the network.

        Args:
            x: tensor of shape (batch, seq_len, input_dim).
            hidden: (hidden_state, cell_state) tuple as built by init_hidden().

        Returns:
            (output, hidden): output has shape (batch, output_dim); hidden is
            the LSTM state after the last time step.
        """
        # nn.LSTM weights are floating point, so the input must be too.
        x = x.float()
        x, hidden = self.lstm(x, hidden)
        # Keep only the last time step of every sequence.
        x = self.act_function(self.lin1(x[:, -1, :]))
        # NOTE(review): ReLU on the final layer clamps negative logits to 0;
        # if this feeds CrossEntropyLoss, consider returning raw logits.
        x = self.act_function(self.lin2(x))
        return x, hidden
    
class MyDataset(D.Dataset):
    """Map-style dataset pairing each input row with its target.

    Both constructor arguments are numpy arrays; they are wrapped as torch
    tensors via torch.from_numpy and indexed in parallel.
    """

    def __init__(self, x_tensor, y_tensor):
        self.x, self.y = (torch.from_numpy(array) for array in (x_tensor, y_tensor))

    def __len__(self):
        # The number of samples is the length of the input tensor.
        return len(self.x)

    def __getitem__(self, index):
        sample = self.x[index]
        label = self.y[index]
        return sample, label

In [121]:
def load_vector_data(dataset_name, bgr=False, split_factor=0.2):
    """Load a stemmed dataset and embed every token with its FastText model.

    Reads "../cleaned/<dataset_name>_stems.csv" (one sentence per row, one token
    per column) and the target column "a" of "../cleaned/<dataset_name>_clean.csv",
    then looks up each token in the FastText model stored at
    "../models/word_embeddings/<dataset_name>_fasttext".

    Args:
        dataset_name: file-name stem of the dataset and its models.
        bgr: if True, merge tokens with the saved bigram Phraser before embedding.
        split_factor: test_size passed on to make_loader.

    Returns:
        (input_dim, train_loader, val_loader, test_loader) where input_dim is
        the embedding size, i.e. the feature dimension of each time step.
    """
    print("loading vector data for", dataset_name)
    # All rows are loaded so that inputs and targets have the same length
    # (a leftover debug slice used to keep only the first two sentences).
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv", delimiter=",").astype(str).values.tolist()
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv")["a"]
    vector_model = FastText.load("../models/word_embeddings/" + dataset_name + "_fasttext")
    if bgr:
        # Local import: Phraser is not part of the notebook's import cell.
        from gensim.models.phrases import Phraser
        bigram = Phraser.load("../models/bigrams/bigram_" + dataset_name + ".pkl")
        # Actually use the bigrammed tokens as the sentences to embed.
        sentences = [list(bigram[sentence]) for sentence in sentences]
    print("tokenizing")
    word_vectors = vector_model.wv
    embedding_dim = word_vectors.vector_size
    # Bigram merging can shorten sentences, so pad with zero vectors at the end
    # to give every sequence the same length.
    max_len = max((len(sentence) for sentence in sentences), default=0)
    inputs = np.zeros((len(sentences), max_len, embedding_dim), dtype=np.float32)
    for row, sentence in enumerate(sentences):
        for col, token in enumerate(sentence):
            inputs[row, col] = word_vectors[token]
    print(inputs.shape)
    train_loader, val_loader, test_loader = make_loader(inputs, targets, split_factor)
    # The LSTM's input dimension is the embedding size, not the sequence length.
    return embedding_dim, train_loader, val_loader, test_loader

def load_topic_data(dataset_name, split_factor=0.2):
    """Load a cleaned dataset and represent each sample by its LDA topic vector.

    Reads "../cleaned/<dataset_name>_clean.csv" (targets come from column "a"),
    the gensim dictionary at "../models/dictionary/<dataset_name>_dictionary"
    and the LDA model at "../models/topic_models/<dataset_name>_ldamodel".

    Args:
        dataset_name: file-name stem of the dataset and its models.
        split_factor: test_size passed on to make_loader.

    Returns:
        (topics_num, train_loader, val_loader, test_loader) where topics_num is
        the number of topics of the LDA model.
    """
    # Local imports: these gensim names are not part of the notebook's import cell.
    from gensim.corpora import Dictionary
    from gensim.models import LdaMulticore

    print("loading lex data", dataset_name, "topics")
    dataset = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv")
    targets = dataset["a"]
    rows = dataset.astype(str).values.tolist()
    dic = Dictionary.load("../models/dictionary/" + dataset_name + "_dictionary")
    lda_model = LdaMulticore.load("../models/topic_models/" + dataset_name + "_ldamodel")
    print("../models/topic_models/" + dataset_name + "_ldamodel")
    topics_num = lda_model.num_topics
    inputs = []
    for row in rows:
        # Drop the " " placeholder tokens before building the bag of words.
        tokens = [token for token in row if token != " "]
        bow = dic.doc2bow(tokens)
        # Fill by topic id instead of by position, so the vector stays aligned
        # even if the model omits some topics from its answer.
        topic_vec = np.zeros(topics_num, dtype=np.float32)
        for topic_id, probability in lda_model.get_document_topics(bow, minimum_probability=0.0):
            topic_vec[topic_id] = probability
        inputs.append(topic_vec)
    train_loader, val_loader, test_loader = make_loader(inputs, targets, split_factor)
    return topics_num, train_loader, val_loader, test_loader

def make_loader(inputs, targets, test_size):
    """Split the data into train/validation/test sets and wrap them in DataLoaders.

    The data is split twice with the same fraction: first a validation set is
    taken from all samples, then a test set is taken from what remains.

    Args:
        inputs: sequence or array of samples.
        targets: labels aligned with inputs.
        test_size: fraction handed to train_test_split for both splits.

    Returns:
        (train_loader, val_loader, test_loader). The train loader uses the
        module-level batch_size, the validation loader a test_size-scaled
        batch size, and the test loader a batch size of 1.
    """
    train_x, val_x, train_y, val_y = train_test_split(inputs, targets, test_size=test_size)
    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=test_size)
    train_data = MyDataset(np.asarray(train_x), np.asarray(train_y))
    val_data = MyDataset(np.asarray(val_x), np.asarray(val_y))
    test_data = MyDataset(np.asarray(test_x), np.asarray(test_y))
    # DataLoader rejects a batch size of 0, which the rounding produces when
    # batch_size * test_size < 0.5; never go below one sample per batch.
    val_batch_size = max(1, round(batch_size * test_size))
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size)
    val_loader = DataLoader(dataset=val_data, batch_size=val_batch_size)
    test_loader = DataLoader(dataset=test_data, batch_size=1)
    return train_loader, val_loader, test_loader

In [27]:
def convert_to_cuda(cuda, inputs, targets, net):
    """Return (inputs, targets, net), each passed through .cuda() when cuda is truthy."""
    if not cuda:
        return inputs, targets, net
    # Same order as the arguments: inputs, then targets, then the network.
    return tuple([obj.cuda() for obj in (inputs, targets, net)])

In [47]:
def train(train_loader, val_loader, net, epochs, cuda, lr, file_name, print_every, batch_size, clip, loss_fn=None):
    """Train net with Adam on train_loader and return the per-epoch loss curve.

    Args:
        train_loader: iterable of (inputs, targets) batches.
        val_loader: accepted for interface compatibility; currently unused.
        net: model exposing init_hidden(batch_size, cuda) and
            forward(inputs, hidden) -> (output, hidden).
        epochs: number of passes over train_loader.
        cuda: if truthy, the model and every batch are moved to the GPU.
        lr: Adam learning rate.
        file_name: accepted for interface compatibility; currently unused.
        print_every: print the mean training loss every this many epochs
            (0 or None disables printing).
        batch_size: batch size used for the initial hidden state.
        clip: maximum gradient norm.
        loss_fn: loss callable; defaults to the module-level `criterion`.

    Returns:
        List with the mean training loss of each epoch (NaN for an epoch
        without batches).
    """
    if loss_fn is None:
        loss_fn = criterion
    device = torch.device("cuda" if cuda else "cpu")
    # Model, inputs, targets and hidden state must all live on the same device.
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    error_curve = []
    net.train()
    hidden = net.init_hidden(batch_size, cuda)
    for epoch in range(epochs):
        running_loss = 0.0
        n_batches = 0
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            # The last batch of an epoch may be smaller than batch_size; the
            # hidden state has to match the actual number of sequences.
            if hidden[0].size(1) != inputs.size(0):
                hidden = net.init_hidden(inputs.size(0), cuda)
            # Detach so gradients do not flow into previous batches; this is
            # also why backward() needs no retain_graph.
            hidden = tuple(state.detach().to(device) for state in hidden)
            net.zero_grad()
            output, hidden = net(inputs, hidden)
            loss = loss_fn(output.float(), targets)
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1
        epoch_loss = running_loss / n_batches if n_batches else float("nan")
        error_curve.append(epoch_loss)
        if print_every and epoch % print_every == 0:
            print("epoch {}: train loss {:.4f}".format(epoch, epoch_loss))
        # Collect once per epoch rather than once per batch.
        gc.collect()
    return error_curve

def run(dataset_name, feature_set_name):
    """Load one dataset/feature-set combination, then train and test an Lstm on it.

    Args:
        dataset_name: file-name stem of the dataset to load.
        feature_set_name: one of "topics", "vec-unigram" or "vec-bigram".

    Raises:
        ValueError: if feature_set_name is not one of the supported names.
    """
    file_name = "net_lin_{}({})".format(dataset_name, feature_set_name)
    print("running ", file_name)
    # One loader per feature set; each returns (input_dim, train, val, test).
    data_loaders = {
        "topics": lambda: load_topic_data(dataset_name),
        "vec-unigram": lambda: load_vector_data(dataset_name),
        "vec-bigram": lambda: load_vector_data(dataset_name, True),
    }
    if feature_set_name not in data_loaders:
        # Fail loudly instead of silently skipping the run.
        raise ValueError("unknown feature set {!r}; expected one of {}".format(
            feature_set_name, sorted(data_loaders)))
    # Create/truncate the log file. The format string yields
    # "../logs/<file_name>_.txt" (with an underscore before the extension).
    with open("{}{}_{}".format("../logs/", file_name, ".txt"), "w"):
        pass
    input_dim, train_loader, val_loader, test_loader = data_loaders[feature_set_name]()
    net = Lstm(input_dim, output_dim, hidden_dim, n_layers)
    train(train_loader, val_loader, net, epochs, cuda, lr, file_name, print_every, batch_size, clip)
    test(test_loader, net, file_name)

In [53]:
# Experiment configuration, read as module-level globals by the functions above.
# Full experiment grid. Both lists are overwritten two lines further down.
# NOTE(review): run() compares against "topics", so the "topic" entry here
# would not match any branch — confirm the intended spelling.
dataset_names = ["norm_tweet", "norm_emotion"]
feature_set_names = ["vec-unigram", "vec-bigram", "topic"]
# Override: only this dataset / feature set is actually run.
dataset_names = ["norm_test"]
feature_set_names = ["vec-unigram"]
# Loss used by train().
criterion = nn.CrossEntropyLoss()
# Use the GPU whenever one is available.
cuda = torch.cuda.is_available()
# Batch size of the training DataLoader (see make_loader).
batch_size = 256
epochs = 10 + 1
print_every = 2
split_factor = 0.2
# Lstm constructor arguments.
output_dim = 4
hidden_dim = 8
n_layers = 2
# Adam learning rate.
lr = 0.01
# Maximum gradient norm passed to clip_grad_norm_ in train().
clip = 5

In [None]:
# Run every configured dataset / feature-set combination.
# The loop variables are module-level globals; load_topic_data() reads
# feature_set_name from this scope, so do not rename them here.
for dataset_name in dataset_names: 
    for feature_set_name in feature_set_names: 
        run(dataset_name, feature_set_name)

running  net_lin_norm_test(vec-unigram)
loading vector data for norm_test
