<a href="https://colab.research.google.com/github/yalopez84/GAN_study/blob/master/Estudiando_KBGAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch as t
import torch.nn as nn
import torch.nn.functional as f
from torch.optim import Adam, SGD, Adagrad
from torch.autograd import Variable
from random import randint
from collections import defaultdict
from numpy.random import choice, randint
import numpy as n
import datetime
import yaml
import sys
import logging
import subprocess
from collections import namedtuple
from itertools import count

In [None]:
#base_model.py
class BaseModule(nn.Module):
    """Abstract base for triple-scoring networks.

    Subclasses must override ``score``, ``dist`` and ``prob_logit``; the
    helpers below (``prob``, ``pair_loss``, ``softmax_loss``) are built on
    top of them.  ``pair_loss`` also reads ``self.margin``, which is not set
    in this class (presumably each concrete model defines it -- confirm in
    the subclasses).
    """

    def __init__(self):
        super(BaseModule, self).__init__()

    def score(self, src, rel, dst):
        """Score the given triples; must be overridden."""
        raise NotImplementedError

    def dist(self, src, rel, dst):
        """Distance of the given triples; must be overridden."""
        raise NotImplementedError

    def prob_logit(self, src, rel, dst):
        """Unnormalised log-probabilities; must be overridden."""
        raise NotImplementedError

    def prob(self, src, rel, dst):
        """Return the softmax of ``prob_logit`` over the last dimension."""
        # dim=-1 is what the deprecated implicit-dim call picked for 1-D and
        # 2-D logits; spelling it out removes the deprecation warning.
        return f.softmax(self.prob_logit(src, rel, dst), dim=-1)

    def constraint(self):
        """Hook called after every optimiser step; no-op by default."""
        pass

    def pair_loss(self, src, rel, dst, src_bad, dst_bad):
        """Return the margin ranking loss ``relu(margin + d_good - d_bad)``.

        The same ``rel`` is used for the good and the corrupted triples.
        """
        d_good = self.dist(src, rel, dst)
        d_bad = self.dist(src_bad, rel, dst_bad)
        return f.relu(self.margin + d_good - d_bad)

    def softmax_loss(self, src, rel, dst, truth):
        """Return the per-row negative log-probability of the ``truth`` column.

        ``truth`` holds, for each row of ``prob(src, rel, dst)``, the column
        index whose probability is picked.
        """
        probs = self.prob(src, rel, dst)
        n_rows = probs.size(0)
        row_idx = t.arange(0, n_rows).type(t.LongTensor)
        # Follow the device of the probabilities instead of unconditionally
        # moving the index to CUDA, so the loss also works on CPU.
        if probs.is_cuda:
            row_idx = row_idx.cuda()
        # 1e-30 keeps log() finite when a probability underflows to zero.
        truth_probs = t.log(probs[row_idx, truth] + 1e-30)
        return -truth_probs

class BaseModel(object):
    """Training / evaluation wrapper around a ``BaseModule``-style network.

    ``self.mdl`` starts as ``None`` and has to be assigned (by a subclass or
    by the caller) before any other method is used.  All tensors are moved to
    the current CUDA device.  The optimiser is created lazily on the first
    ``gen_step``/``dis_step`` call and stored in ``self.opt``.
    """

    def __init__(self):
        self.mdl = None
        self.weight_decay = 0

    def save(self, filename):
        """Write the wrapped module's ``state_dict`` to ``filename``."""
        t.save(self.mdl.state_dict(), filename)

    def load(self, filename):
        """Load a ``state_dict`` from ``filename``, mapping storages to CUDA."""
        self.mdl.load_state_dict(
            t.load(filename, map_location=lambda storage, location: storage.cuda()))

    def gen_step(self, src, rel, dst, n_sample=1, temperature=1.0, train=True):
        """Generator-style REINFORCE step.

        Protocol: the first ``next()`` yields ``(sample_srcs, sample_dsts)``
        drawn from the model's softmax over the candidate columns; the caller
        then ``send()``s the rewards, which triggers the policy-gradient
        update (when ``train`` is true) and yields ``None``.

        ``dst`` must be 2-D (its size is unpacked into two values); ``src``
        is indexed with the same row/column indices.
        """
        if not hasattr(self, 'opt'):
            self.opt = Adam(self.mdl.parameters(), weight_decay=self.weight_decay)
        n_rows, _ = dst.size()
        rel_var = Variable(rel.cuda())
        src_var = Variable(src.cuda())
        dst_var = Variable(dst.cuda())
        logits = self.mdl.prob_logit(src_var, rel_var, dst_var) / temperature
        # Explicit dim: same axis the deprecated implicit-dim call used.
        probs = f.softmax(logits, dim=-1)
        row_idx = t.arange(0, n_rows).type(t.LongTensor).unsqueeze(1).expand(n_rows, n_sample)
        sample_idx = t.multinomial(probs, n_sample, replacement=True)
        sample_srcs = src[row_idx, sample_idx.data.cpu()]
        sample_dsts = dst[row_idx, sample_idx.data.cpu()]
        rewards = yield sample_srcs, sample_dsts
        if train:
            self.mdl.zero_grad()
            log_probs = f.log_softmax(logits, dim=-1)
            reinforce_loss = -t.sum(Variable(rewards) * log_probs[row_idx.cuda(), sample_idx.data])
            reinforce_loss.backward()
            self.opt.step()
            self.mdl.constraint()
        yield None

    def dis_step(self, src, rel, dst, src_fake, dst_fake, train=True):
        """Run one discriminator step on true vs. fake triples.

        Returns ``(losses, -fake_scores)`` as raw tensors (``.data``); the
        parameters are updated only when ``train`` is true.
        """
        if not hasattr(self, 'opt'):
            self.opt = Adam(self.mdl.parameters(), weight_decay=self.weight_decay)
        src_var = Variable(src.cuda())
        rel_var = Variable(rel.cuda())
        dst_var = Variable(dst.cuda())
        src_fake_var = Variable(src_fake.cuda())
        dst_fake_var = Variable(dst_fake.cuda())
        losses = self.mdl.pair_loss(src_var, rel_var, dst_var, src_fake_var, dst_fake_var)
        fake_scores = self.mdl.score(src_fake_var, rel_var, dst_fake_var)
        if train:
            self.mdl.zero_grad()
            t.sum(losses).backward()
            self.opt.step()
            self.mdl.constraint()
        return losses.data, -fake_scores.data

    def test_link(self, test_data, n_ent, heads, tails, filt=True):
        """Evaluate link prediction and return the mean reciprocal rank.

        Every test triple is ranked twice (all entities as tail, all entities
        as head); MRR, mean rank and Hits@10 are logged.  With ``filt`` the
        other known-true entities from ``tails[(s, r)]`` / ``heads[(t, r)]``
        are pushed out of the ranking by adding 1e30 to their scores.

        Raises ``ZeroDivisionError`` when ``test_data`` is empty.
        """
        mrr_tot = 0
        mr_tot = 0
        hit10_tot = 0
        n_ranked = 0
        for batch_s, batch_r, batch_t in batch_by_size(config().test_batch_size, *test_data):
            batch_size = batch_s.size(0)
            rel_var = Variable(batch_r.unsqueeze(1).expand(batch_size, n_ent).cuda())
            src_var = Variable(batch_s.unsqueeze(1).expand(batch_size, n_ent).cuda())
            dst_var = Variable(batch_t.unsqueeze(1).expand(batch_size, n_ent).cuda())
            all_var = Variable(t.arange(0, n_ent).unsqueeze(0).expand(batch_size, n_ent)
                               .type(t.LongTensor).cuda(), volatile=True)
            batch_dst_scores = self.mdl.score(src_var, rel_var, all_var).data
            batch_src_scores = self.mdl.score(all_var, rel_var, dst_var).data
            # The loop variables must not be called `t`: that would make the
            # torch alias a local of this function and break `t.arange` above.
            # .tolist() gives plain ints, usable as keys of heads/tails.
            for head, relation, tail, dst_scores, src_scores in zip(
                    batch_s.tolist(), batch_r.tolist(), batch_t.tolist(),
                    batch_dst_scores, batch_src_scores):
                if filt:
                    if tails[(head, relation)]._nnz() > 1:
                        # float() takes a copy; a 0-dim tensor would be a view
                        # that changes together with the in-place += below.
                        true_score = float(dst_scores[tail])
                        dst_scores += tails[(head, relation)].cuda() * 1e30
                        dst_scores[tail] = true_score
                    if heads[(tail, relation)]._nnz() > 1:
                        true_score = float(src_scores[head])
                        src_scores += heads[(tail, relation)].cuda() * 1e30
                        src_scores[head] = true_score
                mrr, mr, hit10 = mrr_mr_hitk(dst_scores, tail)
                mrr_tot += mrr
                mr_tot += mr
                hit10_tot += hit10
                mrr, mr, hit10 = mrr_mr_hitk(src_scores, head)
                mrr_tot += mrr
                mr_tot += mr
                hit10_tot += hit10
                n_ranked += 2
        logging.info('Test_MRR=%f, Test_MR=%f, Test_H@10=%f',
                     mrr_tot / n_ranked, mr_tot / n_ranked, hit10_tot / n_ranked)
        return mrr_tot / n_ranked



In [2]:
#data_utils.py
def heads_tails(n_ent, train_data, valid_data=None, test_data=None):
    """Build sparse indicator vectors of the known heads/tails of each query.

    Each ``*_data`` argument is a ``(src, rel, dst)`` triple of parallel
    lists (they are concatenated with ``+``).  Returns ``(heads_sp,
    tails_sp)`` where ``tails_sp[(s, r)]`` is a sparse float vector of length
    ``n_ent`` holding 1 at every tail seen with ``(s, r)``, and
    ``heads_sp[(t, r)]`` the same for heads.
    """
    train_src, train_rel, train_dst = train_data
    if valid_data:
        valid_src, valid_rel, valid_dst = valid_data
    else:
        valid_src = valid_rel = valid_dst = []
    if test_data:
        test_src, test_rel, test_dst = test_data
    else:
        test_src = test_rel = test_dst = []
    all_src = train_src + valid_src + test_src
    all_rel = train_rel + valid_rel + test_rel
    all_dst = train_dst + valid_dst + test_dst

    heads = defaultdict(set)
    tails = defaultdict(set)
    # Loop variables deliberately avoid the name `t`, which is the torch
    # alias used below to build the sparse tensors.
    for head, relation, tail in zip(all_src, all_rel, all_dst):
        tails[(head, relation)].add(tail)
        heads[(tail, relation)].add(head)

    def _indicator(entities):
        # One-row index matrix -> 1-D sparse vector with ones at `entities`.
        return t.sparse.FloatTensor(t.LongTensor([list(entities)]),
                                    t.ones(len(entities)), t.Size([n_ent]))

    heads_sp = {key: _indicator(ents) for key, ents in heads.items()}
    tails_sp = {key: _indicator(ents) for key, ents in tails.items()}
    return heads_sp, tails_sp


def inplace_shuffle(*lists):
    """Shuffle several parallel lists in place with one shared permutation.

    Element ``i`` of every list ends up at the same new position, so rows
    stay aligned across the lists.  Uses the ``random`` module's generator.
    """
    # Local import on purpose: the module-level name `randint` is rebound to
    # numpy.random.randint, whose upper bound is exclusive, so randint(0, 0)
    # raises on the very first element.  random.randint is inclusive.
    import random

    if not lists:
        return
    swap_with = [random.randint(0, i) for i in range(len(lists[0]))]
    for ls in lists:
        for i in range(len(ls)):
            j = swap_with[i]
            ls[i], ls[j] = ls[j], ls[i]


def batch_by_num(n_batch, *lists, n_sample=None):
    """Yield ``n_batch`` consecutive, near-equal slices of the given lists.

    ``n_sample`` defaults to the length of the first list.  With several
    lists each item is a list of aligned slices; with a single list the
    slice itself is yielded.
    """
    total = len(lists[0]) if n_sample is None else n_sample
    for i in range(n_batch):
        # Proportional cut points; int() truncates towards zero.
        start, stop = (int(total * k / n_batch) for k in (i, i + 1))
        chunks = [seq[start:stop] for seq in lists]
        yield chunks if len(chunks) > 1 else chunks[0]

def batch_by_size(batch_size, *lists, n_sample=None):
    """Yield consecutive slices of at most ``batch_size`` items.

    ``n_sample`` defaults to the length of the first list.  With several
    lists each item is a list of aligned slices; with a single list the
    slice itself is yielded.

    Raises:
        ValueError: if ``batch_size`` is not positive (it would otherwise
            loop forever yielding empty batches).  Because this is a
            generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got %r' % (batch_size,))
    if n_sample is None:
        n_sample = len(lists[0])
    head = 0
    while head < n_sample:
        tail = min(n_sample, head + batch_size)
        chunks = [ls[head:tail] for ls in lists]
        head += batch_size
        yield chunks if len(chunks) > 1 else chunks[0]

In [8]:
#config.py

class ConfigDict(dict):
    """A ``dict`` whose keys can also be read as attributes.

    ``cfg.key`` is looked up exactly like ``cfg['key']``; a missing key
    therefore raises ``KeyError`` (not ``AttributeError``).
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        return dict.__getitem__(self, name)

def _make_config_dict(obj):
    """Recursively wrap every ``dict`` inside ``obj`` in a ``ConfigDict``.

    Lists are rebuilt element by element; any other value is returned as is.
    """
    if isinstance(obj, dict):
        return ConfigDict((key, _make_config_dict(value)) for key, value in obj.items())
    if isinstance(obj, list):
        return list(map(_make_config_dict, obj))
    return obj

_config = None


def config():
    """Return the process-wide configuration, loading it on first use.

    The YAML file defaults to ``config.yaml`` and can be changed with a
    ``--config=<path>`` command-line argument.  After loading, the remaining
    ``--key=value`` arguments are applied via ``overwrite_config_with_args``.
    The result is cached in the module-level ``_config``.
    """
    global _config
    if _config is None:
        config_path = 'config.yaml'
        flag = '--config='
        for arg in sys.argv[1:]:
            if arg.startswith(flag):
                config_path = arg[len(flag):]
                break
        print('Reading config from ' + config_path)
        with open(config_path) as config_file:
            # safe_load instead of yaml.load(f): calling load() without a
            # Loader is rejected by PyYAML >= 6 and, on older versions, can
            # construct arbitrary Python objects from the file.
            _config = _make_config_dict(yaml.safe_load(config_file))
        overwrite_config_with_args()
    return _config

def path_set(path, val, sep='.', auto_convert=False):
    """Overwrite one existing entry of the global config.

    ``path`` is a ``sep``-separated chain of keys into ``_config``; the last
    key must already exist.  With ``auto_convert`` the (string) ``val`` is
    converted to the type of the value it replaces: bool, float or int
    (falling back to float when it is not an integer literal).
    """
    *parents, leaf = path.split(sep)
    node = _config
    for key in parents:
        node = node[key]
    current = node[leaf]

    new_val = val
    if auto_convert:
        # bool is tested first because bool is a subclass of int.
        if isinstance(current, bool):
            new_val = val.lower() == 'true'
        elif isinstance(current, float):
            new_val = float(val)
        elif isinstance(current, int):
            try:
                new_val = int(val)
            except ValueError:
                new_val = float(val)
    node[leaf] = new_val

def overwrite_config_with_args(args=None, sep='.'):
    """Apply ``--path=value`` arguments as overrides to the global config.

    ``args`` defaults to ``sys.argv[1:]``.  Arguments that do not start with
    ``--`` or contain no ``=`` are ignored, as is ``--config=...`` itself.
    Each remaining pair is handed to ``path_set`` with ``auto_convert=True``.
    """
    if args is None:
        args = sys.argv[1:]
    for arg in args:
        if not arg.startswith('--'):
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        path, eq, val = arg[2:].partition('=')
        if not eq or path == 'config':
            continue
        path_set(path, val, sep, auto_convert=True)

def _dump_config(obj, prefix):
    """Log every leaf of a nested config as ``dotted.path=value`` at DEBUG.

    ``prefix`` is the tuple of path components leading to ``obj``.  List
    items use their index as the path component.  Strings are logged
    verbatim, everything else through ``repr``.
    """
    if isinstance(obj, dict):
        children = obj.items()
    elif isinstance(obj, list):
        children = ((str(index), item) for index, item in enumerate(obj))
    else:
        shown = obj if isinstance(obj, str) else repr(obj)
        logging.debug('%s=%s', '.'.join(prefix), shown)
        return
    for key, value in children:
        _dump_config(value, prefix + (key,))

def dump_config():
    """Log the whole global config via ``_dump_config``, with an empty path prefix."""
    root_prefix = ()
    return _dump_config(_config, root_prefix)

In [4]:
#metrics.py
def mrr_mr_hitk(scores, target, k=10):
    """Return ``(reciprocal_rank, rank, hit@k)`` of ``target`` in ``scores``.

    ``scores`` is sorted in ascending order, so the lowest score has rank 1.
    The results are plain Python numbers (``float``, ``int``, ``int``).

    Raises ``IndexError`` if ``target`` is not a valid index of ``scores``.
    """
    _, sorted_idx = t.sort(scores)
    hits = t.nonzero(sorted_idx == target)
    # int(): on current PyTorch the indexed element is a 0-dim tensor; the
    # callers accumulate and log these values as ordinary numbers.
    rank = int(hits[0, 0]) + 1
    return 1 / rank, rank, int(rank <= k)

In [None]:
#corrupter.py
def get_bern_prob(data, n_ent, n_rel):
    """Compute a per-relation Bernoulli probability ``tph / (tph + hpt)``.

    ``data`` is ``(src, rel, dst)``, three parallel sequences of ids
    (assumed to be hashable ints, e.g. plain lists -- confirm with callers).
    For each relation, ``tph`` is the mean number of distinct tails per head
    and ``hpt`` the mean number of distinct heads per tail.  Relations that
    never occur keep probability 0.  ``n_ent`` is accepted but not used.

    Returns a float tensor of length ``n_rel``.
    """
    src, rel, dst = data
    tails_of = defaultdict(lambda: defaultdict(set))
    heads_of = defaultdict(lambda: defaultdict(set))
    # Loop variables deliberately avoid the name `t` (the torch alias).
    for head, relation, tail in zip(src, rel, dst):
        tails_of[relation][head].add(tail)
        heads_of[relation][tail].add(head)
    bern_prob = t.zeros(n_rel)
    for relation, by_head in tails_of.items():
        by_tail = heads_of[relation]
        tph = sum(len(ents) for ents in by_head.values()) / len(by_head)
        hpt = sum(len(ents) for ents in by_tail.values()) / len(by_tail)
        bern_prob[relation] = tph / (tph + hpt)
    return bern_prob

class BernCorrupter(object):
    """Produce one corrupted triple per input triple.

    For every triple a Bernoulli draw with the relation's probability from
    ``get_bern_prob`` decides which side is replaced by a uniformly random
    entity: the head on a 1, the tail on a 0.
    """

    def __init__(self, data, n_ent, n_rel):
        self.bern_prob = get_bern_prob(data, n_ent, n_rel)
        self.n_ent = n_ent

    def corrupt(self, src, rel, dst):
        """Return ``(src_out, dst_out)`` tensors with one side randomised.

        ``src``, ``rel`` and ``dst`` are CPU tensors of equal length (they
        are converted with ``.numpy()`` / used to index ``bern_prob``).
        """
        replace_head = t.bernoulli(self.bern_prob[rel]).numpy().astype('int64')
        keep_head = 1 - replace_head
        random_ent = choice(self.n_ent, len(src))
        # Arithmetic select: exactly one of the two masks is 1 per row.
        new_src = keep_head * src.numpy() + replace_head * random_ent
        new_dst = replace_head * dst.numpy() + keep_head * random_ent
        return t.from_numpy(new_src), t.from_numpy(new_dst)

class BernCorrupterMulti(object):
    """Produce ``n_sample`` candidate triples per input triple.

    For every triple a Bernoulli draw with the relation's probability from
    ``get_bern_prob`` decides which side is randomised: the head on a 1,
    the tail on a 0.
    """

    def __init__(self, data, n_ent, n_rel, n_sample):
        self.bern_prob = get_bern_prob(data, n_ent, n_rel)
        self.n_ent = n_ent
        self.n_sample = n_sample

    def corrupt(self, src, rel, dst, keep_truth=True):
        """Return ``(src_out, rel_out, dst_out)``, each ``(len(src), n_sample)``.

        With ``keep_truth`` column 0 keeps the original triple and only the
        remaining ``n_sample - 1`` columns are randomised; otherwise all
        columns are.  ``src``, ``rel`` and ``dst`` are CPU tensors of equal
        length (they are converted with ``.numpy()``).
        """
        # Must not be called `n`: that is the numpy alias used by n.tile.
        n_triples = len(src)
        prob = self.bern_prob[rel]
        replace_head = t.bernoulli(prob).numpy().astype('bool')
        src_out = n.tile(src.numpy(), (self.n_sample, 1)).transpose()
        dst_out = n.tile(dst.numpy(), (self.n_sample, 1)).transpose()
        rel_out = rel.unsqueeze(1).expand(n_triples, self.n_sample)
        # First column that may be overwritten: 1 protects the true triple.
        first_col = 1 if keep_truth else 0
        ent_random = choice(self.n_ent, (n_triples, self.n_sample - first_col))
        src_out[replace_head, first_col:] = ent_random[replace_head]
        dst_out[~replace_head, first_col:] = ent_random[~replace_head]
        return t.from_numpy(src_out), rel_out, t.from_numpy(dst_out)

In [7]:
#gan_train.py
# Script cell (unfinished): set up logging, GPU and config, then index the
# knowledge base.  It relies on logger_init, select_gpu, index_ent_rel and
# graph_size having been defined by other cells before this one is run.
logger_init()
# Pin the process to the GPU chosen by select_gpu().
t.cuda.set_device(select_gpu())
# NOTE(review): config() already applies the command-line overrides when it
# first loads the file, so this explicit call looks redundant -- confirm.
overwrite_config_with_args()
dump_config()
task_dir = config().task.dir
# Build the entity/relation index over the train, valid and test splits.
kb_index = index_ent_rel(os.path.join(task_dir, 'train.txt'),
                         os.path.join(task_dir, 'valid.txt'),
                         os.path.join(task_dir, 'test.txt'))
n_ent, n_rel = graph_size(kb_index)
# NOTE(review): TransE, TransD, DistMult and ComplEx are not defined anywhere
# in the visible part of this file -- they must be provided before this line runs.
models = {'TransE': TransE, 'TransD': TransD, 'DistMult': DistMult, 'ComplEx': ComplEx}
# Aquí me quedé. (I stopped here.)



Reading config from config.yaml


FileNotFoundError: ignored

In [6]:
#logger_init.py
def logger_init():
    """Configure root logging from the ``log`` section of the config.

    Always installs a DEBUG-level basic configuration.  When ``log.to_file``
    is set, a file handler is added whose name is ``log.prefix`` followed by
    a ``%m%d%H%M%S`` timestamp inside ``log.dir``.  When ``log.dump_config``
    is set, the full config is logged afterwards.
    """
    logging.basicConfig(level=logging.DEBUG, format='%(module)15s %(asctime)s %(message)s', datefmt='%H:%M:%S')
    log_cfg = config().log
    if log_cfg.to_file:
        stamp = datetime.datetime.now().strftime("%m%d%H%M%S")
        log_path = os.path.join(log_cfg.dir, log_cfg.prefix + stamp)
        logging.getLogger().addHandler(logging.FileHandler(log_path))
    if log_cfg.dump_config:
        dump_config()

In [None]:
#read_data.py
# ent_list / rel_list: sorted names; ent_id / rel_id: name -> position in that list.
KBIndex = namedtuple('KBIndex', ['ent_list', 'rel_list', 'ent_id', 'rel_id'])


def index_ent_rel(*filenames):
    """Collect all entities and relations from tab-separated triple files.

    Every line is read as ``head<TAB>relation<TAB>tail``; columns after the
    third are ignored.  Names are sorted and numbered from 0.

    Returns a ``KBIndex``.
    """
    entities, relations = set(), set()
    for path in filenames:
        with open(path) as triples_file:
            for line in triples_file:
                head, relation, tail = line.strip().split('\t')[:3]
                entities.update((head, tail))
                relations.add(relation)
    ent_list = sorted(entities)
    rel_list = sorted(relations)
    ent_id = {name: idx for idx, name in enumerate(ent_list)}
    rel_id = {name: idx for idx, name in enumerate(rel_list)}
    return KBIndex(ent_list, rel_list, ent_id, rel_id)


def graph_size(kb_index):
    """Return ``(number of entities, number of relations)`` of ``kb_index``."""
    n_entities = len(kb_index.ent_id)
    n_relations = len(kb_index.rel_id)
    return n_entities, n_relations


def read_data(filename, kb_index):
    """Read a tab-separated triple file as three parallel lists of ids.

    Every line is read as ``head<TAB>relation<TAB>tail`` and mapped through
    ``kb_index.ent_id`` / ``kb_index.rel_id``.  Columns after the third are
    ignored, matching ``index_ent_rel``, so a file that can be indexed can
    also be read.

    Returns ``(src, rel, dst)``.  Raises ``KeyError`` for a name missing
    from ``kb_index``.
    """
    src = []
    rel = []
    dst = []
    with open(filename) as triples_file:
        for line in triples_file:
            head, relation, tail = line.strip().split('\t')[:3]
            src.append(kb_index.ent_id[head])
            rel.append(kb_index.rel_id[relation])
            dst.append(kb_index.ent_id[tail])
    return src, rel, dst

In [None]:
#select_gpu.py
def select_gpu():
    """Pick a GPU index by parsing the text output of ``nvidia-smi``.

    Returns the first GPU index that has no entry in the process table; if
    every GPU has one, returns the first index with the smallest parsed
    memory figure.  Returns ``None`` implicitly when no GPU rows were parsed.

    NOTE(review): the parser matches exact separator lines and fixed row
    positions, so it presumably only works with the nvidia-smi table layout
    it was written against -- confirm for the installed driver version.
    """
    nvidia_info = subprocess.run('nvidia-smi', stdout=subprocess.PIPE)
    gpu_info = False       # True while inside the per-GPU table
    gpu_info_line = 0      # lines seen since the GPU table started
    proc_info = False      # True while inside the process table
    gpu_mem = []           # one parsed memory figure per GPU, in table order
    gpu_occupied = set()   # GPU indices that appear in the process table
    for line in nvidia_info.stdout.split(b'\n'):
        line = line.decode().strip()
        if gpu_info:
            gpu_info_line += 1
            if line == '':
                # A blank line ends the GPU table.
                gpu_info = False
                continue
            if gpu_info_line % 3 == 2:
                # Every third line (the 2nd of each group) carries the memory
                # column; take its first token and drop its last 3 characters
                # (presumably a 'MiB' unit suffix -- verify against real output).
                mem_info = line.split('|')[2]
                used_mem_mb = int(mem_info.strip().split()[0][:-3])
                gpu_mem.append(used_mem_mb)
        if proc_info:
            if line == '|  No running processes found                                                 |':
                continue
            if line == '+-----------------------------------------------------------------------------+':
                # Closing border of the process table.
                proc_info = False
                continue
            # The second whitespace-separated token of a process row is read
            # as the GPU index.
            proc_gpu = int(line.split()[1])
            #proc_type = line.split()[3]
            gpu_occupied.add(proc_gpu)
        # Section markers are tested last, so a table's rows start on the
        # line after its header separator.
        if line == '|===============================+======================+======================|':
            gpu_info = True
        if line == '|=============================================================================|':
            proc_info = True
    # First choice: lowest-index GPU without any listed process.
    for i in range(len(gpu_mem)):
        if i not in gpu_occupied:
            logging.info('Automatically selected GPU %d because it is vacant.', i)
            return i
    # Fallback: first GPU whose parsed memory figure is the minimum.
    for i in range(len(gpu_mem)):
        if gpu_mem[i] == min(gpu_mem):
            logging.info('All GPUs are occupied. Automatically selected GPU %d because it has the most free memory.', i)
            return i

# When run as a script (not imported), print the GPU index that select_gpu() picks.
if __name__ == '__main__':
    print(select_gpu())