In [None]:
import os
import string
import json

from utils import load_dergs
from derg import DERG, KnowledgeGraph

In [22]:
# Special tokens of the character-level vocabulary.
# Marks the beginning of every encoded name.
WORD_START = '<start>'
# Marks the end of every encoded name; decoding stops when it is produced.
WORD_END = '<end>'
# Fallback token whose id is returned for any symbol missing from the vocabulary.
WORD_UNKNOWN = '<unknown>'


class Vocabulary(object):
    """Two-way lookup table between tokens and consecutive integer ids."""

    def __init__(self):
        # token -> id and id -> token; `idx` is the next id to hand out.
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        """Register `word` under the next free id; already-known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of `word`, falling back to the id of WORD_UNKNOWN."""
        key = word if word in self.word2idx else WORD_UNKNOWN
        return self.word2idx[key]

    def __len__(self):
        """Number of distinct tokens registered so far."""
        return len(self.word2idx)

# Build the character-level vocabulary. The three special tokens are added
# first, so they receive ids 0, 1 and 2; after them comes every character
# that may appear in a name.
vocab = Vocabulary()
for special_token in (WORD_START, WORD_END, WORD_UNKNOWN):
    vocab.add_word(special_token)

name_alphabet = string.ascii_letters + string.digits + "<>$_"
for c in name_alphabet:
    vocab.add_word(c)

print("Total vocabulary size: %d" % len(vocab))

Total vocabulary size: 69


In [23]:
# Load the DERGs through utils.load_dergs, passing the data directory and the
# file name 'obfuscate_derg.json', then report how many were loaded.
# NOTE(review): the directory is an absolute, machine-specific path — consider
# moving it to a configurable DATA_DIR.
dergs = load_dergs('/home/liyc/data/dedroid/dergs/', 'obfuscate_derg.json')
print(len(dergs))

718


In [3]:
def load_node_to_kgid_mapping(node_to_kgid_mapping_path):
    """Load the node-name -> knowledge-graph-id table from a text file.

    The first line of the file is skipped (it is treated as a header). Every
    following non-blank line is split on whitespace: field 0 is the node's
    global name and field 1 its integer knowledge-graph id.

    Args:
        node_to_kgid_mapping_path: path of the mapping file.
    Returns:
        dict mapping node global name (str) to knowledge-graph id (int).
    Raises:
        IndexError: if a non-blank line has fewer than two fields.
        ValueError: if the second field is not an integer.
    """
    mapping = {}
    # The context manager guarantees the handle is closed; iterating the file
    # object streams it line by line instead of materialising every line.
    with open(node_to_kgid_mapping_path) as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            segs = line.split()
            if not segs:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            mapping[segs[0]] = int(segs[1])
    return mapping

# Node global name -> knowledge-graph id, read from entity2id.txt; print the
# number of entries as a sanity check.
# NOTE(review): absolute, machine-specific path.
node_to_kgids = load_node_to_kgid_mapping('/home/liyc/data/dedroid/kg/entity2id.txt')
print(len(node_to_kgids))

1114022


In [4]:
def load_kgid_embeddings(kgid_embedding_path):
    """Load the entity embeddings stored in a JSON file.

    Args:
        kgid_embedding_path: path of a JSON document that has an
            'ent_embeddings' key.
    Returns:
        The value stored under 'ent_embeddings'. Callers index it by
        knowledge-graph id, so it is presumably a list of vectors ordered by
        id — verify against the file that produced it.
    Raises:
        KeyError: if the document has no 'ent_embeddings' entry.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(kgid_embedding_path) as f:
        embeddings = json.load(f)
    return embeddings['ent_embeddings']

# Entity embeddings, indexed by knowledge-graph id; print how many were loaded
# (this should match the size of the name -> id mapping).
# NOTE(review): absolute, machine-specific path.
kgid_embeddings = load_kgid_embeddings('/home/liyc/data/dedroid/kg/embedding_HolE_50/embedding.vec.json')
print(len(kgid_embeddings))

1114022


In [5]:
import random

def split_train_and_test(dergs, train_portion = 0.9):
    """Randomly split `dergs` into a training part and a testing part.

    Args:
        dergs: sequence of items to split. It is NOT modified; the shuffle is
            applied to a copy so that re-running the notebook cell does not
            keep reordering the caller's list.
        train_portion: fraction of the items assigned to the training part;
            the count is truncated towards zero.
    Returns:
        (train_dergs, test_dergs): two lists that together contain every input
        item exactly once.
    """
    shuffled = list(dergs)
    random.shuffle(shuffled)
    sep = int(len(shuffled) * train_portion)
    return shuffled[:sep], shuffled[sep:]

# Split the loaded DERGs with the default train portion and report both sizes.
# NOTE(review): no random seed is set, so the split differs between runs.
train_dergs, test_dergs = split_train_and_test(dergs)
print("%d apps for training, %d apps for predicting" % (len(train_dergs), len(test_dergs)))

646 apps for training, 72 apps for predicting


In [6]:
import torch
import torch.utils.data as data
import os
import numpy as np


class DeDroidDataset(data.Dataset):
    """
    Custom Dataset compatible with torch.utils.data.DataLoader.

    Each item pairs the knowledge-graph embedding of one node with the node's
    name encoded as a sequence of vocabulary ids.
    """
    def __init__(self, dergs, node_to_kgids, kgid_embeddings, vocab):
        """
        Args:
            dergs: iterable of graphs; each must provide get_kg_mappings(),
                yielding (node_global_name, node_name) pairs.
            node_to_kgids: mapping from node global name to knowledge-graph id.
            kgid_embeddings: embeddings indexable by knowledge-graph id.
            vocab: callable mapping a token to its integer id.
        """
        self.dergs = dergs
        # Flattened list of (node_global_name, node_name) over all graphs.
        self.node_to_names = []
        for g in dergs:
            self.node_to_names.extend(g.get_kg_mappings())
        self.node_to_kgids = node_to_kgids
        self.kgid_embeddings = kgid_embeddings
        self.vocab = vocab

    def __getitem__(self, index):
        """Returns one data pair (embedding and name).

        Returns:
            embedding: 1D tensor holding the node's knowledge-graph embedding.
            target: 1D tensor holding the ids of WORD_START, every character of
                the node name, then WORD_END.
        Raises:
            KeyError: if the node is missing from `node_to_kgids`.
        """
        node_global_name, node_name = self.node_to_names[index]

        # Temporary workaround: the keys of `node_to_kgids` do not carry the
        # first 16 characters of the global name, so strip them before lookup.
        node_global_name = node_global_name[16:]

        kgid = self.node_to_kgids[node_global_name]
        embedding = self.kgid_embeddings[kgid]

        # Convert node name (string) to word ids, using the vocabulary given
        # to the constructor rather than whatever `vocab` global is live in
        # the notebook.
        encode = self.vocab
        name_vec = [encode(WORD_START)]
        name_vec.extend(encode(c) for c in node_name)
        name_vec.append(encode(WORD_END))

        embedding = torch.Tensor(embedding)
        target = torch.Tensor(name_vec)
        return embedding, target

    def __len__(self):
        """Total number of (node, name) pairs over all graphs."""
        return len(self.node_to_names)


def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (embedding, name).

    We should build custom collate_fn rather than using default collate_fn,
    because merging names (including padding) is not supported in default.
    The caller's list is left untouched: the batch is sorted into a new list.
    Args:
        data: non-empty list of tuple (embedding, name).
            - embedding: torch tensor of shape (embed_size).
            - name: torch tensor of shape (?); variable length.
    Returns:
        embeddings: torch tensor of shape (batch_size, embed_size).
        targets: long tensor of shape (batch_size, padded_length), padded
            with zeros on the right.
        lengths: list; valid length for each padded name, in descending order
            (the order pack_padded_sequence expects).
    """
    # Order the samples by name length (descending) without mutating `data`.
    ordered = sorted(data, key=lambda pair: len(pair[1]), reverse=True)
    embeddings, names = zip(*ordered)

    # Merge embeddings (from tuple of 1D tensors to a 2D tensor).
    embeddings = torch.stack(embeddings, 0)

    # Merge names (from tuple of 1D tensors to a zero-padded 2D tensor).
    lengths = [len(name) for name in names]
    targets = torch.zeros(len(names), max(lengths)).long()
    for i, name in enumerate(names):
        end = lengths[i]
        targets[i, :end] = name[:end]
    return embeddings, targets, lengths


def get_loader(dergs, node_to_kgids, kgid_embeddings, vocab, batch_size, shuffle, num_workers):
    """Build a torch.utils.data.DataLoader over a DeDroidDataset.

    Every iteration of the returned loader yields whatever `collate_fn`
    produces for one mini-batch, i.e. (embeddings, names, lengths):
        embeddings: tensor of shape (batch_size, embed_size).
        names: tensor of shape (batch_size, padded_length).
        lengths: list with the valid length of each name; length is (batch_size).
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )
    return torch.utils.data.DataLoader(
        dataset=DeDroidDataset(dergs, node_to_kgids, kgid_embeddings, vocab),
        **loader_options)

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
    
    
class DeDroidRNN(nn.Module):
    """LSTM decoder that spells a node name from its knowledge-graph embedding.

    The knowledge-graph embedding is fed to the LSTM as the first input step,
    followed by the embedded characters of the name.
    """
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Set the hyper-parameters and build the layers.

        Args:
            embed_size: size of the character embeddings. It must equal the
                size of the knowledge-graph embeddings, because both are
                concatenated along the time axis in `forward`.
            hidden_size: LSTM hidden state size.
            vocab_size: number of tokens in the vocabulary.
            num_layers: number of stacked LSTM layers.
        """
        super(DeDroidRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=0.5)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Initialize embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)

    def forward(self, embeddings, names, lengths):
        """Decode kg embeddings and generate names.

        Args:
            embeddings: tensor of shape (batch_size, embed_size).
            names: long tensor of shape (batch_size, padded_length).
            lengths: valid length of each name, in descending order.
        Returns:
            Tensor of shape (sum(lengths), vocab_size): one row of scores per
            packed time step.
        """
        name_embeddings = self.embed(names)
        # Prepend the kg embedding as time step 0 of every sequence.
        embeddings = torch.cat((embeddings.unsqueeze(1), name_embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        # hiddens[0] is the flat data tensor of the packed LSTM output.
        outputs = self.linear(hiddens[0])
        return outputs

    def sample(self, embeddings, states=None, max_len=50):
        """Samples names for given kg embeddings (Greedy search).

        Args:
            embeddings: tensor of shape (batch_size, embed_size).
            states: optional initial LSTM states.
            max_len: number of characters to generate per name (at least 1).
                Defaults to 50, the length that used to be hard-coded.
        Returns:
            Long tensor of shape (batch_size, max_len) with the predicted ids.
        """
        sampled_ids = []
        inputs = embeddings.unsqueeze(1)                         # (batch_size, 1, embed_size)
        for _ in range(max_len):
            hiddens, states = self.lstm(inputs, states)          # (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))            # (batch_size, vocab_size)
            predicted = outputs.max(1)[1]                        # greedy choice per row
            sampled_ids.append(predicted)
            # Feed the prediction back in as the next input step.
            inputs = self.embed(predicted).unsqueeze(1)          # (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)                # (batch_size, max_len)
        return sampled_ids

In [None]:
# ---- Training configuration ----
# DataLoader settings, passed to get_loader below.
batch_size = 5000
shuffle = True
num_workers = 4
# Model sizes, passed to DeDroidRNN. embed_size is also the width of the
# knowledge-graph vectors fed to the model — presumably the 50 in
# 'embedding_HolE_50'; TODO confirm.
embed_size = 50
hidden_size = 64
num_layers = 2
# Learning rate for the Adam optimizer.
lr = 0.005
# Number of passes over the training data.
num_epochs = 5
# Print the loss every `log_step` iterations.
log_step = 100
# Save a checkpoint every `save_step` iterations.
save_step = 1000
# Directory checkpoints are written to.
# NOTE(review): absolute, machine-specific path; nothing visible here creates it.
model_path = '/home/liyc/data/dedroid/model'

def to_var(x, volatile=False):
    """Wrap tensor `x` in a Variable, moving it to the GPU first when CUDA is available."""
    placed = x.cuda() if torch.cuda.is_available() else x
    return Variable(placed, volatile=volatile)

# Build the data loader over the training DERGs; each iteration yields
# (embeddings, names, lengths) as produced by collate_fn.
data_loader = get_loader(
    dergs = train_dergs,
    node_to_kgids = node_to_kgids,
    kgid_embeddings = kgid_embeddings,
    vocab = vocab,
    batch_size = batch_size,
    shuffle = shuffle,
    num_workers = num_workers) 

# Build the model; the output layer has one score per vocabulary token.
dedroid_rnn = DeDroidRNN(
    embed_size = embed_size,
    hidden_size = hidden_size,
    vocab_size = len(vocab),
    num_layers = num_layers)

# Move the model to the GPU when one is available (to_var does the same for inputs).
if torch.cuda.is_available():
    dedroid_rnn.cuda()

# Loss and optimizer: cross-entropy over vocabulary ids, Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
params = list(dedroid_rnn.parameters())
optimizer = torch.optim.Adam(params, lr=lr)

# Train the model.
# total_step is the number of mini-batches per epoch (used only for logging).
total_step = len(data_loader)
for epoch in range(num_epochs):
    for i, (embeddings, names, lengths) in enumerate(data_loader):

        # Set mini-batch dataset.
        # The targets are the packed name ids. The model prepends the kg
        # embedding to the input sequence, so the output at step t is compared
        # with the name token at position t.
        embeddings = to_var(embeddings, volatile=False)
        names = to_var(names)
        targets = pack_padded_sequence(names, lengths, batch_first=True)[0]

        # Forward, backward and optimize.
        dedroid_rnn.zero_grad()
        outputs = dedroid_rnn(embeddings, names, lengths)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Print log info (perplexity is exp of the cross-entropy loss).
        # NOTE(review): `loss.data[0]` assumes a PyTorch version where the loss
        # is a 1-element tensor; newer releases need `loss.item()` — confirm
        # the target version.
        if i % log_step == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                  % (epoch, num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) 

        # Save the model weights every `save_step` iterations.
        # NOTE(review): `model_path` must already exist; it is not created here.
        if (i+1) % save_step == 0:
            torch.save(dedroid_rnn.state_dict(), 
                       os.path.join(model_path, 'DeDroidRNN-%d-%d.pkl' % (epoch+1, i+1)))

In [7]:
def get_embedding_by_global_name(node_global_name):
    """Return the knowledge-graph embedding stored for a node's global name.

    Raises KeyError when the (stripped) name is missing from `node_to_kgids`.
    """
    # Temporary workaround: the first 16 characters of the global name are not
    # part of the keys in `node_to_kgids`, so drop them before the lookup.
    lookup_key = node_global_name[16:]
    return kgid_embeddings[node_to_kgids[lookup_key]]
    
def predict_names_for_embeddings(embeddings):
    """Predict a name for each knowledge-graph embedding with `dedroid_rnn`.

    Args:
        embeddings: list of embedding vectors, one per node.
    Returns:
        list of str: one predicted name per input embedding. A name is built
        from the sampled tokens up to (not including) the first WORD_END;
        WORD_START and WORD_UNKNOWN tokens are skipped.
    """
    embeddings = torch.Tensor(embeddings)
    embeddings = to_var(embeddings, volatile=True)

    # Sample in evaluation mode so the LSTM's dropout is disabled and the
    # predictions are deterministic; restore the previous mode afterwards so
    # that a later training cell is not affected.
    was_training = dedroid_rnn.training
    dedroid_rnn.eval()
    try:
        sampled_ids = dedroid_rnn.sample(embeddings)
    finally:
        dedroid_rnn.train(was_training)
    sampled_ids = sampled_ids.cpu().data.numpy().tolist()

    predicted_names = []
    for predicted_char_ids in sampled_ids:
        # Decode word ids to a name.
        predicted_chars = []
        for word_id in predicted_char_ids:
            word = vocab.idx2word[word_id]
            if word == WORD_END:
                break
            if word == WORD_START or word == WORD_UNKNOWN:
                continue
            predicted_chars.append(word)
        predicted_names.append(''.join(predicted_chars))
    return predicted_names

In [None]:
# randomly pick a name to predict
global_name_to_original_names = test_dergs[0].get_kg_mappings()
global_name, original_name = random.sample(global_name_to_original_names, 1)[0]
print("predicting for %s, original name is %s" % (global_name, original_name))
embedding = get_embedding_by_global_name(global_name)
predicted_name = predict_names_for_embeddings([embedding])[0]
print("predicted name is %s" % predicted_name)

In [None]:
# get a list of nodes for prediction
# Exact-match accuracy over the nodes of a slice of the test apps.
global_name_to_names = []
for g in test_dergs[20:80]:
    global_name_to_names += g.get_kg_mappings()
global_names, original_names = zip(*global_name_to_names)
print("predicting for %d nodes, original names are %s, ..." % (len(global_names), ", ".join(original_names[:5])))

# One embedding per node, then predict all names in a single batch.
embeddings = list(map(get_embedding_by_global_name, global_names))
predicted_names = predict_names_for_embeddings(embeddings)
print("predicted %d names, predicted names are %s" % (len(predicted_names), ", ".join(predicted_names[:5])))

# A prediction counts as correct only when it equals the original name exactly.
num_correct = sum(1 for origin, predict in zip(original_names, predicted_names) if origin == predict)
accuracy = float(num_correct) / len(global_names)
print("correctly predicted %d names, accuracy is %.2f." % (num_correct, accuracy))

In [16]:
# Using K-means clustering to check the dataset.

import numpy as np

from sklearn.cluster import DBSCAN, KMeans
from sklearn import metrics


# Collect the (global name, original name) pairs of ten training apps.
# NOTE: this reuses (and overwrites) the names `global_name_to_names`,
# `global_names`, `original_names` and `embeddings` from the evaluation cell.
global_name_to_names = []
for g in train_dergs[20:30]:
    global_name_to_names.extend(g.get_kg_mappings())
global_names, original_names = zip(*global_name_to_names)
print("using %d nodes for clustering, original names are %s, ..." % (len(global_names), ", ".join(original_names[:5])))
# One embedding row per node; X is the matrix handed to k-means.
embeddings = [get_embedding_by_global_name(global_name) for global_name in global_names]
X = np.array(embeddings)

# #############################################################################
# Compute K-Means
# Cluster the embeddings into 1000 groups with a fixed random_state;
# label_list[i] is the cluster id assigned to original_names[i].
kmeans = KMeans(n_clusters=1000, random_state=0).fit(X)
label_list = kmeans.labels_.tolist()

def get_label_names(label):
    """Print the original name of every clustered node whose cluster id equals `label`."""
    for position, name in enumerate(original_names):
        if label_list[position] == label:
            print(name)
# Show the names that fell into cluster 1, to eyeball whether the cluster is coherent.
get_label_names(1)

using 17821 nodes for clustering, original names are Loader, android, support, v4, content, ...


array([1195, 1698, 1698, ...,  618,   48,  910], dtype=int32)

getCallback
access$100
getStartActivityDelegate
getAuthorizationClientRequest
statusCallback
access$1000
setCallback
setDefaultAudience
setLoginBehavior
