In [21]:
import networkx as nx
import pandas as pd
import numpy as np
from graph.twittergraph import TwitterGraph as tg
from graph.graph import Graph
import os
import subprocess
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Build the full graph from the JSON dump (presumably a retweet graph, going by the
# helper's name -- confirm against TwitterGraph.rt_graph_from_json).
# NOTE(review): absolute, user-specific path; the meaning of the second argument (0)
# is not visible in this notebook.
graph = tg.rt_graph_from_json('/Users/tomfw/Downloads/DataShared/', 0)

In [23]:
# Directory that holds the intermediate files (edge list, walks, embeddings, ...) built
# further down. The trailing slash lets file names be appended to it directly.
# NOTE(review): absolute, user-specific path.
data_folder = '/Users/tomfw/Desktop/temp/'

In [24]:
# Split the full graph into subgraphs; the rest of the notebook iterates over `sgs` in order.
# NOTE(review): the unit of the argument 7 is not visible here (presumably days per window) -- confirm.
sgs = graph.subgraphs_of_length(7)
print("Made %d graphs." % len(sgs))

Made 13 graphs.


In [25]:
# Report the size of every subgraph, then compare the summed edge count of the
# subgraphs with the edge count of the full graph.
edge_counts = [sg.nx_graph.number_of_edges() for sg in sgs]
for i, (sg, e) in enumerate(zip(sgs, edge_counts)):
    print("%d: %d edges,  %d nodes" % (i, e, sg.nx_graph.number_of_nodes()))
edges = sum(edge_counts)
print("\nOriginal graph edges: %d\nSum of edges  in subgraphs: %d" % (graph.nx_graph.number_of_edges(), edges))

0: 3315 edges,  20718 nodes
1: 1917 edges,  20718 nodes
2: 3528 edges,  20718 nodes
3: 4479 edges,  20718 nodes
4: 6799 edges,  20718 nodes
5: 4544 edges,  20718 nodes
6: 6528 edges,  20718 nodes
7: 24124 edges,  20718 nodes
8: 4566 edges,  20718 nodes


9: 2839 edges,  20718 nodes
10: 2349 edges,  20718 nodes
11: 1792 edges,  20718 nodes
12: 1081 edges,  20718 nodes

Original graph edges: 58193
Sum of edges  in subgraphs: 67861


In [26]:
# Core nodes are the nodes that have at least one edge in *every* subgraph.
core_nodes = []
# Placeholder with one empty list per subgraph; it is replaced by the output of
# store_core_embeds() once the first embeddings have been loaded.
prev_embeds = [[] for _ in sgs]  # subgraphs * len(core_nodes)
for node in graph.nx_graph.nodes_iter():
    # all() stops at the first subgraph in which the node is isolated, instead of
    # querying the degree in every remaining subgraph as well.
    if all(sg.nx_graph.degree(node) != 0 for sg in sgs):
        core_nodes.append(node)

print("Found %d core nodes." % len(core_nodes))

Found 56 core nodes.


In [27]:
def store_core_embeds(embed_dict):
    """Return the embeddings of the core nodes, in ``core_nodes`` order.

    Args:
        embed_dict: Mapping indexed by node that yields that node's embedding.

    Returns:
        A list with one entry per element of the module-level ``core_nodes``.

    Raises:
        Whatever ``embed_dict[node]`` raises when a core node is missing.
    """
    return [embed_dict[core] for core in core_nodes]


def core_movement(embed_dict):
    """Total distance the core nodes moved since the last stored snapshot.

    For every core node, the distance between its entry in the module-level
    ``prev_embeds`` (same position as in ``core_nodes``) and its current
    embedding in ``embed_dict`` is computed with ``embedding_distance``; the
    distances are summed.

    Args:
        embed_dict: Mapping indexed by node that yields that node's current embedding.

    Returns:
        The summed distance (0 when there are no core nodes).
    """
    return sum(
        embedding_distance(prev_embeds[pos], embed_dict[core])
        for pos, core in enumerate(core_nodes)
    )


def embedding_distance(x1, x2):
    """Euclidean distance between two vectors given as iterables of numbers.

    The vectors are paired element-wise with ``zip``, so any surplus elements
    of the longer one are ignored.

    Args:
        x1: First vector.
        x2: Second vector.

    Returns:
        Square root of the summed squared differences (0.0 for empty input).
    """
    squared_total = sum((a - b) ** 2 for a, b in zip(x1, x2))
    return np.sqrt(squared_total)

In [28]:
# Locations of the external executables; each is used as the first element of the
# argument lists built by line_command / rf_command / n2v_command below.
# NOTE(review): absolute, user-specific paths.
line_path = '/Users/tomfw/Downloads/temporalnode2vec/lineLinux/line'
rf_path = '/Users/tomfw/Downloads/temporalnode2vec/word2vec/retrofit_word2vec_one'
n2v_path = '/Users/tomfw/Desktop/snap/examples/node2vec/node2vec'

In [29]:
def line_command(train, output, size=128, threads=8, negative=5):
    """Build the argument list for the executable at ``line_path``.

    Args:
        train: Value passed after ``-train``.
        output: Value passed after ``-output``.
        size: Value for ``-size``.
        threads: Value for ``-threads``.
        negative: Value for ``-negative``.

    Returns:
        A list of strings suitable for ``subprocess.Popen``.
    """
    # todo: order, rho, etc...
    command = [line_path, "-train", train, "-output", output]
    for flag, value in (("-size", size), ("-threads", threads), ("-negative", negative)):
        command.extend([flag, str(value)])
    return command

In [30]:
def rf_command(input, output, init, beta_file, size=128, window=5, sample=0, negative=5, threads=8, beta=1):
    """Build the argument list for the executable at ``rf_path``.

    Args:
        input: Value passed after ``-train``.
        output: Value passed after ``-output``.
        init: Value passed after ``-init``.
        beta_file: Accepted but not used -- it is never added to the command.
        size, window, sample, negative, threads, beta: Values for the flags
            of the same name; each is converted with ``str``.

    Returns:
        A list of strings suitable for ``subprocess.Popen``; ``-cbow 0`` is
        always appended.
    """
    command = [rf_path, "-train", input, "-init", init, "-output", output]
    numeric_flags = (
        ("-size", size),
        ("-window", window),
        ("-sample", sample),
        ("-negative", negative),
        ("-threads", threads),
        ("-beta", beta),
    )
    for flag, value in numeric_flags:
        command += [flag, str(value)]
    command += ["-cbow", '0']
    return command

In [31]:
def n2v_command(edge_file, output, n_walks=10, walk_length=50, p=1, q=1):
    """Build the argument list for the executable at ``n2v_path``.

    Args:
        edge_file: Path appended to ``-i:``.
        output: Path appended to ``-o:``.
        n_walks: Value appended to ``-r:``.
        walk_length: Value appended to ``-l:``.
        p: Value appended to ``-p:``.
        q: Value appended to ``-q:``.

    Returns:
        A list of strings suitable for ``subprocess.Popen``; the fixed
        arguments ``-w 1 -v 1`` are always appended.
    """
    command = [n2v_path, '-i:' + edge_file, '-o:' + output]
    for prefix, value in (('-p:', p), ('-q:', q), ('-r:', n_walks), ('-l:', walk_length)):
        command.append(prefix + str(value))
    command.extend(['-w', '1', '-v', '1'])
    return command

In [32]:
def run_command(command):
    """Run an external command and echo whatever it wrote to stderr.

    Args:
        command: Argument list (program first) handed to ``subprocess.Popen``.

    Returns:
        None. The child's stdout is not captured; only stderr is piped.
    """
    process = subprocess.Popen(command, stderr=subprocess.PIPE)
    # communicate() returns (stdout, stderr). stdout is None here because only
    # stderr is piped, so the error text is the *second* element.
    _, err = process.communicate()
    if err:
        # Under Python 3 the pipe yields bytes; decode so the text prints cleanly.
        if not isinstance(err, str):
            err = err.decode('utf-8', 'replace')
        print(err)

In [33]:
# Scratch files exchanged with the external binaries. os.path.join keeps the paths
# valid whether or not data_folder ends with a separator.
embed_file = os.path.join(data_folder, 'embeddings.txt')
walk_file = os.path.join(data_folder, 'walks.txt')
init_file = os.path.join(data_folder, 'init.txt')
beta_file = os.path.join(data_folder, 'betas.txt')
edge_file = os.path.join(data_folder, 'e_list.txt')


# Argument lists are built once and re-run for every time period: the walker reads
# edge_file and writes walk_file; the embedding updater reads walk_file and init_file
# and writes embed_file.
emb_command = rf_command(walk_file, embed_file, init_file, beta_file, beta=15)
walk_command = n2v_command(edge_file, walk_file, p=1, q=1)
# Filled in by the loop below (fitted model and its predicted probabilities).
classifier = None
pred = None
# Walk through the subgraphs in order. The first one gets fresh embeddings from the
# LINE binary; every later one has its embeddings updated from the previous period via
# the walk and embedding-update commands. A random forest is fitted when i == 4 and
# evaluated when i == 8, after which the loop stops.
for i, sg in enumerate(sgs):
    print("Current time period: (%d/%d)" % (i + 1, len(sgs)))
    # NOTE(review): `cum` is not read anywhere in the visible notebook; presumably a
    # leftover -- confirm before removing.
    cum = graph.subgraph_within_dates(sgs[0].min_date, sg.max_date).nx_graph

    # Both external pipelines read this period's edges from edge_file.
    sg.save_edgelist(edge_file)
    if i == 0:
        # Bootstrap: run LINE on the first edge list, load its output, and write it to
        # init_file, which the embedding-update command takes as its -init argument.
        run_command(line_command(edge_file, output=embed_file))
        sg.load_embeddings(embed_file)
        sg.save_embeddings(init_file, 128)
        prev_embeds = store_core_embeds(sg.embeddings)
    else:
        prev = sgs[i - 1]
        if i == 4:
            print("\tFit 4-9")
            # Label graph covers this period's min_date through sgs[i + 4].max_date;
            # the features are the previous period's embeddings.
            train_graph = graph.subgraph_within_dates(sg.min_date, sgs[i + 4].max_date)
            train_graph.embeddings = prev.embeddings
            train_graph.emb_cols = prev.emb_cols
            # NOTE(review): the meaning of the .5 argument is not visible here -- confirm
            # against make_pairs_with_edges.
            train_pairs = prev.make_pairs_with_edges(train_graph, .5, enforce_non_edge=False, enforce_has_embeddings=True)
            df_train, y_train = prev.to_dataframe(pairs=train_pairs, label_graph=train_graph)
            rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
            # Only the embedding columns are used as model inputs.
            fields = prev.emb_cols
            x_train = df_train.loc[:, fields]
            classifier = rf.fit(x_train, y_train)
            print("\tModel fitted")
        if i == 8:
            print("\tTesting...")
            # Same construction as the training step, four periods later.
            # NOTE(review): sgs[i+4] is sgs[12] here, so this needs at least 13 subgraphs;
            # with fewer it raises IndexError.
            test_graph = graph.subgraph_within_dates(sg.min_date, sgs[i+4].max_date)
            test_graph.embeddings = prev.embeddings
            test_graph.emb_cols = prev.emb_cols
            test_pairs = prev.make_pairs_with_edges(test_graph, .5, enforce_non_edge=False, enforce_has_embeddings=True)
            df_test, y_test = prev.to_dataframe(test_pairs, label_graph=test_graph)
            fields = prev.emb_cols
            x_test = df_test.loc[:, fields]
            pred = classifier.predict_proba(x_test)
            print("Prediction made.... Done")
            # Stop before updating embeddings for this period; later subgraphs are
            # only used as labels.
            break
        # NOTE(review): presumably seeds this period's embeddings from the previous
        # period's -- confirm against generate_embeddings_with_prev.
        sg.generate_embeddings_with_prev(prev.embeddings, 128)
        print("\tWalking...")
        run_command(walk_command)
        print("\tUpdating embeddings...")
        run_command(emb_command)
        print("\tMerging updated embeddings")
        sg.load_embeddings(embed_file)  # update embeddings with output from w2v
        # Persist the updated embeddings as the -init input for the next period.
        sg.save_embeddings(init_file, 128)
        # Measure how far the core nodes moved since the previous snapshot, then
        # take a new snapshot for the next iteration.
        distance = core_movement(sg.embeddings)
        prev_embeds = store_core_embeds(sg.embeddings)
        print("\tDistance this iteration: %.4f" % distance)

Current time period: (1/13)


	Loaded embeddings. Dimensions: (2559, 128)


Current time period: (2/13)


	Walking...


	Updating embeddings...


	Merging updated embeddings


	Loaded embeddings. Dimensions: (3496, 128)


	Distance this iteration: 5.1637
Current time period: (3/13)


	Walking...


	Updating embeddings...


	Merging updated embeddings


	Loaded embeddings. Dimensions: (5037, 128)


	Distance this iteration: 4.0208
Current time period: (4/13)


	Walking...


	Updating embeddings...


	Merging updated embeddings


	Loaded embeddings. Dimensions: (6543, 128)


	Distance this iteration: 4.0508
Current time period: (5/13)


	Fit 4-9


	Found 16027 new edges out of 32054 total pairs
	Using the pairs you provided...


	32054 pairs checked and 32054 pairs in dataframe


	Model fitted
	Walking...


	Updating embeddings...


	Merging updated embeddings


	Loaded embeddings. Dimensions: (8288, 128)


	Distance this iteration: 3.7371
Current time period: (6/13)


	Walking...


	Updating embeddings...


	Merging updated embeddings


	Loaded embeddings. Dimensions: (9442, 128)


	Distance this iteration: 3.7387
Current time period: (7/13)


	Walking...


	Updating embeddings...


	Merging updated embeddings


	Loaded embeddings. Dimensions: (11063, 128)


	Distance this iteration: 3.1477
Current time period: (8/13)


	Walking...


	Updating embeddings...


	Merging updated embeddings


	Loaded embeddings. Dimensions: (16387, 128)


	Distance this iteration: 3.4984
Current time period: (9/13)


	Testing...


	Found 8766 new edges out of 17532 total pairs
	Using the pairs you provided...


	17532 pairs checked and 17532 pairs in dataframe


Prediction made.... Done


In [34]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the positive-class probabilities (column 1 of predict_proba's output)
# against the test labels. Call-form print matches the rest of the notebook.
print(roc_auc_score(y_test, pred[:, 1]))

0.879362304496


In [35]:
# Pick the fourth subgraph for ad-hoc inspection of its embeddings in the cells below.
g1 = sgs[3]

In [36]:
# Placeholder list with one single-element list per embedding column; only its length
# is inspected before `emb` is reassigned in the next cell.
emb = [[1] for _ in g1.emb_cols]

In [37]:
# Check that an embedding has as many entries as there are embedding columns, then
# stack two embeddings into one array.
# NOTE(review): 2 and 5 are used as keys into g1.embeddings -- presumably node ids; confirm.
print(len(g1.embeddings[5]))
print(len(emb))
emb = np.stack((g1.embeddings[2], g1.embeddings[5]))

128
128


In [38]:
# Length of the first row of the stacked array.
print(len(emb[0]))

128


In [39]:
# Element-wise mean of two embeddings (axis=0) keeps the embedding length.
print(len(np.mean((g1.embeddings[2], g1.embeddings[5]), axis=0)))

128
