In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from twittergraph import TwitterGraph as tg
import os
import subprocess
from sklearn.ensemble import RandomForestClassifier

In [3]:
graph = tg.rt_graph_from_json('/Users/tomfw/Downloads/DataShared/', 0)
data_folder = '/Users/tomfw/TwitterREU/temp/'

In [4]:
sgs = graph.subgraphs_of_length(7)
print("Made %d graphs." % len(sgs))

Made 13 graphs.


In [5]:
edges = 0
for i, sg in enumerate(sgs):
    e = sg.nx_graph.number_of_edges()
    edges += e
    print("%d: %d edges,  %d nodes" % (i, e, sg.nx_graph.number_of_nodes()))
print("\nOriginal graph edges: %d\nSum of edges  in subgraphs: %d" % (graph.nx_graph.number_of_edges(), edges))

0: 3315 edges,  20718 nodes
1: 1917 edges,  20718 nodes


2: 3528 edges,  20718 nodes
3: 4479 edges,  20718 nodes
4: 6799 edges,  20718 nodes
5: 4544 edges,  20718 nodes
6: 6528 edges,  20718 nodes
7: 24124 edges,  20718 nodes
8: 4566 edges,  20718 nodes
9: 2839 edges,  20718 nodes
10: 2349 edges,  20718 nodes


11: 1792 edges,  20718 nodes
12: 1081 edges,  20718 nodes

Original graph edges: 58193
Sum of edges  in subgraphs: 67861


In [6]:
core_nodes = []
prev_embeds = [] # subgraphs * len(core_nodes)
for _ in sgs:
    prev_embeds.append([])
for node in graph.nx_graph.nodes_iter():
    is_core = True
    for sg in sgs:
        if sg.nx_graph.degree(node) == 0:
            is_core = False
    if is_core:
        core_nodes.append(node)
            
print("Found %d core nodes." % len(core_nodes))

Found 56 core nodes.


In [7]:
def store_core_embeds(embed_dict):
    embeds = []
    for node in core_nodes:
        embeds.append(embed_dict[node])
    return embeds

def core_movement(embed_dict):
    dist = 0
    for i, node in enumerate(core_nodes):
        dist += embedding_distance(prev_embeds[i], embed_dict[node])
    return dist

def embedding_distance(x1, x2):
    d = 0
    for x, y in zip(x1, x2):
        d += (x - y) ** 2
    return np.sqrt(d)

In [8]:
line_path = '/Users/tomfw/Downloads/temporalnode2vec/lineLinux/line'
rf_path = '/Users/tomfw/Downloads/temporalnode2vec/word2vec/retrofit_word2vec_one'

In [9]:
def line_command(train, output, size=128, threads=8, negative=5):
    # todo: order, rho, etc...
    command = [line_path, "-train", train, "-output",  output, "-size", str(size), "-threads", str(threads),
               "-negative", str(negative)]
    return command

In [10]:
def rf_command(input, output, init, size=128, window=5, sample=0, negative=5, threads=8, beta=1):
    command = [rf_path,"-train", input, "-init", init, "-output", output,
               "-size", str(size), "-window", str(window), "-sample",
               str(sample), "-negative", str(negative), "-threads", str(threads), "-beta", str(beta), "-cbow", '0']
    return command

def clear_data():
    process = subprocess.Popen(['/bin/rm', '-rf', data_folder + '*'], stderr=subprocess.PIPE)
    err = process.communicate()
    print err

In [15]:
clear_data()

(None, '')


In [10]:
embed_file = data_folder + 'embeddings.txt'
walk_file = data_folder + 'walks.txt'
init_file = data_folder + 'init.txt'
emb_command = rf_command(walk_file, embed_file, init_file, beta=.5)
classifier = None
pred = None
for i, sg in enumerate(sgs):
    if i == 0:
        edge_file = data_folder + 'e_list.txt'
        sg.save_edgelist(edge_file)
        command = line_command(edge_file, output=embed_file)
        process = subprocess.Popen(command, stderr=subprocess.PIPE)
        err = process.communicate()
        if err[0]:
            print err
        sg.load_embeddings(embed_file)
        sg.save_embeddings(init_file, 128)
        prev_embeds = store_core_embeds(sg.embeddings)
    else:
        prev = sgs[i - 1]
        
        if i == 4:
            print("Fit 4-9")
            train_graph = graph.subgraph_within_dates(sg.min_date, sgs[i + 4].max_date)
            train_graph.embeddings = prev.embeddings
            train_graph.emb_cols = prev.emb_cols
            train_pairs = prev.make_pairs_with_edges(train_graph, .5, enforce_non_edge=False, enforce_has_embeddings=True)
            df_train, y_train = prev.to_dataframe(pairs=train_pairs, label_graph=train_graph)
            rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
            fields = prev.emb_cols
            x_train = df_train.loc[:, fields]
            classifier = rf.fit(x_train, y_train)
            print("Model fitted")
        if i == 8:
            test_graph = graph.subgraph_within_dates(sg.min_date, sgs[i+4].max_date)
            test_graph.embeddings = prev.embeddings
            test_graph.emb_cols = prev.emb_cols
            test_pairs = prev.make_pairs_with_edges(test_graph, .5, enforce_non_edge=False, enforce_has_embeddings=True)
            df_test, y_test = prev.to_dataframe(test_pairs, label_graph=test_graph)
            fields = prev.emb_cols
            x_test = df_test.loc[:, fields]
            pred = classifier.predict_proba(x_test)
            print("Prediction made.... Done")
            break
        sg.generate_embeddings_with_prev(prev.embeddings, 128)
        sg.perform_walks()
        sg.save_walks(walk_file)
        process = subprocess.Popen(emb_command, stderr=subprocess.PIPE)
        err = process.communicate()
        if err[0]:
            print(err)
        sg.load_embeddings(embed_file)  # update embeddings with output from w2v
        sg.save_embeddings(init_file, 128)
        distance = core_movement(sg.embeddings)
        prev_embeds = store_core_embeds(sg.embeddings)
        print("Distance this iteration: %.4f" % distance)

Loaded: 1


Loaded embeddings. Dimensions: (2559, 128)


Performing walks.....


Loaded: 1


Loaded embeddings. Dimensions: (3496, 128)


Distance this iteration: 0.2268


Performing walks.....


Loaded: 1


Loaded embeddings. Dimensions: (5037, 128)


Distance this iteration: 0.0174


Performing walks.....


Loaded: 1


Loaded embeddings. Dimensions: (6543, 128)


Distance this iteration: 0.0198
Fit 4-9


Found 16027 new edges out of 32055 total pairs
Using the pairs you provided...
Precomputing katzes....


32055 pairs checked and 32055 pairs in dataframe


Model fitted


Performing walks.....


Loaded: 1


Loaded embeddings. Dimensions: (8288, 128)


Distance this iteration: 0.0109


Performing walks.....


Loaded: 1


Loaded embeddings. Dimensions: (9442, 128)


Distance this iteration: 0.0336


Performing walks.....


  if self.run_code(code, result):


Loaded: 1


Loaded embeddings. Dimensions: (11063, 128)


Distance this iteration: 0.0217


Performing walks.....


Loaded: 1


Loaded: 10001


Loaded embeddings. Dimensions: (16387, 128)


Distance this iteration: 0.0181


Found 8766 new edges out of 17533 total pairs
Using the pairs you provided...
Precomputing katzes....


17533 pairs checked and 17533 pairs in dataframe


Prediction made.... Done


In [11]:
from sklearn.metrics import roc_auc_score
print roc_auc_score(y_test, pred[:, 1])

0.928001510497
