In [78]:
from graph.collabgraph import CollabGraph
from graph.fbgraph import FBGraph
from graph.enrongraph import EnronGraph
from graph.twittergraph import TwitterGraph
import networkx as nx
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, average_precision_score, auc
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support
from sklearn.ensemble import AdaBoostClassifier
import numpy as np

In [79]:
file_names = {'southafrica': '0',
              'fb': 'FacebookFilteredAdj_90Days_6ActiveTimes_30OutInDeg.mat',
              'enron': 'EnronDirectedWithCc_7days.mat',
              'collab': 'citation2Filtered.mat'}
data_root = '/Volumes/pond/Temp/twitter/'

In [80]:
# graph = CollabGraph.load_collab_graph(data_root + file_names['collab'])
# graph = FBGraph.load_fb_graph(data_root + file_names['fb'])
graph = EnronGraph.load_enron_graph(data_root + file_names['enron'])
# graph = TwitterGraph.rt_graph_from_json(data_root, 0)

In [81]:
sgs = graph.subgraphs_of_length(periods=10)
print(len(sgs))
for sg in sgs:
    print(sg.nx_graph.number_of_edges())

(0, 10)
(10, 20)
(20, 30)
(30, 40)
(40, 50)


(50, 60)
(60, 70)
(70, 80)
(80, 90)
(90, 100)
(100, 110)


(110, 120)
(120, 130)
(130, 140)
(140, 150)
(150, 160)


(160, 170)
(170, 180)
(180, 190)
19
15
5
22
36
51
97
185
180
273
410
452
456
562
623
614
799
518
88
22


In [82]:
g_0 = graph.subgraph_within_dates(sgs[0].min_date, sgs[13].max_date)
print(sgs[0].min_date, sgs[6].max_date)

(0, 70)


In [83]:
g_1 = graph.subgraph_within_dates(sgs[14].min_date, sgs[14].max_date)
g_2 = graph.subgraph_within_dates(sgs[15].min_date, sgs[15].max_date)
print(sgs[7].min_date, sgs[7].max_date)
print(sgs[8].min_date, sgs[8].max_date)

(70, 80)
(80, 90)


In [84]:
repeat_edges = 0
repeat_times = 0
for u, v, data in g_1.nx_graph.edges_iter(data=True):
    if g_0.nx_graph.has_edge(u, v):
        repeat_edges += 1
        for time in data['timestamps']:
            for time2 in g_0.nx_graph.edge[u][v]['timestamps']:
                if time == time2:
                    repeat_times += 1
print("%d edges repeated\n%d timestamps repeated" % (repeat_edges, repeat_times))

357 edges repeated
0 timestamps repeated


In [85]:
print g_0.nx_graph.number_of_edges()
print g_1.nx_graph.number_of_edges()
print g_2.nx_graph.number_of_edges()

1370
614
799


In [58]:
# nx.write_edgelist(g_0.nx_graph, 'fb-fucked.edgelist')
# g_0.save_edgelist('collab_edges.txt')

In [86]:
def dw_predict(prefix, g_0, g_1, g_2, n_files):
    for i in range(0, n_files):
        f_name = prefix + str(i + 1) + '.txt'
        g_0.embeddings = None
        g_0.load_embeddings(f_name, 1024)
        g_1.embeddings = g_0.embeddings
        g_1.emb_cols = g_0.emb_cols

        train_pairs = g_0.make_pairs_with_edges(g_1, .5, enforce_has_embeddings=True, enforce_non_edge=False)
        test_pairs = g_1.make_pairs_with_edges(g_2, 0, enforce_non_edge=False, enforce_has_embeddings=True)

        df_train, y_train = g_0.to_dataframe(pairs=train_pairs, label_graph=g_1)
        rf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
        # rf = SVC(kernel='linear', probability=True)
        # rf = AdaBoostClassifier(n_estimators=500)
        fields = g_0.emb_cols
        x_train = df_train.loc[:, fields]
        classifier = rf.fit(x_train, y_train)

        df_test, y_test = g_1.to_embedding_dataframe(test_pairs, g_2)

        # fields = g_1.emb_cols
        x_test = df_test.loc[:, fields]
        pred = classifier.predict_proba(x_test)
        print("Prediction made.... Done")

        #print(roc_auc_score(y_test, pred[:, 1]))
        auc = roc_auc_score(y_test, pred[:, 1])
        prauc = average_precision_score(y_test, pred[:, 1])
        ndcg = ndcg_score(y_test, pred[:, 1], k=50)

        print("%d / %d: AUC: %.4f PR-AUC: %.4f NDCG: %.4f " % (i + 1, n_files, auc, prauc, ndcg))

        # pr_curve = precision_recall_curve(y_test, pred[:, 1])

        # print auc(pr_curve[1], pr_curve[0], reorder=True)
        #print average_precision_score(y_test, pred[:, 1])

In [87]:
def dcg_score(y_true, y_score, k=10, gains="exponential"):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    best = dcg_score(y_true, y_true, k, gains)
    actual = dcg_score(y_true, y_score, k, gains)
    return actual / best

In [88]:
dw_predict('/Volumes/pond/Temp/walks/enron1024_', g_0, g_1, g_2, 10)

	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
1 / 10: AUC: 0.8879 PR-AUC: 0.4654 NDCG: 0.7270 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
2 / 10: AUC: 0.8885 PR-AUC: 0.4612 NDCG: 0.7086 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
3 / 10: AUC: 0.8859 PR-AUC: 0.4487 NDCG: 0.7541 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
4 / 10: AUC: 0.8888 PR-AUC: 0.4379 NDCG: 0.6132 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
5 / 10: AUC: 0.8910 PR-AUC: 0.4506 NDCG: 0.7651 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
6 / 10: AUC: 0.8878 PR-AUC: 0.4454 NDCG: 0.7169 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
7 / 10: AUC: 0.8934 PR-AUC: 0.4642 NDCG: 0.8185 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
8 / 10: AUC: 0.8864 PR-AUC: 0.4774 NDCG: 0.8083 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
9 / 10: AUC: 0.8941 PR-AUC: 0.4529 NDCG: 0.5916 


	739 edges out of 15051 pairs
Precomputing katzes....


	1130 pairs checked and 1130 pairs in dataframe


Prediction made.... Done
10 / 10: AUC: 0.8845 PR-AUC: 0.4563 NDCG: 0.7787 


	1064 edges out of 216153 pairs


Precomputing katzes....


	2664 pairs checked and 2664 pairs in dataframe


In [93]:
# df_test, y_test = g_1.to_dataframe(pairs=test_pairs, label_graph=g_2)


Prediction made.... Done


0.903194276162


0.128433247883
0.12804152304


In [85]:
bin_labels = []
for p in pred[: , 1]:
    if p > .5:
        bin_labels.append(1)
    else:
        bin_labels.append(0)

In [86]:
pr, re, fs, su = precision_recall_fscore_support(y_test, bin_labels, average='macro')
print pr
print re

0.509187074254
0.822296250805


In [1]:
# sgs = graph.subgraphs_of_length(periods=1)
# tg = TwitterGraph.rt_graph_from_json(data_root, 0)
# sgs = graph.subgraphs_of_length(days=7)

In [2]:
# confirm that no edge/timestamps are repeated in different subgraphs
for i, sg in enumerate(sgs):
    if i > 0:
        n_edges = 0
        n_time_stamps = 0
        prev = sgs[i - 1]
        for u, v, data in prev.nx_graph.edges_iter(data=True):
            if sg.nx_graph.has_edge(u, v):
                n_edges += 1
                for time in data['timestamps']:
                    for time2 in sg.nx_graph.edge[u][v]['timestamps']:
                        if time == time2:
                            n_time_stamps += 1
                       # delta = time2 - time
                       # if delta.days < .5:
                       #     print time, time2, prev.min_date, sg.max_date
        print("%d - %d\n\t%d edges, %d timestamps" % (i, i -1, n_edges, n_time_stamps))
        print(sg.min_date,sg.max_date)
        print(prev.min_date,prev.max_date)

NameError: name 'sgs' is not defined

In [21]:
print sgs[9].nx_graph.number_of_edges()

2571
