In [1]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
import math
from twittergraph import TwitterGraph as tg
from collections import defaultdict
import random
import re
import json

In [2]:
# graph = tg.rt_graph_from_json('/Users/tomfw/Downloads/DataShared/', 0)
graph = tg.rt_graph_from_json('/Volumes/pond/Temp/twitter/', 0)

In [29]:
first_split = datetime.datetime(2014, 5, 5)  #shorten the middle time period because it has more edges
second_split = datetime.datetime(2014, 5, 10)

g_0 = graph.subgraph_within_dates(graph.min_date, first_split)
g_0.load_embeddings('g_1.walked')

g_1 = graph.subgraph_within_dates(first_split, second_split)
g_1.emb_cols = g_0.emb_cols
g_1.embeddings = g_0.embeddings

g_2 = graph.subgraph_within_dates(second_split, graph.max_date)
g_2.embeddings = g_0.embeddings
g_2.emb_cols = g_0.emb_cols
print("New edges in training set: %d" % (g_1.nx_graph.number_of_edges() - g_0.nx_graph.number_of_edges()))
print("New edges in testing set: %d" % (g_2.nx_graph.number_of_edges() - g_1.nx_graph.number_of_edges()))

Loading embeddings....
('Embedding Dimensions:', (20718, 65))
Loaded: 1


Loaded: 10001


Loaded: 20001


New edges in training set: -10432
New edges in testing set: -8311


In [30]:
dates = []
for u, v in g_1.nx_graph.edges_iter():
    for date in g_1.nx_graph.edge[u][v]['posted']:
        dates.append(date)

In [31]:
print np.max(dates)
print np.min(dates)

2014-05-09 23:59:33
2014-05-05 00:29:04


In [37]:
print("Graph Edges: %d, Nodes: %d" % (graph.nx_graph.number_of_edges(), graph.nx_graph.number_of_nodes()))
print("G0 Edges: %d, Nodes: %d" % (g_0.nx_graph.number_of_edges(), g_0.nx_graph.number_of_nodes()))
print("G1 Edges: %d, Nodes: %d" % (g_1.nx_graph.number_of_edges(), g_1.nx_graph.number_of_nodes()))
print("G2 Edges: %d, Nodes: %d" % (g_2.nx_graph.number_of_edges(), g_2.nx_graph.number_of_nodes()))

Graph Edges: 58193, Nodes: 20718
G0 Edges: 30510, Nodes: 20718
G1 Edges: 20078, Nodes: 20718
G2 Edges: 11767, Nodes: 20718


In [33]:
train_pairs = g_0.make_pairs_with_edges(g_1, .5)
test_pairs = g_1.make_pairs_with_edges(g_2, .5)

Found 17996 new edges out of 35993 total pairs
Found 10665 new edges out of 21331 total pairs


In [18]:
# g_0.katz_for_pairs(train_pairs, '/Users/tomfw/Downloads/SA_RT_ADJ/G_0/SA_RT_ADJ_', max_length=6, beta=0.5)
# g_1.katz_for_pairs(test_pairs, '/Users/tomfw/Downloads/SA_RT_ADJ/G_1/SA_RT_ADJ_', max_length=6, beta=0.5)

In [19]:
print g_0.embeddings[6][0]

-5.148341


In [34]:
df_train, y_train = g_0.to_dataframe(pairs=train_pairs, sampling=None, label_graph=g_1, min_katz=0, verbose=True)

df_test, y_test = g_1.to_dataframe(pairs=test_pairs, sampling=None, label_graph=g_2, min_katz=0, verbose=True)

Using the pairs you provided...
Precomputing katzes....


35993 pairs checked and 35993 pairs in dataframe
Using the pairs you provided...
Precomputing katzes....


21331 pairs checked and 21331 pairs in dataframe


In [35]:
print("Training on %d new edges out of %d pairs " % (np.sum(y_train), df_train.shape[0]))
print("Testing on %d new edges out of %d pairs" % (np.sum(y_test), df_test.shape[0]))

Training on 17996 new edges out of 35993 pairs 
Testing on 10665 new edges out of 21331 pairs


In [36]:
rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
# rf = LinearSVC()
#  fields = ['katz_centrality', 'att', 'adam', 'jac',  'nbrs', 'spl']
fields = g_0.emb_cols
x_train = df_train.loc[:, fields]
x_test = df_test.loc[:, fields]

classifier = rf.fit(x_train, y_train)
pred = classifier.predict_proba(x_test)

In [38]:
bin_pred = []
for i in range(len(pred)):
    if pred[i, 1] > .495:
        bin_pred.append(True)
    else:
        bin_pred.append(False)

In [39]:
print(roc_auc_score(y_test, pred[:, 1]))
print(np.sum(bin_pred))

0.886478620455
8566


In [39]:
(pr, re, fs, su) = precision_recall_fscore_support(y_test, bin_pred, average='binary')
print("Precision: %.4f" % pr)
print("Recall: %.4f" % re)
print("F-Score: %.4f" % fs)
# print("Support: %.4f" % su)

Precision: 0.9237
Recall: 0.7419
F-Score: 0.8228


In [26]:
for f, imp in zip(fields, rf.feature_importances_):
    pass  # print("%s - %.3f" % (f, imp))

In [27]:
correct = 0
incorrect = 0
correct_edges = 0
incorrect_edges = 0
false_positive = 0
false_negative = 0

for i in range(0,df_test.shape[0]):
    prediction = bin_pred[i]
    actu = y_test[i]
    if prediction == actu:
        correct += 1
        if actu:
            correct_edges += 1
    else:
        incorrect += 1
        if actu:
            incorrect_edges += 1
        else:
            false_positive += 1

print("Correct predictions: %d" % correct)
print("Incorrect predictions: %d\n" % incorrect)

print("%d true positive" % correct_edges)
print("%d false negative" % incorrect_edges)
print("%d false positives" % false_positive)

Correct predictions: 17932
Incorrect predictions: 3399

7954 true positive
2711 false negative
688 false positives
