In [1]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
import math
from twittergraph import TwitterGraph as tg
from collections import defaultdict
import random
import re
import json

In [3]:
# Directory holding the shared JSON dumps. Kept in one named constant so the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = '/Users/tomfw/Downloads/DataShared/'

# Build the full graph (presumably the retweet graph, going by the helper's
# name) from the JSON files under DATA_DIR.
# NOTE(review): the meaning of the second argument (0) is defined inside
# twittergraph.TwitterGraph.rt_graph_from_json - confirm before changing it.
graph = tg.rt_graph_from_json(DATA_DIR, 0)

In [5]:
# Two cut-off dates yield three graphs: g_0 (edges after first_split removed),
# g_1 (edges after second_split removed) and g_2 (built from the full graph).
# The cut-offs are only five days apart because the middle time period has
# more edges.
first_split = datetime.datetime(2014, 5, 5)
second_split = datetime.datetime(2014, 5, 10)

# NOTE(review): every graph below loads the same embeddings file, including
# g_0, even though the file is named after g_1. Confirm this is intended and
# does not leak information about the edges g_0 is later asked to predict.
embeddings_file = 'g_1.walked'

# tg.remove_degree_zero_nodes is not applied to g_0 or g_1 in this cell.
g_0 = tg.tg_by_removing_edges_after_date(graph, first_split)
g_0.load_embeddings(embeddings_file)

g_1 = tg.tg_by_removing_edges_after_date(graph, second_split)
g_1.load_embeddings(embeddings_file)

g_2 = tg.tg_with_tg(graph)
g_2.load_embeddings(embeddings_file)

# Edge-count differences between consecutive graphs give the number of new
# edges available to the training stage and to the testing stage.
edges_g0 = g_0.nx_graph.number_of_edges()
edges_g1 = g_1.nx_graph.number_of_edges()
edges_g2 = g_2.nx_graph.number_of_edges()
print("New edges in training set: %d" % (edges_g1 - edges_g0))
print("New edges in testing set: %d" % (edges_g2 - edges_g1))

New edges in training set: 17996


New edges in testing set: 9687


In [7]:
# Build the candidate node pairs for each stage: the earlier graph supplies the
# pairs and the later graph is passed in as the reference for new edges.
# The same second argument is used for both stages; the recorded output shows
# roughly half of the resulting pairs being new edges.
# NOTE(review): no random seed is set in this notebook; if
# make_pairs_with_edges samples randomly, the pairs change between runs.
pair_ratio = .5
train_pairs = g_0.make_pairs_with_edges(g_1, pair_ratio)
test_pairs = g_1.make_pairs_with_edges(g_2, pair_ratio)

Found 17996 new edges out of 35993 total pairs
Found 9687 new edges out of 19375 total pairs


In [5]:
# g_0.katz_for_pairs(train_pairs, '/Users/tomfw/Downloads/SA_RT_ADJ/G_0/SA_RT_ADJ_', max_length=6, beta=0.5)
# g_1.katz_for_pairs(test_pairs, '/Users/tomfw/Downloads/SA_RT_ADJ/G_1/SA_RT_ADJ_', max_length=6, beta=0.5)

In [9]:
# Options shared by the training and the testing feature extraction.
frame_options = dict(sampling=None, min_katz=0, verbose=True)

# Training features come from g_0 and are labelled against g_1.
df_train, y_train = g_0.to_dataframe(pairs=train_pairs, label_graph=g_1, **frame_options)

# Testing features come from g_1 and are labelled against g_2.
df_test, y_test = g_1.to_dataframe(pairs=test_pairs, label_graph=g_2, **frame_options)

Using the pairs you provided...
Precomputing katzes....


35993 pairs checked and 35993 pairs in dataframe
Using the pairs you provided...
Precomputing katzes....


19375 pairs checked and 19375 pairs in dataframe


In [11]:
# Report how many labelled pairs are positive (new edges) in each split.
train_positives, train_total = np.sum(y_train), df_train.shape[0]
print("Training on %d new edges out of %d pairs " % (train_positives, train_total))

test_positives, test_total = np.sum(y_test), df_test.shape[0]
print("Testing on %d new edges out of %d pairs" % (test_positives, test_total))

Training on 17996 new edges out of 35993 pairs 
Testing on 9687 new edges out of 19375 pairs


In [13]:
# Random forest with a fixed random_state so repeated fits on the same data
# give the same model; n_jobs=-1 uses all available cores.
# (Alternative classifier tried earlier: LinearSVC().)
forest_options = dict(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    n_jobs=-1,
)
rf = RandomForestClassifier(**forest_options)

# Feature columns: the embedding columns exposed by g_0.
# Alternative hand-crafted feature set:
#   fields = ['katz_centrality', 'att', 'adam', 'jac',  'nbrs', 'spl']
fields = g_0.emb_cols

# Select the same feature columns from both frames.
x_train, x_test = (frame.loc[:, fields] for frame in (df_train, df_test))

# Fit on the training pairs, then score the test pairs. `pred` holds the
# per-class probabilities; column 1 is read downstream as the positive class.
classifier = rf.fit(x_train, y_train)
pred = classifier.predict_proba(x_test)

In [15]:
# Turn the probabilities in column 1 of `pred` into hard True/False labels:
# a pair is predicted positive when its probability is strictly above 0.495.
bin_pred = [bool(edge_probability > .495) for edge_probability in pred[:, 1]]

In [17]:
# ROC AUC is computed from the raw probabilities in column 1 of `pred`,
# not from the thresholded labels.
auc = roc_auc_score(y_test, pred[:, 1])
print(auc)

# Number of pairs predicted positive after thresholding.
predicted_positive_count = np.sum(bin_pred)
print(predicted_positive_count)

0.879949809295
7545


In [19]:
# Precision, recall and F-score of the positive class for the thresholded
# predictions.
# Recall is bound to `rec` rather than `re`: the notebook imports the standard
# `re` module at the top, and unpacking into `re` would replace that module
# with a float for every cell that runs afterwards.
(pr, rec, fs, su) = precision_recall_fscore_support(y_test, bin_pred, average='binary')
print("Precision: %.4f" % pr)
print("Recall: %.4f" % rec)
print("F-Score: %.4f" % fs)
# `su` (support) is not printed: per the scikit-learn documentation it is None
# whenever `average` is not None, so "%.4f" % su would raise a TypeError.

Precision: 0.9258
Recall: 0.7211
F-Score: 0.8107


In [26]:
# Show each feature name next to the importance the fitted forest gave it.
# Uses the print() function like the rest of the notebook; the old
# `print x` statement form is a SyntaxError on Python 3.
print(fields)
print(rf.feature_importances_)

['katz_centrality', 'att', 'adam', 'jac', 'nbrs', 'spl']


[ 0.63594961  0.20601965  0.05225267  0.04853852  0.0220972   0.03514235]


In [20]:
# Confusion-matrix tally for the thresholded predictions on the test pairs.
# Both label vectors are flattened to boolean arrays so that the counts can be
# taken with element-wise logic instead of a Python-level index loop.
actual = np.asarray(y_test, dtype=bool).ravel()
predicted = np.asarray(bin_pred, dtype=bool).ravel()

correct_edges = int(np.sum(predicted & actual))      # true positives
false_positive = int(np.sum(predicted & ~actual))    # predicted edge, none appeared
false_negative = int(np.sum(~predicted & actual))    # real edge that was missed
# `incorrect_edges` is the name this cell has always printed for the
# false-negative count; keep it in sync with `false_negative`, which used to
# be initialised to 0 and never updated.
incorrect_edges = false_negative

correct = int(np.sum(predicted == actual))
incorrect = int(actual.size - correct)

print("Correct predictions: %d" % correct)
print("Incorrect predictions: %d\n" % incorrect)

print("%d true positive" % correct_edges)
print("%d false negative" % incorrect_edges)
print("%d false positives" % false_positive)

Correct predictions: 16113
Incorrect predictions: 3262

6985 true positive
2702 false negative
560 false positives
