In [1]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
import math
import twittergraph as tg
import random
import re
import json

In [3]:
# Load the Twitter graph through the project's twittergraph helper.
# NOTE(review): the directory is machine-specific, and the meaning of the
# second argument (0) is defined inside twittergraph -- confirm there.
twitter_data_dir = '/Volumes/pond/Temp/twitter/'
graph = tg.LoadTwitterGraph(twitter_data_dir, 0)

Loaded 181416 tweets


In [4]:
# Report the extremes of tg.timeList and the number of edges in the loaded graph.
newest_stamp = np.max(tg.timeList)
oldest_stamp = np.min(tg.timeList)
edge_total = graph.number_of_edges()

print("Max Date: %s" % newest_stamp)
print("Min Date: %s" % oldest_stamp)
print("Graph edges: %d" % edge_total)

Max Date: 2014-06-13 22:05:22
Min Date: 2014-03-15 04:26:31


Graph edges: 230889


In [5]:
def remove_edges_after(split, g):
    """Return a copy of ``g`` without the interactions posted after ``split``.

    Every edge carries a ``'posted'`` list of timestamps. In the copy, each
    timestamp strictly greater than ``split`` is dropped, and for every
    dropped timestamp one counter is decremented:

    * ``'n_uses'`` on ``u`` if ``u`` has node type ``'hashtag'``;
    * otherwise ``'n_uses'`` on ``v`` if ``v`` has node type ``'hashtag'``;
    * otherwise ``'n_links'`` on the edge itself.

    Edges left with an empty ``'posted'`` list are removed from the copy.

    All timestamps are examined, so the result does not depend on the order
    of the ``'posted'`` list (checking only the head element would leave
    late timestamps behind whenever an earlier one precedes them).

    Args:
        split: Cut-off value; must be comparable with the ``'posted'`` entries.
        g: Graph exposing the networkx 1.x style ``copy()``, ``edges()``,
            ``edge``, ``node`` and ``remove_edge`` members. All edits are
            applied to ``g.copy()``.

    Returns:
        The pruned copy of ``g``.
    """
    new_graph = g.copy()
    for u, v in g.edges():
        edge_data = new_graph.edge[u][v]
        posted = edge_data['posted']
        kept = [stamp for stamp in posted if stamp <= split]
        dropped = len(posted) - len(kept)
        if dropped:
            # Rebind instead of mutating in place so a list that might still
            # be shared with the source graph is never modified.
            edge_data['posted'] = kept
            if new_graph.node[u]['type'] == 'hashtag':
                new_graph.node[u]['n_uses'] -= dropped
            elif new_graph.node[v]['type'] == 'hashtag':
                new_graph.node[v]['n_uses'] -= dropped
            else:
                edge_data['n_links'] -= dropped
        if not kept:
            new_graph.remove_edge(u, v)
    return new_graph

In [6]:
# Two cut-off dates produce three graphs: g_0 (pruned at the first split),
# g_1 (pruned at the second split) and g_2 (the full graph).
# The middle time period is shortened because it has more edges.
first_split = datetime.datetime(2014, 5, 5)
second_split = datetime.datetime(2014, 5, 10)

g_0 = remove_edges_after(first_split, graph)
tg.remove_degree_zero_nodes(g_0)  # presumably prunes in place; return value is unused

g_1 = remove_edges_after(second_split, graph)
tg.remove_degree_zero_nodes(g_1)

g_2 = graph  # same object as `graph`, not a copy

train_new_edges = g_1.number_of_edges() - g_0.number_of_edges()
test_new_edges = g_2.number_of_edges() - g_1.number_of_edges()
print("New edges in training set: %d" % train_new_edges)
print("New edges in testing set: %d" % test_new_edges)

# Sanity check: count the nodes of g_0 whose degree in g_1 is zero.
deg = 0
for node in g_0.nodes_iter():
    if g_1.degree(node) != 0:
        continue
    deg += 1
print("g1 deg 0: %d" % deg)
print("g0 edges: %d" % g_0.number_of_edges())

New edges in training set: 63759
New edges in testing set: 37261
g1 deg 0: 0
g0 edges: 129869


In [22]:
# Keyword arguments common to the training and the testing frame.
frame_opts = dict(pairs=False, sampling=.001, min_degree=20)

# Training frame: built from g_0 with g_1 as label graph; labels taken from g_1.
df_train = tg.dataframe_from_graph(g_0, label_graph=g_1, **frame_opts)
y_train = tg.labels_for_dataframe(df_train, g_1)

# Testing frame: built from g_1 with g_2 as label graph; labels taken from g_2.
df_test = tg.dataframe_from_graph(g_1, label_graph=g_2, **frame_opts)
y_test = tg.labels_for_dataframe(df_test, g_2)
# Author's notes:
# - the test set is forced to contain roughly 50% links
# - this is clearly cheating, but it shows the effect of imbalanced data
# - unsurprisingly, scores are better
# - what is really needed is a magic function that tells us where nodes
#   will form links ~50% of the time

0 in set so far...


53122 pairs and 0 edges in dataframe


0 in set so far...


77839 pairs and 0 edges in dataframe


In [23]:
# Summing each label vector gives the number of positive (new-edge) rows.
n_train_positive = np.sum(y_train)
n_test_positive = np.sum(y_test)

print("Training on %d new edges " % n_train_positive)
print("Testing on %d new edges" % n_test_positive)

Training on 23023 new edges 
Testing on 19530 new edges


In [24]:
# Columns of the frames that are used as model features.
fields = ['adam', 'jac', 'spl', 'nbrs', 'att']
x_train = df_train.loc[:, fields]
x_test = df_test.loc[:, fields]

# Random forest with a fixed random_state; n_jobs=-1 is passed for parallelism.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    n_jobs=-1,
)
# rf = LinearSVC()

# Fit on the training features, then score the test features.
# Column 1 of `pred` is what later cells treat as the positive-class score.
classifier = rf.fit(x_train, y_train)
pred = classifier.predict_proba(x_test)

In [33]:
# Threshold column 1 of `pred` at 0.5 (strictly greater) to obtain a plain
# list of True/False predictions, one per row of `pred`.
bin_pred = (pred[:, 1] > .5).tolist()

In [34]:
# ROC AUC computed from the raw column-1 scores.
auc_from_scores = roc_auc_score(y_test, pred[:, 1])
print(auc_from_scores)

# ROC AUC computed from the thresholded True/False predictions.
auc_from_labels = roc_auc_score(y_test, bin_pred)
print(auc_from_labels)

# Number of rows predicted as positive.
predicted_positive_count = np.sum(bin_pred)
print(predicted_positive_count)

0.861023932766
0.794070052149
22254


  return a[slice1]-a[slice2]


In [41]:
# Precision, recall and F-score of the thresholded predictions (average='binary').
# Recall is bound to `rec`, not `re`: the notebook imports the `re` module in
# its first cell, and rebinding that name here would break any later `re.` call.
(pr, rec, fs, su) = precision_recall_fscore_support(y_test, bin_pred, average='binary')
print("Precision: %.4f" % pr)
print("Recall: %.4f" % rec)
print("F-Score: %.4f" % fs)
# Support (`su`) is intentionally not printed: scikit-learn documents it as
# None whenever `average` is set, so "%.4f" % su would raise a TypeError.
# print("Support: %.4f" % su)

Precision: 0.6375
Recall: 0.7265
F-Score: 0.6791


In [36]:
# if we force all new edges into the test set:
# precision: .6375
# recall : .7265
# f-score: .6791

In [37]:
# Tally agreement between the thresholded predictions and the test labels.
correct = 0          # prediction equals label
incorrect = 0        # prediction differs from label
correct_edges = 0    # label true and predicted true (reported as true positives)
incorrect_edges = 0  # label true but predicted false (reported as false negatives)
false_positive = 0   # label false but predicted true
false_negative = 0   # initialised but never updated below; incorrect_edges is reported instead

for idx in range(df_test.shape[0]):
    predicted = bin_pred[idx]
    actual = y_test[idx]

    if predicted != actual:
        incorrect += 1
        if actual:
            incorrect_edges += 1
        else:
            false_positive += 1
        continue

    correct += 1
    if actual:
        correct_edges += 1

print("Correct predictions: %d" % correct)
print("Incorrect predictions: %d\n" % incorrect)

print("%d true positive" % correct_edges)
print("%d false negative" % incorrect_edges)
print("%d false positives" % false_positive)

Correct predictions: 64431
Incorrect predictions: 13408

14188 true positive
5342 false negative
8066 false positives
