In [1]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
import math
import twittergraph as tg
import random
import re
import json

In [3]:
# NOTE(review): absolute, machine-specific location of the shared data set;
# anyone else running this notebook has to edit it.
DATA_DIR = '/Users/tomfw/Downloads/DataShared/'
# NOTE(review): the meaning of the second positional argument (0) is not
# visible from this notebook -- confirm against twittergraph.
graph = tg.LoadTwitterGraph(DATA_DIR, 0)

Loaded 181416 tweets


In [4]:
# Report the extremes of tg.timeList and the edge count of the loaded graph.
newest_post = np.max(tg.timeList)
oldest_post = np.min(tg.timeList)
print("Max Date: %s" % newest_post)
print("Min Date: %s" % oldest_post)
print("Graph edges: %d" % graph.number_of_edges())

Max Date: 2014-06-13 22:05:22
Min Date: 2014-03-15 04:26:31
Graph edges: 171685


In [5]:
def remove_edges_after(split, g):
    """Return a copy of ``g`` with every link posted after ``split`` removed.

    Each edge is expected to carry a ``'posted'`` list of timestamps and an
    ``'n_links'`` counter. For every edge, all timestamps strictly greater
    than ``split`` are dropped and ``'n_links'`` is reduced by the number of
    timestamps dropped. Edges left with no timestamps are removed from the
    returned copy.

    The whole ``'posted'`` list is filtered, so the result does not depend on
    the order in which timestamps are stored. (Inspecting and popping only the
    first element is correct only when the list is sorted newest-first; with
    any other order, late timestamps hidden behind an early first element
    would survive and ``'n_links'`` would be left too high.)

    Args:
        split: cut-off value; each entry of ``'posted'`` is compared to it.
        g: graph exposing ``copy()``, ``edges()``, ``edge[u][v]`` and
            ``remove_edge(u, v)``.

    Returns:
        The pruned copy of ``g``.
    """
    new_graph = g.copy()
    for u, v in g.edges():
        data = new_graph.edge[u][v]
        kept = [posted_at for posted_at in data['posted'] if posted_at <= split]
        removed = len(data['posted']) - len(kept)
        if removed:
            # Rebind to a fresh list rather than mutating the existing one.
            data['posted'] = kept
            data['n_links'] -= removed
        if not kept:
            new_graph.remove_edge(u, v)
    return new_graph

In [6]:
# Two cut-off dates produce three graphs:
#   g_0 -- graph with links after first_split removed
#   g_1 -- graph with links after second_split removed
#   g_2 -- the full graph
# The middle time period is kept short because it has more edges.
first_split = datetime.datetime(2014, 5, 5)
second_split = datetime.datetime(2014, 5, 10)

g_0 = remove_edges_after(first_split, graph)
g_1 = remove_edges_after(second_split, graph)
for snapshot in (g_0, g_1):
    tg.remove_degree_zero_nodes(snapshot)
g_2 = graph

# Edge growth between consecutive graphs.
train_new_edges = g_1.number_of_edges() - g_0.number_of_edges()
test_new_edges = g_2.number_of_edges() - g_1.number_of_edges()
print("New edges in training set: %d" % train_new_edges)
print("New edges in testing set: %d" % test_new_edges)

# Count the g_0 nodes whose degree in g_1 is zero.
deg = sum(1 for node in g_0.nodes_iter() if g_1.degree(node) == 0)
print("g1 deg 0: %d" % deg)
print("g0 edges: %d" % g_0.number_of_edges())

New edges in training set: 46261
New edges in testing set: 29901
g1 deg 0: 0
g0 edges: 95523


In [7]:
# Train on features from g_0, with labels taken from g_1.
# Test on features from g_1, with labels taken from g_2.

In [8]:
# Training rows are built from g_0 and labelled against g_1; test rows are
# built from g_1 and labelled against g_2. Only the training frame is built
# with label_graph.
# NOTE(review): presumably label_graph makes the sampler include the
# edges-to-be -- confirm in twittergraph.
SAMPLING_RATE = .01

df_train = tg.dataframe_from_graph(
    g_0, pairs=False, sampling=SAMPLING_RATE, label_graph=g_1)
y_train = tg.labels_for_dataframe(df_train, g_1)

df_test = tg.dataframe_from_graph(
    g_1, pairs=False, sampling=SAMPLING_RATE)
y_test = tg.labels_for_dataframe(df_test, g_2)

0 in set so far...


150000 in set so far...


252478 pairs and 0 edges in dataframe


0 in set so far...


150000 in set so far...


300000 in set so far...


430409 pairs and 0 edges in dataframe


In [9]:
# Sum of the label vectors, i.e. the number of positive ("new edge") rows.
train_positives = np.sum(y_train)
test_positives = np.sum(y_test)
print("Training on %d new edges " % train_positives)
print("Testing on %d new edges" % test_positives)

Training on 21858 new edges 
Testing on 193 new edges


In [10]:
# Feature columns handed to the classifier.
fields = ['adam', 'jac', 'spl', 'nbrs', 'att']
x_train = df_train.loc[:, fields]
x_test = df_test.loc[:, fields]

# Random forest with a fixed seed so repeated runs give the same model.
# (A LinearSVC was previously tried here; note that it offers no
# predict_proba, so the probability-based cells below would need changes.)
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)

classifier = rf.fit(x_train, y_train)
# One row per test pair; column 1 is used below as the positive-class score.
pred = classifier.predict_proba(x_test)

In [11]:
# Threshold column 1 of pred at 0.5 to obtain hard True/False predictions,
# stored as a plain Python list of bools.
bin_pred = (pred[:, 1] > .5).tolist()

In [11]:
# ROC AUC of the raw scores, ROC AUC of the thresholded predictions, and the
# number of rows predicted positive.
auc_scores = roc_auc_score(y_test, pred[:, 1])
auc_thresholded = roc_auc_score(y_test, bin_pred)
n_predicted_positive = np.sum(bin_pred)
print(auc_scores)
print(auc_thresholded)
print(n_predicted_positive)

0.815707769304
0.698915165979
9485


In [21]:
# Macro-averaged precision / recall / F-score of the thresholded predictions.
# Recall is bound to `rec` rather than `re` so that the `re` module imported
# at the top of the notebook is not overwritten.
# With average='macro' the fourth element (support) is None, so it is not
# printed.
# NOTE(review): macro averaging weights both classes equally; with labels this
# imbalanced, consider also reporting the positive class alone
# (average='binary').
(pr, rec, fs, su) = precision_recall_fscore_support(y_test, bin_pred, average='macro')
print("Precision: %.4f" % pr)
print("Recall: %.4f" % rec)
print("F-Score: %.4f" % fs)

Precision: 0.5041
Recall: 0.6989
F-Score: 0.5028


In [None]:
# If we force all new edges into the test set:
# precision: .5161
# recall: .4518
# f-score: .4818

In [19]:
# Tally the outcomes of the thresholded predictions against the test labels.
correct = 0          # prediction matches the label
incorrect = 0        # prediction differs from the label
correct_edges = 0    # true positives: label is positive and was predicted
incorrect_edges = 0  # same value as false_negative; name kept for compatibility
false_positive = 0   # label is negative but a positive was predicted
false_negative = 0   # label is positive but a negative was predicted

for i in range(df_test.shape[0]):
    prediction = bin_pred[i]
    actu = y_test[i]
    if prediction == actu:
        correct += 1
        if actu:
            correct_edges += 1
    elif actu:
        incorrect += 1
        false_negative += 1
    else:
        incorrect += 1
        false_positive += 1

# Keep the older name in sync so both report the same count.
incorrect_edges = false_negative

print("Correct predictions: %d" % correct)
print("Incorrect predictions: %d\n" % incorrect)

print("%d true positive" % correct_edges)
print("%d false negative" % false_negative)
print("%d false positives" % false_positive)

Correct predictions: 420893
Incorrect predictions: 9516

81 true positive
112 false negative
9404 false positives


In [87]:
# Obviously this isn't earth shattering, but it definitely seems
# that class imbalance was the issue.  This was cheating, but
# we should work to make sure that the edges-to-be end up in the 
# dataframes