In [1]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
import math
import twittergraph as tg
import ditwittergraph as dtg
from collections import defaultdict
import random
import re
import json

In [3]:
graph = dtg.LoadTwitterGraph('/Users/tomfw/Downloads/DataShared/', 0, hashtags=True)

Loaded 181416 tweets


In [3]:
print("Max Date: %s" % np.max(dtg.timeList))
print("Min Date: %s" % np.min(dtg.timeList))
print("Graph edges: %d" % graph.number_of_edges())

Max Date: 2014-06-13 22:05:22
Min Date: 2014-03-15 04:26:31
Graph edges: 238015


In [5]:
def remove_edges_after(split, g):
    new_graph = g.copy()
    for u, v in g.edges():
        for i in range(0, len(new_graph.edge[u][v]['posted'])):
            if new_graph.edge[u][v]['posted'][0] > split:
                new_graph.edge[u][v]['posted'].pop(0)
                if not new_graph.node[u]['type'] == 'hashtag' and not new_graph.node[v]['type'] == 'hashtag':
                    new_graph.edge[u][v]['n_links'] -= 1
                else:
                    if new_graph.node[u]['type'] == 'hashtag':
                        new_graph.node[u]['n_uses'] -= 1
                    else:
                        new_graph.node[v]['n_uses'] -= 1
        if len(new_graph.edge[u][v]['posted']) == 0:
            new_graph.remove_edge(u, v)
    return new_graph

In [6]:
first_split = datetime.datetime(2014, 5, 5)  #shorten the middle time period because it has more edges
second_split = datetime.datetime(2014, 5, 10)

g_0 = remove_edges_after(first_split, graph)
#tg.remove_degree_zero_nodes(g_0)

g_1 = remove_edges_after(second_split, graph)
#tg.remove_degree_zero_nodes(g_1)

g_2 = graph.copy()
print("New edges in training set: %d" % (g_1.number_of_edges() - g_0.number_of_edges()))
print("New edges in testing set: %d" % (g_2.number_of_edges() - g_1.number_of_edges()))

New edges in training set: 65375
New edges in testing set: 38770


In [8]:
# only run this if you need a fixed set of pairs to run multiple tests on

train_pairs = []
test_pairs = []
train_dict = defaultdict(bool)
test_dict = defaultdict(bool)

for u,v in g_1.edges_iter():
    if g_1.node[u]['type'] != 'hashtag' and g_1.node[v]['type'] != 'hashtag':
        if not g_0.has_edge(u, v) and u in g_0 and v in g_0:
            (u, v) = sorted((u, v))
            train_pairs.append((u, v))
            train_dict[(u, v)] = True
n_pairs = len(train_pairs)
target = 10 * n_pairs
for u, v in nx.non_edges(g_0):
    u, v = sorted((u, v))
    if not train_dict[(u, v)]:
        train_dict[(u, v)] = True
        train_pairs.append((u, v))
        n_pairs += 1
    if n_pairs > target:
        break

for u,v in g_2.edges_iter():
    if g_2.node[u]['type'] != 'hashtag' and g_2.node[v]['type'] != 'hashtag':
        if not g_1.has_edge(u, v) and u in g_1 and v in g_1:
            (u, v) = sorted((u, v))
            test_pairs.append((u, v))
            test_dict[(u, v)] = True
n_pairs = len(test_pairs)
target = 10 * n_pairs
for u, v in nx.non_edges(g_1):
    u, v = sorted((u, v))
    if not test_dict[(u, v)]:
        test_dict[(u, v)] = True
        test_pairs.append((u, v))
        n_pairs += 1
    if n_pairs > target:
        break


del train_dict
del test_dict

print("Train size: %d" % len(train_pairs))
print("Test size: %d" % len(test_pairs))

Train size: 478771
Test size: 314101


In [27]:
df_train, y_train = dtg.dataframe_from_graph(g_0, pairs=False, sampling=0.01, label_graph=g_1, min_katz=0.0069, cheat=False)
#y_train = dtg.labels_for_dataframe(df_train, g_1)

df_test, y_test = dtg.dataframe_from_graph(g_1, pairs=False, sampling=0.01, label_graph=g_2, min_katz=0.0069, cheat=False)
#y_test = dtg.labels_for_dataframe(df_test, g_2)

Precomputing katzes....


1000000 checked... 985908 eliminated


2000000 checked... 1972367 eliminated


3000000 checked... 2957227 eliminated


4000000 checked... 3944499 eliminated


5000000 checked... 4931065 eliminated


6000000 checked... 5918030 eliminated


7000000 checked... 6904464 eliminated


7057858 pairs eliminated.... 
7156023 pairs and 0 edges in dataframe
Precomputing katzes....


1000000 checked... 984917 eliminated


2000000 checked... 1970110 eliminated


3000000 checked... 2954359 eliminated


4000000 checked... 3941305 eliminated


5000000 checked... 4927499 eliminated


6000000 checked... 5914538 eliminated


7000000 checked... 6900584 eliminated


7041728 pairs eliminated.... 
7143399 pairs and 0 edges in dataframe


In [28]:
print("Training on %d new edges out of %d pairs " % (np.sum(y_train), df_train.shape[0]))
print("Testing on %d new edges out of %d pairs" % (np.sum(y_test), df_test.shape[0]))

Training on 24252 new edges out of 98165 pairs 
Testing on 15125 new edges out of 101671 pairs


In [29]:
rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
# rf = LinearSVC()
fields = ['katz', 'att', 'adam', 'jac',  'nbrs', 'spl']
x_train = df_train.loc[:, fields]
x_test = df_test.loc[:, fields]

classifier = rf.fit(x_train, y_train)
pred = classifier.predict_proba(x_test)

In [34]:
bin_pred = []
for i in range(len(pred)):
    if pred[i, 1] > float(2)/300:
        bin_pred.append(True)
    else:
        bin_pred.append(False)

In [35]:
print(roc_auc_score(y_test, pred[:, 1]))
print(roc_auc_score(y_test, bin_pred))
print(np.sum(bin_pred))

0.780067159241
0.632405848091
65611


In [36]:
(pr, re, fs, su) = precision_recall_fscore_support(y_test, bin_pred, average='binary')
print("Precision: %.4f" % pr)
print("Recall: %.4f" % re)
print("F-Score: %.4f" % fs)
# print("Support: %.4f" % su)

Precision: 0.2007
Recall: 0.8707
F-Score: 0.3262


In [37]:
# if we force all new edges into the test set:
# precision: .6375
# recall : .7265
# f-score: .6791

In [38]:
print fields
print rf.feature_importances_

['katz', 'att', 'adam', 'jac', 'nbrs', 'spl']


[ 0.59163531  0.13310947  0.09030346  0.08016763  0.02009843  0.0846857 ]


In [39]:
correct = 0
incorrect = 0
correct_edges = 0
incorrect_edges = 0
false_positive = 0
false_negative = 0

for i in range(0,df_test.shape[0]):
    prediction = bin_pred[i]
    actu = y_test[i]
    if prediction == actu:
        correct += 1
        if actu:
            correct_edges += 1
    else:
        incorrect += 1
        if actu:
            incorrect_edges += 1
        else:
            false_positive += 1

print("Correct predictions: %d" % correct)
print("Incorrect predictions: %d\n" % incorrect)

print("%d true positive" % correct_edges)
print("%d false negative" % incorrect_edges)
print("%d false positives" % false_positive)

Correct predictions: 47275
Incorrect predictions: 54396

13170 true positive
1955 false negative
52441 false positives


In [22]:
print np.max(df_test.jac)

0.5
