In [1]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
import math
import twittergraph as tg
import random

In [3]:
# Directory holding the Twitter dump on the author's machine; change this
# path when running the notebook elsewhere.
twitter_data_dir = '/Volumes/pond/Temp/twitter/'
# NOTE(review): the meaning of the second positional argument (0) is defined
# in twittergraph and is not visible from this notebook.
graph = tg.LoadTwitterGraph(twitter_data_dir, 0)

Loaded 181416 tweets


In [4]:
# Build two independently perturbed copies of the loaded graph. Each copy
# loses roughly 10% of its edges, chosen at random.
# NOTE(review): no random.seed(...) call is visible in this notebook, so the
# split (and the counts printed below) will differ from run to run.
g_train = graph.copy()
g_test = graph.copy()

# Iterate over list(...) snapshots: on networkx >= 2.0, edges() and nodes()
# are live views, and removing elements while looping over them raises
# "RuntimeError: dictionary changed size during iteration".
for u, v in list(g_train.edges()):
    if random.random() < .1:
        g_train.remove_edge(u, v)

for u, v in list(g_test.edges()):
    if random.random() < .1:
        g_test.remove_edge(u, v)

# Drop, from both graphs, every node whose degree is below 3 in either one.
# Degrees are read at the moment the node is visited, so earlier removals
# in this loop can lower the degree of nodes visited later.
for node in list(g_test.nodes()):
    nTest = nx.degree(g_test, node)
    nTrain = nx.degree(g_train, node)

    if nTrain < 3 or nTest < 3:
        g_test.remove_node(node)
        g_train.remove_node(node)


# Size of the training graph after edge dropping and node pruning.
print(g_train.number_of_edges())
print(g_train.number_of_nodes())

146747
18865


In [35]:
# Same sampling value is used for both the training and the testing frame.
sample_rate = 0.001

print("Computing training features...")
df_train = tg.dataframe_from_graph(g_train, sampling=sample_rate)

print("Computing testing features...")
# NOTE(review): the effect of pairs=False is defined in twittergraph and is
# not visible here — confirm before relying on it.
df_test = tg.dataframe_from_graph(g_test, sampling=sample_rate, pairs=False)

# Labels for the test rows are looked up against the original, unperturbed graph.
labels = tg.labels_for_dataframe(df_test, graph)

Computing training features...


178190 pairs and 142 edges in dataframe
Computing testing features...


177534 pairs and 0 edges in dataframe


In [36]:
# Sum of the test labels, reported as the number of true edges in the test set.
n_true_edges = np.sum(labels)
print("%d actual edges in test set" % n_true_edges)

13 actual edges in test set


In [37]:
# Random forest link predictor; random_state is fixed so that the fit itself
# is repeatable for a given training frame.
rf = RandomForestClassifier(n_estimators=500, max_depth=None,
                            min_samples_split=2, random_state=0)
# rf = LinearSVC()

# Feature columns fed to the model.
# NOTE(review): presumably pairwise link-prediction scores computed by
# twittergraph — confirm the exact definitions there.
fields = ['spl', 'nbrs', 'adam', 'att', 'jac']
x_train = df_train.loc[:, fields]
# scikit-learn expects a 1-D target of shape (n_samples,). Passing a column
# vector of shape (n_samples, 1) triggers a DataConversionWarning, so the
# 'link' column is flattened instead of reshaped to (-1, 1).
y_train = np.asarray(df_train.link).ravel()

x_test = df_test.loc[:, fields]
classifier = rf.fit(x_train, y_train)
# One row per test pair, one column per class (ordered as classifier.classes_).
pred = classifier.predict_proba(x_test)



In [40]:
# Convert the second predict_proba column into hard 0/1 predictions: a row is
# marked 1 when that probability exceeds .001, otherwise 0.
bin_pred = [1 if pred[row, 1] > .001 else 0 for row in range(len(pred))]

In [44]:
# Second predict_proba column, used as the ranking score.
positive_scores = pred[:, 1]

# AUC computed from the raw probabilities.
auc_from_scores = roc_auc_score(labels, positive_scores)
print(auc_from_scores)

# AUC computed from the thresholded 0/1 predictions.
auc_from_binary = roc_auc_score(labels, bin_pred)
print(auc_from_binary)

# Sum of the predicted probabilities over all test rows.
print(positive_scores.sum())

0.875299477028
0.870693088098
120.980747745


In [45]:
# Precision / recall / F-score of the thresholded predictions for the positive class.
(pr, re, fs, su) = precision_recall_fscore_support(labels, bin_pred, average='binary')
# Recent scikit-learn versions return support=None whenever `average` is not
# None, which would make the "%.4f" format below raise TypeError. Fall back
# to counting the positive labels directly in that case.
if su is None:
    su = np.sum(np.asarray(labels) == 1)
print("Precision: %.4f Recall: %.4f F-Score: %.4f Support: %.4f" % (pr, re, fs, su))

Precision: 0.0020 Recall: 0.7692 F-Score: 0.0040 Support: 13.0000


In [None]:
# AUC is good this way when using probabilities
# AUC can be good with binary predictions with significant adjustment of the threshold
# Similarly, recall or precision may be OK, but the F-score is always very, very bad
# Problem might be due to imbalance of the data... There are ~180K pairs, but only 150 or so edges in the training set
# I could force there to be more edges in the training set to see if that improves the situation.
# Relative to the number of pairs of nodes, however, edges are very rare in the data, so
# I'm not sure if adjusting the ratio in the training set would be that helpful