In [4]:
import datetime
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
import math
import twittergraph as tg
import random

In [5]:
# Location of the shared Twitter data set. NOTE(review): this is an absolute
# path on one developer's machine; anyone else must edit it before running.
data_dir = '/Users/tomfw/Downloads/DataShared/'

# The meaning of the second positional argument (0) is defined inside
# twittergraph and is not visible from this notebook -- TODO confirm.
graph = tg.LoadTwitterGraph(data_dir, 0)

Loaded 181416 tweets


In [95]:
# Build two independently thinned copies of the full graph. Each copy loses
# every edge with probability HOLDOUT_FRACTION; the untouched `graph` is kept
# so that labels can later be looked up against the complete edge set.
HOLDOUT_FRACTION = .2
SPLIT_SEED = 0

# A dedicated, seeded generator makes the split repeatable across kernel
# restarts without touching the global `random` state.
split_rng = random.Random(SPLIT_SEED)


def _drop_random_edges(g, fraction, rng):
    """Remove each edge of `g` in place with probability `fraction`.

    g        -- graph exposing `edges()` and `remove_edge(u, v)`
                (presumably a networkx graph; verify against twittergraph).
    fraction -- per-edge removal probability in [0, 1].
    rng      -- object with a `random()` method returning floats in [0, 1).
    """
    # Snapshot the edges before mutating: on networkx >= 2.0 `edges()` is a
    # live view, and removing edges while iterating it raises RuntimeError.
    for u, v in list(g.edges()):
        if rng.random() < fraction:
            g.remove_edge(u, v)


g_train = graph.copy()
g_test = graph.copy()

_drop_random_edges(g_train, HOLDOUT_FRACTION, split_rng)
_drop_random_edges(g_test, HOLDOUT_FRACTION, split_rng)

In [5]:
def _featurize(split_graph, announcement):
    """Print `announcement`, then build the feature frame and labels for one split.

    Features are computed from the thinned `split_graph`; labels are looked up
    against the full `graph`. The `sampling` and `pairs` options are passed
    straight through to twittergraph, where their exact semantics are defined.
    Returns a `(dataframe, labels)` tuple.
    """
    print(announcement)
    frame = tg.dataframe_from_graph(split_graph, sampling=.5, pairs=False)
    return frame, tg.labels_for_dataframe(frame, graph)


df_train, labels_train = _featurize(g_train, "Computing training features...")
df_test, labels_test = _featurize(g_test, "Computing testing features...")

Computing training features...
0 in set so far...
50000 in set so far...
100000 in set so far...
150000 in set so far...
200000 in set so far...
250000 in set so far...
300000 in set so far...
311310 pairs and 0 edges in dataframe
Computing testing features...
0 in set so far...
50000 in set so far...
100000 in set so far...
150000 in set so far...
200000 in set so far...
250000 in set so far...
300000 in set so far...
310036 pairs and 0 edges in dataframe


In [6]:
# Summing the test labels counts the positive entries, which the message
# below reports as the number of actual edges among the test rows.
test_edge_total = np.sum(labels_test)
print("%d actual edges in test set" % test_edge_total)

7831 actual edges in test set


In [23]:
# Feature columns fed to the model; the same columns are selected from both
# the training and the testing frame.
fields = ['adam', 'jac', 'spl', 'nbrs', 'att']
x_train = df_train[fields]
x_test = df_test[fields]
y_train = labels_train

# 500 trees with a fixed random_state so repeated fits on the same data agree.
# Alternative estimator left disabled: rf = LinearSVC()
# (note that LinearSVC offers no predict_proba, which the last line relies on).
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)

# fit() hands back the fitted estimator, so `classifier` and `rf` are the same object.
classifier = rf.fit(x_train, y_train)

# One row per test sample, one column per class; column 1 is what the
# following cells threshold and score.
pred = classifier.predict_proba(x_test)

In [79]:
# Convert the column-1 probabilities into hard predictions: a row is predicted
# True only when its probability is strictly greater than 0.35.
# .tolist() yields a plain Python list of bools, one per row of `pred`.
bin_pred = (pred[:, 1] > .35).tolist()

In [80]:
# Column 1 of `pred` is the continuous score used for the first AUC figure.
edge_scores = pred[:, 1]

# 1) AUC from the continuous scores,
# 2) AUC from the thresholded True/False predictions,
# 3) how many rows were predicted True.
print(roc_auc_score(labels_test, edge_scores))
print(roc_auc_score(labels_test, bin_pred))
print(np.sum(bin_pred))

0.770259428555
0.565606108853
3699


In [88]:
# Macro-averaged precision / recall / F-score of the thresholded predictions.
macro_scores = precision_recall_fscore_support(labels_test, bin_pred, average='macro')
pr, re, fs, su = macro_scores

print("Precision: %.4f" % pr)
print("Recall: %.4f" % re)
print("F-Score: %.4f" % fs)
# Support is deliberately not printed: scikit-learn returns None for it
# whenever an `average` is requested, so "%.4f" formatting would fail.
# print("Support: %.4f" % su)

Precision: 0.6370
Recall: 0.5656
F-Score: 0.5873


In [None]:
# AUC is sometimes good when it is computed from the predicted probabilities.
# AUC computed from binary predictions can also be good, but only after significantly adjusting the threshold.
# Adjusting the threshold too far in pursuit of a higher AUC results in a lower F-measure.
# Also, with binary averaging the F-measure is reported to be < .3, with macro ~.6, and with micro > .95.
# Pretty sure this is because of the significant imbalance of the labels.

In [94]:
# Tally the thresholded predictions against the ground-truth test labels.
# Boolean masks replace the per-row Python loop; both inputs are coerced to
# bool arrays so True/1 and False/0 labels compare the same way.
predicted = np.asarray(bin_pred, dtype=bool)
actual = np.asarray(labels_test, dtype=bool)

correct_edges = int(np.sum(predicted & actual))     # true positives
false_negative = int(np.sum(~predicted & actual))   # real edges that were missed
false_positive = int(np.sum(predicted & ~actual))   # predicted edges that do not exist

# `incorrect_edges` was the original name for the false-negative count; it is
# kept as an alias so both names now carry the same value (previously
# `false_negative` was initialised to 0 and never updated).
incorrect_edges = false_negative

correct = int(np.sum(predicted == actual))
incorrect = int(predicted.size - correct)

print("Correct predictions: %d" % correct)
print("Incorrect predictions: %d\n" % incorrect)

print("%d true positive" % correct_edges)
print("%d false negative" % incorrect_edges)
print("%d false positives" % false_positive)

Correct predictions: 300696
Incorrect predictions: 9340

1095 true positive
6736 false negative
2604 false positives
