In [1]:
import time
import calendar
import codecs
import datetime
import json
import sys
import gzip
import string
import glob
import re
import os
import numpy as np
from os import listdir
from os.path import isfile, join
import csv
import networkx as nx
from demjson import decode
import pandas as pd
import random
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from itertools import chain

In [2]:
# strptime pattern used to parse each tweet's 'postedTime' field.
timeFormat = "%Y-%m-%dT%H:%M:%S.%fZ"

# Running total of kept tweets; the loading cell increments it for every
# sampled tweet whose text does not start with "rt".
globalTweetCounter = 0

In [3]:
countries=['Kenya-community-relevant-restricted.json']

# Fraction of input lines kept by the random sub-sample (use 1 to keep everything).
SAMPLE_RATE = .07
# Seed the sampler so that Restart & Run All draws the same sub-sample every time.
random.seed(0)

# (Re)initialise every accumulator here so that re-running this cell does not
# double-count, and so that all files in `countries` feed the same graph and
# lists instead of each file discarding what the previous one loaded.
globalTweetCounter = 0
G=nx.Graph()
tweetList = []
timeList = []
userList = []

for file_string in countries:
    mypath='/Volumes/pond/Temp/twitter/'+file_string

    with open(mypath) as f1:
        for line in f1:
            # Skip the line unless it falls inside the sampled fraction.
            if random.random() >= SAMPLE_RATE:
                continue

            tweetObj = json.loads(line)
            currentTime = datetime.datetime.strptime(tweetObj['postedTime'], timeFormat)

            # The regex keeps only the digits that follow the last ':' of the actor id.
            id2=int(re.findall('^.*:([0-9]+)$',str(tweetObj['actor']['id']))[0])
            G.add_node(id2, parent=True, child=False)

            # One edge per mentioned user; a repeated pair overwrites the
            # stored timestamp/message with the most recently read tweet.
            for ui in tweetObj['twitter_entities']['user_mentions']:
                id1=ui['id']
                G.add_node(id1, child=True)
                G.add_edge(id1,id2, posted=currentTime, message=tweetObj['body'])

            # Best-effort: a record without a usable text body is skipped, but
            # only the errors that case can raise are swallowed.
            try:
                body = tweetObj['body'].lower()
            except (KeyError, AttributeError):
                continue

            # Tweets whose text starts with "rt" are not counted or stored.
            if not body.startswith("rt"):
                globalTweetCounter += 1
                tweetList.append(body)
                timeList.append(currentTime)
                userList.append(tweetObj['actor']['id'])
print(globalTweetCounter)

41633


In [4]:
# Earliest and latest timestamps among the collected tweets.
first_posted = np.min(timeList)
last_posted = np.max(timeList)
print(first_posted)
print(last_posted)

# Cut-off used by the following cells to separate training edges from test edges.
split = datetime.datetime(2013, 3, 1) #train/test split

2013-01-08 01:06:03
2013-04-02 23:31:23


In [5]:
# Edge count, node count and number of collected tweets, space separated.
print("%d %d %d" % (G.number_of_edges(), G.number_of_nodes(), len(timeList)))

72219 28539 41633


In [6]:
# Single pruning pass: drop every node that has fewer than 3 neighbours linked
# before `split` or fewer than 3 linked at/after it.
# NOTE: this mutates G in place, and removals made earlier in the pass lower
# the counts seen by nodes visited later.
# The node list is snapshotted because nodes are removed inside the loop.
for node in list(G.nodes()):
    nTrain = 0
    nTest = 0
    for nbr in G.neighbors(node):
        # G[node][nbr] is the same edge-attribute access used on G_train in
        # the next cell, and does not rely on the networkx-1.x-only `G.edge`.
        if G[node][nbr]['posted'] < split:
            nTrain += 1
        else:
            nTest += 1
    if nTrain < 3 or nTest < 3:
        G.remove_node(node)

In [7]:
# Training graph: a copy of G restricted to edges posted strictly before `split`.
G_train = G.copy()

# The pruning cell counts an edge as "train" only when posted < split, so an
# edge stamped exactly at `split` belongs to the test period and must be
# removed here as well (hence >=, not >).
# The edge list is snapshotted because edges are removed inside the loop.
for u,v in list(G_train.edges()):
    if G_train[u][v]['posted'] >= split:
        G_train.remove_edge(u,v)


In [8]:
# Report the size of the full graph and of the training graph.
full_counts = (G.number_of_edges(), G.number_of_nodes())
train_counts = (G_train.number_of_edges(), G_train.number_of_nodes())
print("Full graph: %d edges %d nodes" % full_counts)
print("Training graph: %d edges %d nodes" % train_counts)

Full graph: 18286 edges 2057 nodes
Training graph: 8763 edges 2057 nodes


In [9]:
# Per-feature memo tables, laid out as cache[first_node][second_node] -> score.
# Every training-graph node gets its own (initially empty) inner dict.
jacDict = {node: {} for node in G_train.nodes()}
adamDict = {node: {} for node in G_train.nodes()}
nbrDict = {node: {} for node in G_train.nodes()}
attDict = {node: {} for node in G_train.nodes()}

def get_jac(u, v):
    """Return the Jaccard coefficient of the pair (u, v) in G_train.

    The pair is put in sorted order so (u, v) and (v, u) share one entry in
    jacDict; the score is computed on first request and looked up afterwards.
    """
    (u,v) = sorted((u,v))
    if v not in jacDict[u]:
        # Score the pair passed in as arguments. The previous code scored the
        # globals (n1, n2), which only worked when the caller's loop variables
        # happened to carry those names.
        for x,y,p in nx.jaccard_coefficient(G_train, [(u, v)]):
            jacDict[u][v] = p
    return jacDict[u][v]

def get_adam(u, v):
    """Return the Adamic-Adar index of the pair (u, v) in G_train.

    The pair is put in sorted order so (u, v) and (v, u) share one entry in
    adamDict; the score is computed on first request and looked up afterwards.
    A pair whose score cannot be computed because of a division by zero is
    cached as 0.
    """
    (u,v) = sorted((u,v))
    if v not in adamDict[u]:
        # Score the pair passed in as arguments, not the globals (n1, n2).
        j = nx.adamic_adar_index(G_train, [(u, v)])
        try:
            for x,y,p in j:
                adamDict[u][v] = p
        except ZeroDivisionError:
            # Only the numeric failure is treated as "score 0"; anything else
            # (including KeyboardInterrupt during the long loop) propagates.
            adamDict[u][v] = 0
    return adamDict[u][v]

def get_att(u,v):
    """Return the preferential-attachment score of the pair (u, v) in G_train.

    The pair is put in sorted order so (u, v) and (v, u) share one entry in
    attDict; the score is computed on first request and looked up afterwards.
    """
    (u,v) = sorted((u,v))
    if v not in attDict[u]:
        # Score the pair passed in as arguments. The previous code scored the
        # globals (n1, n2), which only worked when the caller's loop variables
        # happened to carry those names.
        for x,y,p in nx.preferential_attachment(G_train, [(u, v)]):
            attDict[u][v] = p
    return attDict[u][v]

def get_nbrs(u, v):
    """Return how many common neighbours u and v have in G_train.

    The pair is sorted so both argument orders hit the same nbrDict entry;
    the count is computed once and served from the cache afterwards.
    """
    first, second = sorted((u, v))
    cache = nbrDict[first]
    if second in cache:
        return cache[second]
    # Exhaust the iterator just to count its items.
    total = sum(1 for _ in nx.common_neighbors(G_train, first, second))
    cache[second] = total
    return total
        
def all_pairs(graph):
    """Iterate over the edges of `graph`, then over its non-edges."""
    linked = graph.edges()
    unlinked = nx.non_edges(graph)
    return chain(linked, unlinked)

In [10]:
progress = 0

# Compute all four pair features once for every pair of the training graph so
# that the memo tables are filled before the data frames are built.
for progress, (n1, n2) in enumerate(all_pairs(G_train), 1):
    # Progress line every million pairs.
    if progress % 1000000 == 0:
        print(progress)
    get_nbrs(n1, n2)
    get_jac(n1, n2)
    get_adam(n1, n2)
    get_att(n1, n2)
            
        

1000000
2000000


In [11]:
# Training set: one row per pair of the training graph (edges and non-edges);
# the label says whether the pair is linked in G_train.
u, v = [], []
has_links, jac_co, adam, att, nbrs = [], [], [], [], []

for n1, n2 in all_pairs(G_train):
    u.append(n1)
    v.append(n2)
    has_links.append(G_train.has_edge(n1,n2))
    jac_co.append(get_jac(n1,n2))
    adam.append(get_adam(n1, n2))
    att.append(get_att(n1, n2))
    nbrs.append(get_nbrs(n1, n2))
count = len(u)

# Columns are attached one by one, in this order.
df_train = pd.DataFrame()
train_columns = [
    ('u', u),
    ('v', v),
    ('link', has_links),
    ('jac', jac_co),
    ('adam', adam),
    ('nbrs', nbrs),
    ('att', att),
]
for column_name, column_values in train_columns:
    df_train[column_name] = column_values
print("%d pairs in training set, %d edges" % (count, np.count_nonzero(has_links)))

2114840 pairs in training set, 8763 edges


In [12]:
# Test set: one row per pair that is NOT linked in the training graph; the
# label says whether the full graph G has that edge. Features still come from
# G_train through the cached helpers.
u, v = [], []
has_links, jac_co, adam, att, nbrs = [], [], [], [], []

for n1, n2 in nx.non_edges(G_train):
    u.append(n1)
    v.append(n2)
    has_links.append(G.has_edge(n1,n2))
    jac_co.append(get_jac(n1,n2))
    adam.append(get_adam(n1, n2))
    att.append(get_att(n1, n2))
    nbrs.append(get_nbrs(n1, n2))
count = len(u)

# Columns are attached one by one, in this order.
df_test = pd.DataFrame()
test_columns = [
    ('u', u),
    ('v', v),
    ('link', has_links),
    ('jac', jac_co),
    ('adam', adam),
    ('att', att),
    ('nbrs', nbrs),
]
for column_name, column_values in test_columns:
    df_test[column_name] = column_values
print("%d pairs in test set, %d true edges" % (count, np.count_nonzero(has_links)))

2106077 pairs in test set, 9271 true edges


In [13]:
# Random forest trained on the selected pair features to predict `link`.
rf = RandomForestClassifier(n_estimators=10, max_depth=None,
    min_samples_split=2, random_state=0)
#rf = SVC(C=.001, gamma=1, probability=True)
# Feature columns fed to the model; only preferential attachment is used here.
fields = ['att']
x_train = df_train.loc[:, fields]
# Pass the labels as a 1-D array. Reshaping the Series to a column vector
# makes scikit-learn emit a DataConversionWarning and is not reliable across
# pandas versions.
y_train = df_train.link.values

x_test = df_test.loc[:, fields]
classifier = rf.fit(x_train, y_train)
# One row per test pair, one column per class.
pred = classifier.predict_proba(x_test)



In [14]:
# ROC AUC of the second predict_proba column against the true test labels.
# Labels are passed as a 1-D array rather than a reshaped column vector.
roc_auc_score(df_test.link.values, pred[:, 1])

0.78073559874141929