# <center>Creating dictionaries with information on each user, for each dataset</center>

In [95]:
import networkx as nx
from time import time
from networkx.algorithms.centrality import closeness_centrality, betweenness_centrality, katz_centrality

Choose dataset.

In [86]:
# --- Run configuration ---------------------------------------------------------
# `dataset` selects the trace to analyse. It must be one of the keys of DATASETS:
# wcano, russian_rtu, russian_rtid, weibo_rtu, weibo_rtid, tdn10, tdn11, tdnT, test_rtu, test_rtid
dataset = 'test_rtu'
# `cascade` is only consulted for non-RTU datasets: when True, a retweet is
# attributed to the last publisher of the retweeted tweet instead of its author.
cascade = False


# dataset name -> (data_path, adjacency_list, out_path, RTU, truegraph)
#   data_path      : trace file (None when the dataset is a plain adjacency list)
#   adjacency_list : edge-list file (None when the dataset is a tweet trace)
#   out_path       : base output directory
#   RTU            : True when the last field of a trace line is a retweeted *user* id,
#                    False when it is a retweeted *tweet* id
#   truegraph      : True when the graph is read directly from `adjacency_list`
DATASETS = {
    # wcano
    'wcano': ("../Datasets/WCano/wcano_tronc.txt", None,
              "../Datasets/WCano/stats/", False, False),

    # russian
    'russian_rtu': ("../Datasets/russian_election_2018/russian_election_2018_rtu.txt", None,
                    "../Datasets/russian_election_2018/results/", True, False),
    'russian_rtid': ("../Datasets/russian_election_2018/russian_election_2018_rtid.txt", None,
                     "../Datasets/russian_election_2018/results/", False, False),

    # weibo
    'weibo_rtu': ("../Datasets/influence_locality/total_rtu.txt", None,
                  "../Datasets/influence_locality/results/", True, False),
    'weibo_rtid': ("../Datasets/influence_locality/total_rtid.txt", None,
                   "../Datasets/influence_locality/results/", False, False),

    # tdn
    'tdn10': ("../Datasets/twitter_dynamic_net/tweets2010_clean.txt", None,
              "../Datasets/twitter_dynamic_net/results2010/", True, False),
    'tdn11': ("../Datasets/twitter_dynamic_net/tweets2011_clean.txt", None,
              "../Datasets/twitter_dynamic_net/results2011/", True, False),
    'tdnT': (None, "../Datasets/twitter_dynamic_net/adjacency_list.txt",
             "../Datasets/twitter_dynamic_net/truegraph_results/", False, True),

    # test
    'test_rtu': ("../Datasets/test/test_rtu.txt", None,
                 "../Datasets/test/results/", True, False),
    'test_rtid': ("../Datasets/test/test_rtid.txt", None,
                  "../Datasets/test/results/", False, False),
}

# Fail loudly on a typo: merely printing a message would leave the variables below
# undefined (or, worse, holding the values of a previous run of this cell).
if dataset not in DATASETS:
    raise ValueError("Non existing dataset: {!r}. Expected one of: {}".format(
        dataset, ", ".join(sorted(DATASETS))))

# Every variable is (re)assigned on every run so that no stale value survives in the kernel.
data_path, adjacency_list, out_path, RTU, truegraph = DATASETS[dataset]

Append the mode-specific sub-directory (`rtu/`, `cascade/`, `truegraph/` or `rtid/`) to `out_path`.

In [58]:
# Choose the output sub-directory for the current analysis mode.
# Precedence: RTU first, then cascade, then truegraph, and plain rtid otherwise.
if RTU:
    out_suffix = "rtu/"
elif cascade:
    out_suffix = "cascade/"
elif truegraph:
    out_suffix = "truegraph/"
else:
    out_suffix = "rtid/"

# Append the suffix only once, so that re-running this cell does not
# produce paths such as ".../rtu/rtu/".
if not out_path.endswith(out_suffix):
    out_path += out_suffix

## 1. Get $\lambda, \mu, \nu$
**Important:** if we don't know the author of some RTid, the reposting user is assumed to be the author. In this case the retweet is counted as an original tweet, i.e. we increase his/her $\lambda$ (not $\mu$).

In [87]:
# Count, for every user, the number of original tweets (Lambda), of retweets made (Mu)
# and of times the user was retweeted (Nu), and measure the time span of the trace.
# Each trace line is whitespace-separated: field 0 is the tweet id, field 1 the timestamp,
# field 2 the user id, and the last field is -1 for an original tweet, otherwise the
# retweeted user id (RTU) or the retweeted tweet id (RTid).
#
# No edge is added to the user graph here: the graph is created and filled in
# section 2, which starts from an empty DiGraph anyway.
start = time()
users = set()
Lambda, Mu, Nu = dict(), dict(), dict()
first_ts, last_ts = None, None  # smallest / largest timestamp seen (lines need not be chronological)

# The context manager closes the trace file even if a line fails to parse.
with open(data_path, 'r') as tweets:

    #--- RTu case ---#
    if RTU:

        for tweet in tweets:
            tweet = tweet.split()
            uid, rtu = int(tweet[2]), int(tweet[-1])

            # unknown user: create its dictionary entries
            if uid not in users:
                users.add(uid)
                Lambda[uid], Mu[uid], Nu[uid] = 0, 0, 0

            # original tweet
            if rtu == -1:
                Lambda[uid] += 1

            # retweet: one more retweet for uid, one more "retweeted" for rtu
            else:
                Mu[uid] += 1
                if rtu not in users:
                    users.add(rtu)
                    Lambda[rtu], Mu[rtu], Nu[rtu] = 0, 0, 1
                else:
                    Nu[rtu] += 1

            # track the time span of the trace
            ts = int(tweet[1])
            first_ts = ts if first_ts is None else min(first_ts, ts)
            last_ts = ts if last_ts is None else max(last_ts, ts)

    #--- RTid case ---#
    else:

        # First pass: map every tweet id to the user who posted it and register all posting users.
        tweet_owner = dict()
        for tweet in tweets:
            tweet = tweet.split()
            twid, uid = int(tweet[0]), int(tweet[2])
            tweet_owner[twid] = uid
            if uid not in users:
                users.add(uid)
                Lambda[uid], Mu[uid], Nu[uid] = 0, 0, 0

        # Keep the historical variable names: "last publisher" in cascade mode, "author" otherwise.
        if cascade:
            LastPublisher = tweet_owner
        else:
            Author = tweet_owner

        # Second pass: update the counters.
        tweets.seek(0)
        for tweet in tweets:
            tweet = tweet.split()
            uid, rtid = int(tweet[2]), int(tweet[-1])

            # original tweet
            if rtid == -1:
                Lambda[uid] += 1

            # retweet of a known tweet
            elif rtid in tweet_owner:
                Mu[uid] += 1
                rtu = tweet_owner[rtid]
                Nu[rtu] += 1
                # in cascade mode the retweeter becomes the last publisher of the tweet
                if cascade:
                    tweet_owner[rtid] = uid

            # retweet of an unknown tweet: the retweeter is assumed to be its author,
            # so the retweet is counted as an original tweet
            else:
                Lambda[uid] += 1
                tweet_owner[rtid] = uid

            # track the time span of the trace
            ts = int(tweet[1])
            first_ts = ts if first_ts is None else min(first_ts, ts)
            last_ts = ts if last_ts is None else max(last_ts, ts)

# An empty trace would otherwise surface later as an obscure NameError/TypeError.
if first_ts is None:
    raise ValueError("No tweet found in {}".format(data_path))

total_time = last_ts - first_ts
print("Tps ex : ", time()-start)

Tps ex :  0.0


In [88]:
# Sanity check: each of the three counters must have exactly one entry per known user.
# Prints one boolean per dictionary, in the order Lambda, Mu, Nu.
for counts in (Lambda, Mu, Nu):
    print(set(counts) == users)

True
True
True


Now for each user u we divide `Lambda[u]`, `Mu[u]` and `Nu[u]` by `total_time` to get the final values for $\lambda, \mu, \nu$.

In [89]:
# Convert the raw counts into rates by dividing them by the duration of the trace.
# NOTE: the dictionaries are updated in place, so this cell must be run exactly once
# after the counting cell (running it again would divide the values a second time).
for counts in (Lambda, Mu, Nu):
    for user in users:
        counts[user] = counts[user] / total_time

Create the main dictionary `MainDict` and add $\lambda, \mu, \nu$ to it.

In [90]:
# One record per user, holding the three rates computed above.
MainDict = {
    user: {'lambda': Lambda[user], 'mu': Mu[user], 'nu': Nu[user]}
    for user in users
}

Delete `Lambda`, `Mu` and `Nu` to save memory (they're not useful anymore).

In [91]:
# The rates now live in MainDict; drop the standalone dictionaries to free memory.
del Lambda
del Mu
del Nu

## 2. Create user graph

In [92]:
# Build the directed user graph G. For a trace, an edge (a, b) means that user b
# retweeted user a; for a "true graph" the edges are read as-is from the adjacency list.
G = nx.DiGraph()
start = time()


# true graph: one edge per line of the adjacency list (first two fields are the endpoints)
if truegraph:
    # the context manager closes the file, which the bare open() in a for-loop did not
    with open(adjacency_list, 'r') as adjacency_file:
        for line in adjacency_file:
            line = line.split()
            G.add_edge(int(line[0]), int(line[1]))


# otherwise the graph is derived from a trace (rtu, cascade or rtid)
else:

    # the context manager closes the trace even if a line fails to parse
    with open(data_path, 'r') as tweets:

        # rtu: the last field directly gives the retweeted user
        if RTU:
            for tweet in tweets:
                tweet = tweet.split()
                uid, rtu = int(tweet[2]), int(tweet[-1])
                if rtu != -1:
                    G.add_edge(rtu, uid)


        # cascade (rtid): link the retweeter to the last publisher of the retweeted tweet
        elif cascade:

            # last publisher dict: tweet id -> user id
            LastPublisher = dict()
            for tweet in tweets:
                tweet = tweet.split()
                twid, uid = int(tweet[0]), int(tweet[2])
                LastPublisher[twid] = uid

            # create edges
            tweets.seek(0)
            for tweet in tweets:
                tweet = tweet.split()
                uid, rtid = int(tweet[2]), int(tweet[-1])
                if rtid != -1:
                    if rtid in LastPublisher:
                        G.add_edge(LastPublisher[rtid], uid)
                    # the retweeter becomes the last publisher, whether or not the tweet was known
                    LastPublisher[rtid] = uid


        # plain rtid (no cascade): link the retweeter to the author of the retweeted tweet
        else:

            # author dict: tweet id -> user id
            Author = dict()
            for tweet in tweets:
                tweet = tweet.split()
                twid, uid = int(tweet[0]), int(tweet[2])
                Author[twid] = uid

            # create edges
            tweets.seek(0)
            for tweet in tweets:
                tweet = tweet.split()
                uid, rtid = int(tweet[2]), int(tweet[-1])
                if rtid != -1:
                    if rtid in Author:
                        G.add_edge(Author[rtid], uid)
                    else:
                        # unknown tweet: the retweeter is assumed to be its author
                        Author[rtid] = uid

# end
print("Tps ex : ", time()-start)

Tps ex :  0.0


Add to `MainDict`:
- in and out degrees
- closeness and betweenness centrality (Katz centrality is imported but not computed yet)

In [111]:
# Compute each centrality once for the whole graph. Calling betweenness_centrality(G)
# (or closeness_centrality(G, u)) inside the loop would redo a whole-graph computation
# for every single user.
betweenness = betweenness_centrality(G)
closeness = closeness_centrality(G)

for u in MainDict:
    # A user who never retweeted and was never retweeted has no edge, hence is not a node
    # of G: report zero degrees and centralities for it instead of raising KeyError.
    in_graph = u in G
    MainDict[u]['in_degree'] = G.in_degree[u] if in_graph else 0
    MainDict[u]['out_degree'] = G.out_degree[u] if in_graph else 0
    MainDict[u]['closeness_centrality'] = closeness.get(u, 0.0)
    MainDict[u]['betweenness_centrality'] = betweenness.get(u, 0.0)

Display the resulting `MainDict` (closeness and betweenness centrality were added in the previous cell).

In [112]:
# Display the per-user records (rates, degrees and centralities) collected so far.
MainDict

{0: {'lambda': 0.2,
  'mu': 0.4,
  'nu': 0.2,
  'in_degree': 2,
  'out_degree': 1,
  'closeness_centrality': 0.6666666666666666,
  'betweenness_centrality': 0.16666666666666666},
 1: {'lambda': 0.2,
  'mu': 0.4,
  'nu': 0.0,
  'in_degree': 2,
  'out_degree': 0,
  'closeness_centrality': 0.75,
  'betweenness_centrality': 0.0},
 2: {'lambda': 0.2,
  'mu': 0.0,
  'nu': 0.4,
  'in_degree': 0,
  'out_degree': 2,
  'closeness_centrality': 0.0,
  'betweenness_centrality': 0.0},
 77: {'lambda': 0.0,
  'mu': 0.0,
  'nu': 0.2,
  'in_degree': 0,
  'out_degree': 1,
  'closeness_centrality': 0.0,
  'betweenness_centrality': 0.0}}