dataset: https://snap.stanford.edu/data/web-Stanford.html 

***Dataset statistics***  
Nodes	281903  
Edges	2312497  
Nodes in largest WCC	255265 (0.906)  
Edges in largest WCC	2234572 (0.966)  
Nodes in largest SCC	150532 (0.534)  
Edges in largest SCC	1576314 (0.682)  
Average clustering coefficient	0.5976  
Number of triangles	11329473  
Fraction of closed triangles	0.002889  
Diameter (longest shortest path)	674  
90-percentile effective diameter	9.7  

### 0. Preprocessing

In [1]:
import numpy as np
import random

In [13]:
# import the graph dataset
dataset = set()
with open("web-Stanford.txt") as f:
    for i in range(4, 10004): # discard desciptions and pick first 10000 elements to ease the computation
        content = next(f).strip().split()
        if content[0] != content[1]:
            dataset.add((content[0], content[1]))
print(f'dataset contains {len(dataset)} edges')

dataset contains 10000 edges


### 1. TRIEST-BASE

In [91]:
def TRIEST_BASE(M, dataset=dataset):
    """
    This function estimates number of global triangles by TRIEST_BASE algorithm.
    
    argument: 
        M: size of reservoir sample, M>=6
        dataset, =dataset defaultly.
    return:
        number of global triangle count.
    """
    S = set() # edge sample from reservoir sampling
    t = 0 # time
    tau = 0 # global counter for the estimation of the global number of triangles

    
    def sample_edge(edge, t, S, tau, M=M):
        """
        This function is the reservoir sampling process.
        Notice that each edge item in the sample has equal probability.
    
        argument:
            edge: an arbitrary edge.
            t: time instance.
            S: edge sample.
            M: size of reservoir sample.
            tau: global counter.
        return:
            True or False: whether the input edge will replace an existing edge in the edge sample.
            tau: if updated.
        """
        # if t<=M, the edge on the stream at time t 
        # is deterministically inserted in S
        if t <= M:
            return [True]
    
        # if t>M, TRIEST-BASE flips a biased coin with heads probability M/t
        # If the outcome is heads, it chooses an existing edge in S uniformly at random,
        # remove it and insert the current edge on time t into S 
        elif random.random() <= (M/t): 
            del_edge = random.sample(S, 1)[0]
            S.remove(del_edge)
            #print(1,tau)
            tau = update_counters('-', del_edge, S, tau) 
            #print(2,tau)
            return [True, tau]
    
        # Otherwise S is not modified
        return [False]
    
    
    def update_counters(operation, edge, S, tau):
        """
        This function compute neighborhood of vertices and update global and local counters.
    
        argument:
            operation: '+' or '-', means insert or delete respectively.
            edge: an arbitrary edge.
            S: edge sample.
            tau: global counter.    
        return:
            tau: updated global counter.
        """
        u = edge[0]
        v = edge[1]
    
        # construct neighborhood of u and v respectively
        # $ N^S_u = {v in V^(t): (u,v) in S} $
        u_neighbor = set()
        v_neighbor = set()
        for vertices_pair in S:
            # neighborhood of u
            if u == vertices_pair[0]:
                u_neighbor.add(vertices_pair[1])
            elif u == vertices_pair[1]:
                u_neighbor.add(vertices_pair[0])
            # neighborhood of v
            if v == vertices_pair[0]:
                v_neighbor.add(vertices_pair[1])
            elif v == vertices_pair[1]:
                v_neighbor.add(vertices_pair[0])
        
        # construct shared neighborhood of u and v
        # $ N^S_u,v = intersection(N^S_u, N^S_v) $
        shared_neighbor = set.intersection(u_neighbor, v_neighbor)
        #if shared_neighbor != set():
        #   print(operation, len(shared_neighbor))
    
        # update counters
        for c in (u_neighbor & v_neighbor):
            if operation == '+':
                tau += 1
                tau_u = len(u_neighbor) + 1 # local counter for a subset of the nodes u in V^(t)
                tau_v = len(v_neighbor) + 1 # local counter for a subset of the nodes v in U^(t)
                tau_c = len(shared_neighbor) + 1 # local counter for a subset of the nodes in shared neighborhood of u and v
            elif operation == '-':
                tau -= 1
                tau_u = len(u_neighbor) - 1
                if tau_u <= 0:
                    del tau_u
                tau_v = len(v_neighbor) - 1
                if tau_v <= 0:
                    del tau_v
                tau_c = len(shared_neighbor) - 1
                if tau_c <= 0:
                    del tau_c
        return tau
    
    
    for edge in dataset:
        if t%1000 == 0:
            print(f'element index = {t}, tau = {tau}')
            
        t += 1 # update time
        result = sample_edge(edge, t, S, tau)
        if result[0] is True:
            S.add(edge)
            if len(result) > 1:
                tau = result[1] # update tau value of deletion step
            tau = update_counters('+', edge, S, tau)  
        
    epsilon = (t*(t-1)*(t-2)) / (M*(M-1)*(M-2))
    if epsilon < 1:
        epsilon = 1
    return epsilon*tau

#### Test

In [96]:
print('Compute true value:')
true_value = TRIEST_BASE(len(dataset)) # M = len(dataset) = 10000
print(f'True value is {true_value}.')

print('\nCompute estimate value by reservoir sampling:')
estimate_value = TRIEST_BASE(1000) # pick arbitrary M=1000
print(f'Estimate value is {estimate_value}.')

print(f'\nError: {abs(estimate_value - true_value)} triangles.')
print(f'Error rate: {abs(estimate_value - true_value)/true_value}.')

Compute true value:
element index = 0, tau = 0
element index = 1000, tau = 38
element index = 2000, tau = 353
element index = 3000, tau = 1012
element index = 4000, tau = 2311
element index = 5000, tau = 4472
element index = 6000, tau = 7453
element index = 7000, tau = 11718
element index = 8000, tau = 16982
element index = 9000, tau = 22681
True value is 29882.0.

Compute estimate value by reservoir sampling:
element index = 0, tau = 0
element index = 1000, tau = 38
element index = 2000, tau = 44
element index = 3000, tau = 46
element index = 4000, tau = 38
element index = 5000, tau = 39
element index = 6000, tau = 40
element index = 7000, tau = 44
element index = 8000, tau = 45
element index = 9000, tau = 33
Estimate value is 31083.890122587516.

Error: 1201.890122587516 triangles.
Error rate: 0.04022120750242675.
