# ID2222 Homework 3: Mining Data Streams

## Hongyi Luo (hluo@kth.se) & Yage Hao (yage@kth.se)

### 0. Preprocessing

dataset: https://snap.stanford.edu/data/web-Stanford.html 

***Dataset statistics***  
Nodes	281903  
Edges	2312497  
Nodes in largest WCC	255265 (0.906)  
Edges in largest WCC	2234572 (0.966)  
Nodes in largest SCC	150532 (0.534)  
Edges in largest SCC	1576314 (0.682)  
Average clustering coefficient	0.5976  
Number of triangles	11329473  
Fraction of closed triangles	0.002889  
Diameter (longest shortest path)	674  
90-percentile effective diameter	9.7  

In [1]:
import numpy as np
import random

In [2]:
# import the graph dataset
dataset = set()
with open("web-Stanford.txt") as f:
    for i in range(4, 10000+4): # discard desciptions and pick first 10000 elements to ease the computation
        content = next(f).strip().split()
        if content[0] != content[1]:
            dataset.add((content[0], content[1]))
print(f'dataset contains {len(dataset)} edges')

dataset contains 10000 edges


### 1. TRIEST-BASE

In [26]:
def TRIEST_BASE(M, dataset=dataset):
    """
    This function estimates number of global triangles by TRIEST_BASE algorithm.
    
    argument: 
        M: size of reservoir sample, M>=6
        dataset, =dataset defaultly.
    return:
        number of global triangle count.
        number of local triangle count.
    """
    S = set() # edge sample from reservoir sampling
    t = 0 # time
    tau = 0 # global counter for the estimation of the global number of triangles
    tau_local = {} # key: vertex or edge, value: counter
    
    def sample_edge(edge, t, S, tau, tau_local, M=M):
        """
        This function is the reservoir sampling process.
        Notice that each edge item in the sample has equal probability.
    
        argument:
            edge: an arbitrary edge.
            t: time instance.
            S: edge sample.
            M: size of reservoir sample.
            tau: global counter.
            tau_local: local counters.
        return:
            True or False: whether the input edge will replace an existing edge in the edge sample.
            tau: global counter if updated.
            tau_local: local counters if updated.
        """
        # if t<=M, the edge on the stream at time t 
        # is deterministically inserted in S
        if t <= M:
            return [True]
    
        # if t>M, TRIEST-BASE flips a biased coin with heads probability M/t
        # If the outcome is heads, it chooses an existing edge in S uniformly at random,
        # remove it and insert the current edge on time t into S 
        elif random.random() <= (M/t): 
            del_edge = random.sample(S, 1)[0]
            S.remove(del_edge)
            result = update_counters('-', del_edge, S, tau, tau_local)
            tau = result[0]
            tau_local = result[1]
            return [True, tau, tau_local]
    
        # Otherwise S is not modified
        return [False]
    
    
    def update_counters(operation, edge, S, tau, tau_local):
        """
        This function compute neighborhood of vertices and update global and local counters.
    
        argument:
            operation: '+' or '-', means insert or delete respectively.
            edge: an arbitrary edge.
            S: edge sample.
            tau: global counter.
            tau_local: local counters.
        return:
            tau: updated global counter.
            tau_local: updated local counters.
        """
        u = edge[0]
        v = edge[1]
    
        # construct neighborhood of u and v respectively
        # $ N^S_u = {v in V^(t): (u,v) in S} $
        u_neighbor = set()
        v_neighbor = set()
        for vertices_pair in S:
            # neighborhood of u
            if u == vertices_pair[0]:
                u_neighbor.add(vertices_pair[1])
            elif u == vertices_pair[1]:
                u_neighbor.add(vertices_pair[0])
            # neighborhood of v
            if v == vertices_pair[0]:
                v_neighbor.add(vertices_pair[1])
            elif v == vertices_pair[1]:
                v_neighbor.add(vertices_pair[0])
        
        # construct shared neighborhood of u and v
        # $ N^S_u,v = intersection(N^S_u, N^S_v) $
        shared_neighbor = set.intersection(u_neighbor, v_neighbor)
        #if shared_neighbor != set():
        #   print(operation, len(shared_neighbor))
    
        # update counters
        for c in shared_neighbor:
            if operation == '+':
                tau += 1
                tau_local[u] = tau_local.get(u, 0) + 1 # local counter for a subset of the nodes u in V^(t)
                tau_local[v] = tau_local.get(v, 0) + 1 # local counter for a subset of the nodes v in U^(t)
                tau_local[c] = tau_local.get(c, 0) + 1 # local counter for a subset of the nodes in shared neighborhood of u and v
            elif operation == '-':
                tau -= 1
                tau_local[u] = tau_local.get(u, 0) - 1
                if tau_local[u] <= 0:
                    del tau_local[u]
                tau_local[v] = tau_local.get(v, 0) - 1
                if tau_local[v] <= 0:
                    del tau_local[v]
                tau_local[c] = tau_local.get(c, 0) - 1
                if tau_local[c] <= 0:
                    del tau_local[c]
        return [tau, tau_local]
    
    
    for edge in dataset:
        #if t%1000 == 0:
        #    print(f'element index = {t}, tau = {tau}')
            
        t += 1 # update time
        result = sample_edge(edge, t, S, tau, tau_local)
        if result[0] is True:
            S.add(edge)
            if len(result) > 1:
                tau = result[1] # update tau value of deletion step
                tau_local = result[2]
            result = update_counters('+', edge, S, tau, tau_local)  
            tau = result[0] # update tau value of insert step
            tau_local = result[1]
    
    # compute estimation of numbers of triangles
    epsilon = (t*(t-1)*(t-2)) / (M*(M-1)*(M-2))
    if epsilon < 1:
        epsilon = 1
    global_triangles = epsilon*tau
    local_triangles = {}
    for key in tau_local:
        local_triangles[key] = tau_local[key] * epsilon
    
    return [global_triangles, local_triangles]

#### Test 1.1

We set M=5000 in the above sampling case and remember the size of the dataset we use is 10000.  
We can see that when t<=M, our estimate number of triangles by TRIEST_BASE algorithm is exactly the same as true value.

In [24]:
print('Compute true value:')
true_value = TRIEST_BASE(len(dataset))[0] # M = len(dataset) = 10000
print(f'True value is {true_value}.')

print('\nCompute estimate value by reservoir sampling:')
estimate_value = TRIEST_BASE(5000)[0] # pick arbitrary M=5000
print(f'Estimate value is {estimate_value}.')

print(f'\nError: {abs(estimate_value - true_value)} triangles.')
print(f'Error rate: {abs(estimate_value - true_value)/true_value}.')

Compute true value:
element index = 0, tau = 0
element index = 1000, tau = 45
element index = 2000, tau = 364
element index = 3000, tau = 1272
element index = 4000, tau = 2806
element index = 5000, tau = 4991
element index = 6000, tau = 7781
element index = 7000, tau = 11870
element index = 8000, tau = 16893
element index = 9000, tau = 22858
True value is 29906.0.

Compute estimate value by reservoir sampling:
element index = 0, tau = 0
element index = 1000, tau = 45
element index = 2000, tau = 364
element index = 3000, tau = 1272
element index = 4000, tau = 2806
element index = 5000, tau = 4991
element index = 6000, tau = 4793
element index = 7000, tau = 4523
element index = 8000, tau = 4514
element index = 9000, tau = 4421
Estimate value is 34306.292917166866.

Error: 4400.292917166866 triangles.
Error rate: 0.14713746128425284.


#### Test 1.2

With multiple runs, we hypothesis that the results (estimation of number of global triangles) distributed normally. And if the sample mean is biased compared to the true value, we suppose that the algorithm is somehow biased.

In [98]:
estimate_list = []
for i in range(20):
    print(i)
    estimate_value = TRIEST_BASE(5000)[0] # pick arbitrary M=5000
    estimate_list.append(estimate_value)
    
mean_value = np.mean(estimate_list)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [102]:
print(f'run TRIEST_BASE for 20 times with M=5000, result list: \n{estimate_list}')
print(f'\nmean value is {mean_value}')

run TRIEST_BASE for 20 times with M=5000, result list: 
[35994.79951980792, 38411.52460984394, 41004.30252100841, 38107.43337334934, 40796.24009603842, 35786.73709483794, 38643.59423769508, 37131.140456182475, 38267.48139255703, 38891.66866746699, 36042.81392557023, 39291.7887154862, 41188.357743097244, 35594.67947178872, 34530.360144057624, 38947.685474189675, 38195.45978391357, 37459.23889555823, 37731.32052821129, 38827.64945978392]

mean value is 38042.21380552221


#### Test 1.3

The estimation of global number of triangles is very sensative to the choice of M.
As M tends to the length of the original dataset, the estimation tends to the true value and the variance of results in multiple runs shrinks.

In [104]:
print(f'true value = {true_value}')
for M in [1000, 3000, 5000, 7000, 9000]:
    estimate_value = TRIEST_BASE(M)[0]
    print(f'M = {M}, estimate number of global triangles = {estimate_value}')

true value = 30190.0
M = 1000, estimate number of global triangles = 61165.07411218834
M = 3000, estimate number of global triangles = 37655.9891144502
M = 5000, estimate number of global triangles = 36290.88835534214
M = 7000, estimate number of global triangles = 33878.99204053664
M = 9000, estimate number of global triangles = 30470.151635973678


#### Test 1.4

Here we show part of the result in estimating local number of triangles.

In [36]:
local_triangles = TRIEST_BASE(5000)[1]

dataset_example = []
for example in list(dataset)[:10]:
    dataset_example.append(example[0])
    dataset_example.append(example[1])
for i in dataset_example:
    print(f'The local number of triangles to vertex {i} is {local_triangles.get(i, 0)}.')

The local number of triangles to vertex 252300 is 8.002400960384154.
The local number of triangles to vertex 87 is 32.00960384153662.
The local number of triangles to vertex 141934 is 8.002400960384154.
The local number of triangles to vertex 172382 is 0.
The local number of triangles to vertex 251990 is 720.2160864345739.
The local number of triangles to vertex 63133 is 0.
The local number of triangles to vertex 232037 is 312.093637454982.
The local number of triangles to vertex 249008 is 0.
The local number of triangles to vertex 175144 is 72.02160864345738.
The local number of triangles to vertex 61833 is 0.
The local number of triangles to vertex 84382 is 256.07683073229293.
The local number of triangles to vertex 199822 is 192.0576230492197.
The local number of triangles to vertex 225872 is 112.03361344537817.
The local number of triangles to vertex 167080 is 0.
The local number of triangles to vertex 62574 is 648.1944777911165.
The local number of triangles to vertex 121154 is 22

### 2. TRIEST-IMPR

In [42]:
def TRIEST_IMPR(M, dataset=dataset):
    """
    This function estimates number of global triangles by TRIEST_BASE algorithm.
    
    argument: 
        M: size of reservoir sample, M>=6
        dataset, =dataset defaultly.
    return:
        number of global triangle count.
        number of local triangle count.
    """
    S = set() # edge sample from reservoir sampling
    t = 0 # time
    tau = 0 # global counter for the estimation of the global number of triangles
    tau_local = {} # key: vertex or edge, value: counter
    
    
    def sample_edge(edge, t, S, M=M):
        """
        This function is the reservoir sampling process.
        Notice that each edge item in the sample has equal probability.
    
        argument:
            edge: an arbitrary edge.
            t: time instance.
            S: edge sample.
            M: size of reservoir sample.
        return:
            True or False: whether the input edge will replace an existing edge in the edge sample.
        """
        # if t<=M, the edge on the stream at time t 
        # is deterministically inserted in S
        if t <= M:
            return True
    
        # if t>M, TRIEST-BASE flips a biased coin with heads probability M/t
        # If the outcome is heads, it chooses an existing edge in S uniformly at random,
        # remove it and insert the current edge on time t into S 
        elif random.random() <= (M/t): 
            del_edge = random.sample(S, 1)[0]
            S.remove(del_edge)
            # TRIEST-IMPR never decrements the counters when an edge is removed from S
            return True
    
        # Otherwise S is not modified
        return False
    
    
    def update_counters(edge, S, tau, tau_local, t, M=M):
        """
        This function compute neighborhood of vertices and update global and local counters.
    
        argument:
            edge: an arbitrary edge.
            S: edge sample.
            tau: global counter.  
            tau_local: local counters.
            t: time instance.
            M: sample size, default =M.
        return:
            tau: updated global counter.
            tau_local: updated local counters.
        """
        u = edge[0]
        v = edge[1]
    
        # construct neighborhood of u and v respectively
        # $ N^S_u = {v in V^(t): (u,v) in S} $
        u_neighbor = set()
        v_neighbor = set()
        for vertices_pair in S:
            # neighborhood of u
            if u == vertices_pair[0]:
                u_neighbor.add(vertices_pair[1])
            elif u == vertices_pair[1]:
                u_neighbor.add(vertices_pair[0])
            # neighborhood of v
            if v == vertices_pair[0]:
                v_neighbor.add(vertices_pair[1])
            elif v == vertices_pair[1]:
                v_neighbor.add(vertices_pair[0])
        
        # construct shared neighborhood of u and v
        # $ N^S_u,v = intersection(N^S_u, N^S_v) $
        shared_neighbor = set.intersection(u_neighbor, v_neighbor)
        #if shared_neighbor != set():
        #    print(len(shared_neighbor))
    
        # update counters
        # all the calls to update_counters have operation '+'
        # preform a weighted increase using weight
        # $ eta^t = max{1, (t-1)(t-2)/(M(M-1))} $
        eta = ((t-1)*(t-2)) / (M*(M-1))
        if eta < 1:
            eta = 1
           
        for c in shared_neighbor:
            tau += eta
            tau_local[u] = tau_local.get(u, 0) + eta # local counter for a subset of the nodes u in V^(t)
            tau_local[v] = tau_local.get(v, 0) + eta # local counter for a subset of the nodes v in U^(t)
            tau_local[c] = tau_local.get(c, 0) + eta # local counter for a subset of the nodes in shared neighborhood of u and v
        return [tau, tau_local]
    
    
    for edge in dataset:
        #if t%1000 == 0:
        #    print(f'element index = {t}, tau = {tau}')
            
        t += 1 # update time
        result = update_counters(edge, S, tau, tau_local, t) # call update_counters unconditionally
        tau = result[0]
        tau_local = result[1]
        
        if sample_edge(edge, t, S) == True:
            S.add(edge)
        
    return [tau, tau_local]

#### Test 2.1

Similar to test 1.1, we first set M=5000 in the above improved sampling case and the size of the dataset we use is 10000.  
We can see that when t<=M, our estimate number of triangles by TRIEST_IMPR algorithm is exactly the same as true value.

In [40]:
print('Compute true value:')
true_value = TRIEST_IMPR(len(dataset))[0] # M = len(dataset) = 10000
print(f'True value is {true_value}.')

print('\nCompute estimate value by reservoir sampling:')
estimate_value = TRIEST_IMPR(5000)[0] # pick arbitrary M=4000
print(f'Estimate value is {estimate_value}.')

print(f'\nError: {abs(estimate_value - true_value)} triangles.')
print(f'Error rate: {abs(estimate_value - true_value)/true_value}.')

Compute true value:
element index = 0, tau = 0
element index = 1000, tau = 45
element index = 2000, tau = 364
element index = 3000, tau = 1272
element index = 4000, tau = 2806
element index = 5000, tau = 4991
element index = 6000, tau = 7781
element index = 7000, tau = 11870
element index = 8000, tau = 16893
element index = 9000, tau = 22858
True value is 29906.

Compute estimate value by reservoir sampling:
element index = 0, tau = 0
element index = 1000, tau = 45
element index = 2000, tau = 364
element index = 3000, tau = 1272
element index = 4000, tau = 2806
element index = 5000, tau = 4991
element index = 6000, tau = 7849.278360552148
element index = 7000, tau = 12357.376413362705
element index = 8000, tau = 18256.19833822773
element index = 9000, tau = 25494.269504861073
Estimate value is 34532.72692218469.

Error: 4626.726922184687 triangles.
Error rate: 0.15470898556091378.


#### Test 2.2

We run 20 times of TRIEST_IMPR algorithm with M=5000.  
Comparing results with the ones obtained by TRIEST_BASE algorithm, we can see that the variance of improved algorithm is lower and the mean of the improved algorithm is closer to the true value as we expected.

In [107]:
estimate_list_impr = []
for i in range(20):
    print(i)
    estimate_value = TRIEST_IMPR(5000)[0] # pick arbitrary M=5000
    estimate_list_impr.append(estimate_value)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [113]:
mean_base = np.mean(estimate_list)
mean_impr = np.mean(estimate_list_impr)
var_base = np.var(estimate_list)
var_impr = np.var(estimate_list_impr)

print(f'true value is {true_value}')
print(f'mean from TRIEST_BASE is {mean_base}')
print(f'mean from TRIEST_IMPR is {mean_impr}')
print(f'variance from TRIEST_BASE is {var_base}')
print(f'variance from TRIEST_IMPR is {var_impr}')

true value is 30190.0
mean from TRIEST_BASE is 38042.21380552221
mean from TRIEST_IMPR is 35072.02691547922
variance from TRIEST_BASE is 3170998.664030402
variance from TRIEST_IMPR is 423135.8354098646


#### Test 2.3

Here we show part of the result in estimating local number of triangles.

In [43]:
local_triangles = TRIEST_IMPR(5000)[1]

dataset_example = []
for example in list(dataset)[:10]:
    dataset_example.append(example[0])
    dataset_example.append(example[1])
for i in dataset_example:
    print(f'The local number of triangles to vertex {i} is {local_triangles.get(i, 0)}.')

The local number of triangles to vertex 252300 is 9.186463452690537.
The local number of triangles to vertex 87 is 45.99086321264253.
The local number of triangles to vertex 141934 is 70.52146957391477.
The local number of triangles to vertex 172382 is 0.
The local number of triangles to vertex 251990 is 611.1803283056612.
The local number of triangles to vertex 63133 is 0.
The local number of triangles to vertex 232037 is 199.4619611922385.
The local number of triangles to vertex 249008 is 0.
The local number of triangles to vertex 175144 is 97.59409617923585.
The local number of triangles to vertex 61833 is 0.
The local number of triangles to vertex 84382 is 335.1431927185437.
The local number of triangles to vertex 199822 is 263.89288937787563.
The local number of triangles to vertex 225872 is 232.98058859771953.
The local number of triangles to vertex 167080 is 0.
The local number of triangles to vertex 62574 is 380.8097123424688.
The local number of triangles to vertex 121154 is 3