# Data Preparation

## Setup

In [1]:
from neo4j import GraphDatabase

# URI of the Neo4j instance that every query cell in this notebook talks to.
uri = "neo4j://localhost:7687"
# Shared driver object; the query cells open short-lived sessions from it.
# No auth token is passed here.
driver = GraphDatabase.driver(uri)

## Ground Truth Generation

Calculate Neighborhood Overlap:

In [None]:
# Neighbourhood overlap of every INTERACTION relationship between two users:
#
#     |N(u1) ∩ N(u2)| / (|N(u1) ∪ N(u2)| - 2)
#
# The union counted by the query contains u1 and u2 themselves (each is a
# neighbour of the other), which is why 2 is subtracted. The result is written
# to the relationship property `neighbourhood_overlap`.
with driver.session() as session:
    result = session.run('''
    // The undirected pattern matches each relationship in both orientations;
    // the id ordering keeps exactly one row per relationship.
    MATCH (u1:User)-[i:INTERACTION]-(u2:User)
    WHERE u1.id < u2.id
    CALL {
        WITH u1, u2
        // Expand from the already-bound endpoints, as the intersection
        // subquery does, instead of re-finding them by id.
        UNWIND [u1, u2] AS ut
        MATCH (ut)-[:INTERACTION]-(un:User)
        WHERE un.id <> ut.id
        RETURN COUNT(DISTINCT un) AS un_union
    }
    CALL {
        WITH u1, u2
        MATCH (u1)-[:INTERACTION]-(un:User)-[:INTERACTION]-(u2)
        WHERE un.id <> u1.id AND un.id <> u2.id
        RETURN COUNT(DISTINCT un) AS un_intersection
    }
    WITH i, un_intersection, un_union - 2 AS un_denominator
    // When the endpoints have no neighbours besides each other the
    // denominator is zero; store 0.0 instead of dividing by zero.
    SET i.neighbourhood_overlap = CASE
        WHEN un_denominator > 0
            THEN toFloat(un_intersection) / toFloat(un_denominator)
        ELSE 0.0
    END
''')
    # Consume inside the session so the query is fully executed and any
    # server-side failure is raised in this cell.
    summary = result.consume()

# Number of properties written, as a quick sanity check of the update.
summary.counters.properties_set

Local Clustering Coefficient:

In [None]:
# Local clustering coefficient of every user with at least one interaction:
#
#     C(u) = 2 * e / (k * (k - 1))
#
# k = number of distinct neighbours of u
# e = number of distinct neighbour pairs that are themselves connected
# The result is written to the node property `local_clustering_coefficient`.
with driver.session() as session:
    result = session.run('''
    // Degree must count ALL neighbours of u. An id-ordering filter here would
    // drop every neighbour with a smaller id and understate k.
    MATCH (u:User)-[:INTERACTION]-(un:User)
    WHERE un.id <> u.id
    WITH u, COUNT(DISTINCT un) AS k
    CALL {
        WITH u
        MATCH (un1:User)-[:INTERACTION]-(u)-[:INTERACTION]-(un2:User),
              (un1)-[:INTERACTION]-(un2)
        // The id ordering counts each unordered neighbour pair once.
        WHERE un1.id < un2.id AND un1.id <> u.id AND un2.id <> u.id
        // Parallel relationships must not count the same pair repeatedly.
        WITH DISTINCT un1, un2
        RETURN COUNT(*) AS e
    }
    // With fewer than two neighbours there are no neighbour pairs and the
    // denominator is zero; store 0.0 instead of dividing by zero.
    SET u.local_clustering_coefficient = CASE
        WHEN k > 1 THEN (2.0 * toFloat(e)) / toFloat(k * (k - 1))
        ELSE 0.0
    END
''')
    # Consume inside the session so the query is fully executed and any
    # server-side failure is raised in this cell.
    summary = result.consume()

# Number of properties written, as a quick sanity check of the update.
summary.counters.properties_set

Local Clustering Deviation (absolute difference between the local clustering coefficients of an interaction's two endpoints):

In [None]:
# For every INTERACTION relationship, store the absolute difference between the
# `local_clustering_coefficient` values of its two endpoint users in the
# relationship property `local_clustering_deviation`.
# This reads the node property written by the clustering-coefficient cell, so
# that cell has to run first.
with driver.session() as session:
    result = session.run('''
    // The undirected pattern matches each relationship in both orientations;
    // the id ordering keeps exactly one row per relationship.
    MATCH (u1:User)-[i:INTERACTION]-(u2:User)
    WHERE u1.id < u2.id
    SET i.local_clustering_deviation = abs(u1.local_clustering_coefficient - u2.local_clustering_coefficient)
''')
    # Consume inside the session so the query is fully executed and any
    # server-side failure is raised in this cell.
    summary = result.consume()

# Number of properties written, as a quick sanity check of the update.
summary.counters.properties_set

Tie Strength:

In [None]:
# Ground-truth tie strength of every INTERACTION relationship: the equally
# weighted mean of (1 - local_clustering_deviation) and neighbourhood_overlap,
# written to the relationship property `gt_tie_strength`.
# Both input properties are read from the relationship itself, so the cells
# that write them have to run first.
with driver.session() as session:
    result = session.run('''
    // The undirected pattern matches each relationship in both orientations;
    // the id ordering keeps exactly one row per relationship.
    MATCH (u1:User)-[i:INTERACTION]-(u2:User)
    WHERE u1.id < u2.id
    SET i.gt_tie_strength = ((1 - i.local_clustering_deviation) * 0.5) + (i.neighbourhood_overlap * 0.5)
''')
    # Consume inside the session so the query is fully executed and any
    # server-side failure is raised in this cell.
    summary = result.consume()

# Number of properties written, as a quick sanity check of the update.
summary.counters.properties_set

## Feature Generation

## Sampling

Sample random interactions:

In [2]:
# Number of interactions drawn by the random-sampling query.
sample_size = 10_000

In [3]:
# Mark a uniformly random sample of `sample_size` INTERACTION relationships by
# setting their `set` property to "default".
# NOTE(review): the random ordering comes from Cypher's rand(), so a re-run
# picks a different sample, and relationships marked by an earlier run are not
# cleared here — confirm whether that is intended.
with driver.session() as session:
    result = session.run('''
    CALL {
        // The undirected pattern matches each relationship in both
        // orientations; the id ordering keeps one row per relationship.
        MATCH (u1:User)-[i:INTERACTION]-(u2:User)
        WHERE u1.id < u2.id
        RETURN i, rand() AS r
        ORDER BY r
        LIMIT $sample_size
    }
    SET i.set = "default"
''', sample_size=sample_size)  # passed as a query parameter, not spliced into the text
    # Consume inside the session so the query is fully executed and any
    # server-side failure is raised in this cell.
    summary = result.consume()

# Number of properties written; expected to equal the number of sampled rows.
summary.counters.properties_set

## Split Generation