# Data Preparation

## Setup

In [1]:
from neo4j import GraphDatabase

# Address of the local Neo4j instance. No credentials are passed, so the server
# is presumably running with authentication disabled -- TODO confirm.
uri = "neo4j://localhost:7687"
driver = GraphDatabase.driver(uri)

# Fail fast: an unreachable database should be reported by this setup cell
# rather than by whichever query cell happens to run first.
driver.verify_connectivity()

## Sampling

Sample random interactions:

In [7]:
# Upper bound on the number of INTERACTION relationships put into the sample.
sample_size = 10_000

In [8]:
# Tag a random sample of INTERACTION relationships with set = "default".
# The ground-truth and feature cells only process relationships carrying this tag.
# NOTE: Cypher's rand() cannot be seeded, so every run draws a different sample,
# and re-running this cell adds to an existing sample instead of replacing it.
sampling_query = '''
    CALL {
        MATCH (u1:User)-[i:INTERACTION]-(u2:User)
        WHERE u1.id < u2.id
        RETURN i, rand() AS r
        ORDER BY r
        LIMIT $sample_size
    }
    SET i.set = "default"
'''

with driver.session() as session:
    # The sample size travels as a query parameter instead of being
    # interpolated into the Cypher text.
    result = session.run(sampling_query, sample_size=sample_size)
    # Consume inside the session so the write has completed, and any
    # server-side failure is raised, before the cell finishes.
    result.consume()

In [13]:
# Export all INTERACTION relationships to "interactions.csv" with APOC.
# The inner (double-quoted) query lists each relationship once (u1.id < u2.id),
# ordered by the two user ids; the column aliases carry ":START_ID"/":END_ID"
# and ":long" suffixes.
interactions_export_query = '''
    WITH "MATCH (u1:User) -[i:INTERACTION]- (u2:User)
        WHERE u1.id < u2.id
        RETURN 'u' + u1.id AS `:START_ID`, 'u' + u2.id AS `:END_ID`, i.postings AS `postings:long`, i.channels AS `channels:long`, i.upvotes AS `upvotes:long`, i.downvotes AS `downvotes:long`, i.follows AS `follows:long`, i.ignores AS `ignores:long`, i.set
        ORDER BY u1.id, u2.id" AS query
    CALL apoc.export.csv.query(query, "interactions.csv", {quotes: "ifNeeded"})
    YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
    RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
'''

with driver.session() as session:
    result = session.run(interactions_export_query)

In [12]:
# Copy the exported interactions.csv from the Neo4j container's import directory
# into the local graph/ directory.
!docker cp tie-strength-prediction-neo4j-1:/var/lib/neo4j/import/interactions.csv graph/

[sPreparing to copy...[?25l[u[2KCopying from container - 0B[26G[0K22.4MB[26G[0K45.4MB[26G[0K60.6MB[26G[0K67.1MB[?25h[u[2KSuccessfully copied 76.6MB to /home/christopher/projects/tu-vienna/tie-strength-prediction/graph/


## Ground Truth Generation

Calculate Neighborhood Overlap:

In [2]:
# Neighbourhood overlap of every sampled interaction (u1, u2):
#
#     overlap = common_neighbours / (all_neighbours - 2)
#
# Neighbours are users reachable over an interaction with set = "default".
# u1 and u2 are each other's neighbours, so both are always part of the union;
# the "- 2" removes them again.
neighbourhood_overlap_query = '''
    CALL {
        MATCH (u1:User)-[i:INTERACTION]-(u2:User)
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN u1, u2, i
    }
    CALL {
        WITH u1, u2
        UNWIND [u1, u2] AS ut
        MATCH (ut)-[r:INTERACTION]-(un:User)
        WHERE un.id <> ut.id AND r.set = "default"
        RETURN COUNT(DISTINCT un) AS un_union
    }
    CALL {
        WITH u1, u2
        MATCH (u1)-[r1:INTERACTION]-(un:User)-[r2:INTERACTION]-(u2)
        WHERE un.id <> u1.id AND un.id <> u2.id
          AND r1.set = "default" AND r2.set = "default"
        RETURN COUNT(DISTINCT un) AS un_intersection
    }
    WITH i,
        CASE WHEN un_union > 2
            THEN toFloat(un_intersection) / (toFloat(un_union) - 2.0)
            ELSE 0.0
        END AS un_overlap
    CALL {
        WITH i, un_overlap
        SET i.neighbourhood_overlap = un_overlap
    } IN TRANSACTIONS OF 100 ROWS
'''

# Notes on the query:
# * The union starts from the two endpoint nodes themselves (UNWIND) instead of
#   searching for any node whose id equals u1.id or u2.id.
# * Both legs of the common-neighbour path must be sampled interactions, so the
#   intersection is counted over the same edge set as the union.
# * A pair with no neighbours besides each other (union = 2) gets overlap 0.0
#   instead of a division by zero.
with driver.session() as session:
    # CALL { ... } IN TRANSACTIONS needs an auto-commit query, which session.run provides.
    result = session.run(neighbourhood_overlap_query)
    # Consume inside the session so every batch has committed, and any
    # server-side failure is raised, before the cell finishes.
    result.consume()

In [12]:
# Export all INTERACTION relationships, now including neighbourhood_overlap,
# to "interaction.csv" with APOC. The inner (double-quoted) query lists each
# relationship once (u1.id < u2.id), ordered by the two user ids.
overlap_export_query = '''
    WITH "MATCH (u1:User) -[i:INTERACTION]- (u2:User)
        WHERE u1.id < u2.id
        RETURN 'u' + u1.id AS `:START_ID`, 'u' + u2.id AS `:END_ID`, i.postings AS `postings:long`, i.channels AS `channels:long`, i.upvotes AS `upvotes:long`, i.downvotes AS `downvotes:long`, i.follows AS `follows:long`, i.ignores AS `ignores:long`, i.set, i.neighbourhood_overlap AS `neighbourhood_overlap:float`
        ORDER BY u1.id, u2.id" AS query
    CALL apoc.export.csv.query(query, "interaction.csv", {quotes: "ifNeeded"})
    YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
    RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
'''

with driver.session() as session:
    result = session.run(overlap_export_query)

In [13]:
# Copy the exported interaction.csv from the Neo4j container's import directory
# into the local graph/ directory.
!docker cp tie-strength-prediction-neo4j-1:/var/lib/neo4j/import/interaction.csv graph/

[sPreparing to copy...[?25l[u[2KCopying from container - 0B

[26G[0K23.6MB[26G[0K42.3MB[26G[0K67.6MB[?25h[u[2KSuccessfully copied 79.7MB to /home/christopher/projects/tu-vienna/tie-strength-prediction/graph/


Local Clustering Coefficient:

In [20]:
# Local clustering coefficient of every user that has at least one sampled
# (set = "default") interaction:
#
#     C(u) = 2 * e / (k * (k - 1))
#
# k = number of distinct neighbours of u,
# e = number of sampled interactions between two neighbours of u.
local_clustering_query = '''
    CALL {
        MATCH (u:User)-[i:INTERACTION]-(un:User)
        WHERE un.id <> u.id AND i.set = "default"
        RETURN u, COUNT(DISTINCT un) AS k
    }
    CALL {
        WITH u
        MATCH (un1:User)-[r1:INTERACTION]-(u)-[r2:INTERACTION]-(un2:User),
              (un1)-[i:INTERACTION]-(un2)
        WHERE un1.id < un2.id
          AND un1.id <> u.id AND un2.id <> u.id
          AND r1.set = "default" AND r2.set = "default" AND i.set = "default"
        RETURN COUNT(DISTINCT i) AS e
    }
    WITH u, toFloat(k) AS float_k, toFloat(e) AS float_e
    SET u.local_clustering_coefficient =
        CASE WHEN float_k > 1.0
            THEN (2.0 * float_e) / (float_k * (float_k - 1.0))
            ELSE 0.0
        END
'''

# Notes on the query:
# * Neighbours are counted regardless of id order. The pattern is undirected,
#   so a "u.id < un.id" filter would drop every neighbour with a smaller id
#   and understate both k and e.
# * "un1.id < un2.id" only prevents visiting each neighbour pair twice.
# * With fewer than two neighbours the denominator is zero; the coefficient is
#   then stored as 0.0.
with driver.session() as session:
    result = session.run(local_clustering_query)
    # Consume inside the session so the write has completed, and any
    # server-side failure is raised, before the cell finishes.
    result.consume()

Min Local Clustering Coefficient:

In [22]:
# Store on each sampled interaction the smaller local clustering coefficient of
# its two endpoint users. Whenever the "<" comparison is not true (which
# includes a missing coefficient), the ELSE branch takes u2's value.
min_clustering_query = '''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User) 
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN u1, u2, p, i
    }
    WITH u1, u2, i,
        CASE WHEN u1.local_clustering_coefficient < u2.local_clustering_coefficient
            THEN u1.local_clustering_coefficient
            ELSE u2.local_clustering_coefficient
        END AS min_local_clusterin_coefficient
    SET i.min_local_clustering_coefficient = min_local_clusterin_coefficient
'''

with driver.session() as session:
    result = session.run(min_clustering_query)

In [23]:
# Export all INTERACTION relationships, now including neighbourhood_overlap and
# min_local_clustering_coefficient, to "interaction.csv" with APOC. The inner
# (double-quoted) query lists each relationship once (u1.id < u2.id), ordered
# by the two user ids.
clustering_export_query = '''
    WITH "MATCH (u1:User) -[i:INTERACTION]- (u2:User)
        WHERE u1.id < u2.id
        RETURN 'u' + u1.id AS `:START_ID`, 'u' + u2.id AS `:END_ID`, i.postings AS `postings:long`, i.channels AS `channels:long`, i.upvotes AS `upvotes:long`, i.downvotes AS `downvotes:long`, i.follows AS `follows:long`, i.ignores AS `ignores:long`, i.set, i.neighbourhood_overlap AS `neighbourhood_overlap:float`, i.min_local_clustering_coefficient AS `min_local_clustering_coefficient:float`
        ORDER BY u1.id, u2.id" AS query
    CALL apoc.export.csv.query(query, "interaction.csv", {quotes: "ifNeeded"})
    YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
    RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
'''

with driver.session() as session:
    result = session.run(clustering_export_query)

Failed to read from defunct connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))


In [24]:
# Copy the re-exported interaction.csv from the Neo4j container's import
# directory into the local graph/ directory.
!docker cp tie-strength-prediction-neo4j-1:/var/lib/neo4j/import/interaction.csv graph/

[sPreparing to copy...[?25l[u[2KCopying from container - 0B[26G[0K10.4MB[26G[0K23.9MB[26G[0K34.7MB[26G[0K39.8MB[26G[0K42.3MB[26G[0K43.2MB[26G[0K44.9MB[26G[0K51.5MB[26G[0K59.7MB[26G[0K67.8MB[?25h[u[2KSuccessfully copied 82.1MB to /home/christopher/projects/tu-vienna/tie-strength-prediction/graph/


Tie Strength:

In [2]:
# Ground-truth tie strength of each sampled interaction: the equally weighted
# (0.5 / 0.5) sum of its min_local_clustering_coefficient and its
# neighbourhood_overlap.
tie_strength_query = '''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User) 
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN u1, u2, p, i
    }
    WITH i
    SET i.gt_tie_strength = (i.min_local_clustering_coefficient * 0.5) + (i.neighbourhood_overlap * 0.5) 
'''

with driver.session() as session:
    result = session.run(tie_strength_query)

Unable to retrieve routing information


ServiceUnavailable: Unable to retrieve routing information

## Feature Generation

Interaction Frequency:

In [None]:
# Interaction frequency of each sampled interaction: the sum of its upvotes,
# downvotes and postings counters.
interaction_frequency_query = '''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User) 
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN u1, u2, p, i
    }
    WITH i
    SET i.interaction_frequency = i.upvotes + i.downvotes + i.postings
'''

with driver.session() as session:
    result = session.run(interaction_frequency_query)

Sentiment:

In [None]:
# Sentiment of each sampled interaction: the equally weighted (0.5 / 0.5) sum of
#   * the share of upvotes among all votes      (upvotes / (upvotes + downvotes))
#   * the share of follows among follow/ignore  (follows / (follows + ignores))
#
# The counters are converted with toFloat() first: they are exported as `long`,
# so they are presumably stored as integers, and Cypher's integer division
# would truncate every ratio to 0 or 1.
#
# NOTE(review): a ratio with a zero denominator (no votes, or no follows or
# ignores) is set to the neutral value 0.5 instead of raising a division-by-zero
# error. Adjust if another default is wanted.
sentiment_query = '''
    CALL {
        MATCH (u1:User)-[i:INTERACTION]-(u2:User)
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN i
    }
    WITH i,
        toFloat(i.upvotes) AS up,
        toFloat(i.downvotes) AS down,
        toFloat(i.follows) AS follows,
        toFloat(i.ignores) AS ignores
    WITH i,
        CASE WHEN up + down > 0
            THEN up / (up + down)
            ELSE 0.5
        END AS vote_ratio,
        CASE WHEN follows + ignores > 0
            THEN follows / (follows + ignores)
            ELSE 0.5
        END AS follow_ratio
    SET i.sentiment = (vote_ratio * 0.5) + (follow_ratio * 0.5)
'''

with driver.session() as session:
    result = session.run(sentiment_query)
    # Consume inside the session so the write has completed, and any
    # server-side failure is raised, before the cell finishes.
    result.consume()

Closeness:

In [None]:
# Closeness of each sampled interaction (u1, u2): the number of distinct users
# connected to both u1 and u2 by sampled interactions with following = 2.
# NOTE(review): following = 2 presumably marks a mutual follow -- confirm
# against the import that sets this property.
closeness_query = '''
    CALL {
        MATCH (u1:User)-[i:INTERACTION]-(u2:User)
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN u1, u2, i
    }
    CALL {
        WITH u1, u2
        MATCH (u1)-[i1:INTERACTION]-(un:User)-[i2:INTERACTION]-(u2)
        WHERE un.id <> u1.id AND un.id <> u2.id
          AND i1.following = 2 AND i2.following = 2
          AND i1.set = "default" AND i2.set = "default"
        RETURN COUNT(DISTINCT un) AS common_friends
    }
    WITH i, common_friends
    SET i.closeness = common_friends
'''

with driver.session() as session:
    # Both legs of the path (i1 and i2) are restricted to the sampled set.
    result = session.run(closeness_query)
    # Consume inside the session so the write has completed, and any
    # server-side failure is raised, before the cell finishes.
    result.consume()

Reciprocity:

In [None]:
# Reciprocity of each sampled interaction: the integer form of the test
# "i.following = 2", stored as i.reciprocity.
reciprocity_query = '''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User) 
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN u1, u2, p, i
    }
    WITH u1, u2, p, i
    WITH i, toInteger(i.following = 2) as bidirectional_following
    SET i.reciprocity = bidirectional_following
'''

with driver.session() as session:
    result = session.run(reciprocity_query)

Multiplexity:

In [None]:
# Multiplexity of each sampled interaction: a copy of its channels counter.
multiplexity_query = '''
    CALL {
        MATCH (u1:User)-[i:INTERACTION]-(u2:User)
        WHERE u1.id < u2.id AND i.set = "default"
        RETURN i
    }
    WITH i
    SET i.multiplexity = i.channels
'''

with driver.session() as session:
    # The value is read from the relationship (i.channels); a bare `channels`
    # is not a variable that exists in this query.
    result = session.run(multiplexity_query)
    # Consume inside the session so the write has completed, and any
    # server-side failure is raised, before the cell finishes.
    result.consume()

## Split Generation