# Data Preparation

## Setup

In [None]:
from neo4j import GraphDatabase

# Connection URI of the Neo4j server (localhost, port 7687) used by every query cell below.
uri = "neo4j://localhost:7687"
# No credentials are passed — presumably authentication is disabled on this instance; confirm.
driver = GraphDatabase.driver(uri)

## Project Graph

In [None]:
# Project the in-memory GDS graph 'userInteractionGraph' from User nodes and
# INTERACTION relationships (orientation UNDIRECTED, aggregation SINGLE).
with driver.session() as session:
    result = session.run('''
        CALL gds.graph.project(
            'userInteractionGraph',
            ['User'],
            [
                {INTERACTION: {orientation: 'UNDIRECTED',aggregation: 'SINGLE'} }
            ]
        )
        YIELD graphName AS graph, nodeProjection, nodeCount AS nodes, relationshipProjection, relationshipCount AS rels
''')
    # Drain the result while the session is still open so that a server-side
    # failure is raised in this cell instead of being left to the implicit
    # cleanup performed when the session closes.
    result.consume()

In [None]:
# Drop the projected graph 'userInteractionGraph'. The second argument (false)
# presumably means "do not fail if the graph is missing" — confirm against the GDS docs.
# NOTE(review): in a top-to-bottom run this cell executes right after the projection,
# so the graph the later GDS cells refer to no longer exists — presumably this is
# meant as a manual reset/cleanup step; confirm its intended position.
with driver.session() as session:
    # Consume explicitly so that a failure surfaces here rather than being left
    # to the implicit cleanup at session close.
    session.run('''
    CALL gds.graph.drop('userInteractionGraph', false) YIELD graphName;
''').consume()

## Ground Truth Generation

Calculate Neighborhood Overlap:

In [None]:
# Stream node-similarity scores from 'userInteractionGraph' and store each score as
# `neighbourhoodOverlap` on the INTERACTION relationship between the two users.
# Relationships whose endpoints are not returned as a pair by the stream are left
# without the property.
# NOTE(review): the config key is spelled `bottomk`; the GDS documentation spells it
# `bottomK` — confirm the key is accepted and that combining it with `topK` is intended.
# NOTE(review): the whole body sits inside the CALL { ... } IN TRANSACTIONS subquery,
# so there is presumably only a single input row to batch on — confirm that the
# 1000-row batching has the intended effect.
with driver.session() as session:
    result = session.run('''
        CALL{
            CALL gds.nodeSimilarity.stream('userInteractionGraph', { topK: 10, bottomk: 10 })
            YIELD node1, node2, similarity
            WITH gds.util.asNode(node1) AS u1, gds.util.asNode(node2) AS u2, similarity
            MATCH (u1)-[i:INTERACTION]-(u2)
            SET i.neighbourhoodOverlap = similarity
        } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session: the writes happen while the result is
    # pulled, and an error raised then must not be left to the implicit cleanup at
    # session close (e.g. when the projected graph does not exist).
    result.consume()

In [None]:
# Export every User-INTERACTION-User pair to "interaction.csv" through APOC, including
# the raw counters and `neighbourhoodOverlap`. The pattern is undirected, so the
# `u1.id < u2.id` filter keeps a single orientation of each match.
# The file is presumably written to the server's import directory, from which the
# following `docker cp` cell copies it — confirm the APOC export configuration.
with driver.session() as session:
    result = session.run('''
    WITH "MATCH (u1:User) -[i:INTERACTION]- (u2:User)
        WHERE u1.id < u2.id
        RETURN 'u' + u1.id AS `:START_ID`, 'u' + u2.id AS `:END_ID`, i.postings AS `postings:long`, i.channels AS `channels:long`, i.upvotes AS `upvotes:long`, i.downvotes AS `downvotes:long`, i.follows AS `follows:long`, i.ignores AS `ignores:long`, i.neighbourhoodOverlap AS `neighbourhoodOverlap:float`
        ORDER BY u1.id, u2.id" AS query
    CALL apoc.export.csv.query(query, "interaction.csv", {quotes: "ifNeeded"})
    YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
    RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
''')
    # Drain the result inside the session so that an export failure is raised here;
    # otherwise a stale or missing CSV could be copied by the next cell unnoticed.
    result.consume()

In [None]:
# Copy the exported interaction.csv out of the container's /var/lib/neo4j/import directory into the local graph/ folder.
!docker cp tie-strength-prediction-neo4j-1:/var/lib/neo4j/import/interaction.csv graph/

Local Clustering Coefficient:

In [None]:
# Run the GDS local clustering coefficient algorithm on 'userInteractionGraph' in
# write mode, storing the value under the property `localClusteringCoefficient`.
with driver.session() as session:
    result = session.run('''
    CALL gds.localClusteringCoefficient.write('userInteractionGraph', {
    writeProperty: 'localClusteringCoefficient'
    })
    YIELD averageClusteringCoefficient, nodeCount
''')
    # Drain the result inside the session so that a failure (e.g. a missing projected
    # graph) is raised here rather than left to the implicit cleanup at session close.
    result.consume()

Min Local Clustering Coefficient:

In [None]:
# For every INTERACTION relationship, store the smaller of the two endpoint users'
# `localClusteringCoefficient` values as `minLocalClusterinCoefficient`, written in
# batches of 1000 rows. When the comparison is not true (including when a value is
# NULL) the CASE falls through to u2's value.
# The property name (including its spelling) is reused by later cells and by the CSV
# header, so it is kept as is.
with driver.session() as session:
    result = session.run('''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User)
        RETURN u1, u2, i
    }
    WITH u1, u2, i,
        CASE WHEN u1.localClusteringCoefficient < u2.localClusteringCoefficient
            THEN u1.localClusteringCoefficient
            ELSE u2.localClusteringCoefficient
        END AS minLocalClusterinCoefficient
    CALL {
        WITH u1, u2, i, minLocalClusterinCoefficient
        SET i.minLocalClusterinCoefficient = minLocalClusterinCoefficient
    } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session so that a failure in any batch is raised
    # here rather than left to the implicit cleanup at session close.
    result.consume()

In [None]:
# Re-export the interactions to "interaction.csv" through APOC, now also including
# `minLocalClusterinCoefficient`. The pattern is undirected, so the `u1.id < u2.id`
# filter keeps a single orientation of each match.
with driver.session() as session:
    result = session.run('''
    WITH "MATCH (u1:User) -[i:INTERACTION]- (u2:User)
        WHERE u1.id < u2.id
        RETURN 'u' + u1.id AS `:START_ID`, 'u' + u2.id AS `:END_ID`, i.postings AS `postings:long`, i.channels AS `channels:long`, i.upvotes AS `upvotes:long`, i.downvotes AS `downvotes:long`, i.follows AS `follows:long`, i.ignores AS `ignores:long`, i.neighbourhoodOverlap AS `neighbourhoodOverlap:float`, i.minLocalClusterinCoefficient AS `minLocalClusterinCoefficient:float`
        ORDER BY u1.id, u2.id" AS query
    CALL apoc.export.csv.query(query, "interaction.csv", {quotes: "ifNeeded"})
    YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
    RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
''')
    # Drain the result inside the session so that an export failure is raised here;
    # otherwise a stale CSV from an earlier export could be copied unnoticed.
    result.consume()

In [None]:
# Copy the re-exported interaction.csv from the container's /var/lib/neo4j/import directory into the local graph/ folder.
!docker cp tie-strength-prediction-neo4j-1:/var/lib/neo4j/import/interaction.csv graph/

Tie Strength:

In [None]:
# Ground-truth tie strength per INTERACTION relationship:
#   tieStrength = 0.5 * minLocalClusterinCoefficient + 0.5 * neighbourhoodOverlap
# The value is NULL when either input is missing, written in batches of 1000 rows.
with driver.session() as session:
    result = session.run('''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User)
        RETURN u1, u2, p, i
    }
    WITH u1, u2, i,
        CASE WHEN i.minLocalClusterinCoefficient IS NULL OR i.neighbourhoodOverlap IS NULL
            THEN NULL
            ELSE (i.minLocalClusterinCoefficient * 0.5) + (i.neighbourhoodOverlap * 0.5)
        END AS tieStrength
    CALL {
        WITH u1, u2, i, tieStrength
        SET i.tieStrength = tieStrength
    } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session so that a failure in any batch is raised
    # here rather than left to the implicit cleanup at session close.
    result.consume()

In [None]:
# Re-export the interactions to "interaction.csv" through APOC, now also including
# `set`, `minLocalClusterinCoefficient` and `tieStrength`. The pattern is undirected,
# so the `u1.id < u2.id` filter keeps a single orientation of each match.
# NOTE(review): `i.set` is exported only by this cell and is not written anywhere in
# the visible notebook — confirm where that property comes from.
with driver.session() as session:
    result = session.run('''
    WITH "MATCH (u1:User) -[i:INTERACTION]- (u2:User)
        WHERE u1.id < u2.id
        RETURN 'u' + u1.id AS `:START_ID`, 'u' + u2.id AS `:END_ID`, i.postings AS `postings:long`, i.channels AS `channels:long`, i.upvotes AS `upvotes:long`, i.downvotes AS `downvotes:long`, i.follows AS `follows:long`, i.ignores AS `ignores:long`, i.set AS `set`, i.neighbourhoodOverlap AS `neighbourhoodOverlap:float`, i.minLocalClusterinCoefficient AS `minLocalClusterinCoefficient:float`, i.tieStrength AS `tieStrength:float`
        ORDER BY u1.id, u2.id" AS query
    CALL apoc.export.csv.query(query, "interaction.csv", {quotes: "ifNeeded"})
    YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
    RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
''')
    # Drain the result inside the session so that an export failure is raised here;
    # otherwise a stale CSV from an earlier export could be copied unnoticed.
    result.consume()

In [None]:
# Copy the re-exported interaction.csv from the container's /var/lib/neo4j/import directory into the local graph/ folder.
!docker cp tie-strength-prediction-neo4j-1:/var/lib/neo4j/import/interaction.csv graph/

## Feature Generation

Interaction Frequency:

In [None]:
# Interaction frequency per INTERACTION relationship:
#   interactionFrequency = upvotes + downvotes + postings
# Missing counters count as 0. The fallback is the integer 0 (not 0.0) so that the sum
# does not turn into a float merely because one counter is missing; the value is later
# exported under the header `interactionFrequency:long`.
with driver.session() as session:
    result = session.run('''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User)
        RETURN u1, u2, p, i
    }
    WITH u1, u2, i,
        coalesce(i.upvotes, 0) AS upvotes,
        coalesce(i.downvotes, 0) AS downvotes,
        coalesce(i.postings, 0) AS postings
    CALL {
        WITH u1, u2, i, upvotes, downvotes, postings
        SET i.interactionFrequency = upvotes + downvotes + postings
    } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session so that a failure in any batch is raised
    # here rather than left to the implicit cleanup at session close.
    result.consume()

Sentiment:

In [None]:
# Sentiment per INTERACTION relationship, as the sum of two equally weighted parts:
#   voting_sentiment    = 0.5 * upvotes / (upvotes + downvotes)   (0 when there are no votes)
#   following_sentiment = 0.5 * follows / (follows + ignores)     (0 when there are neither)
# Missing counters are treated as 0.0; all counters are converted to float first so
# the divisions are floating-point divisions.
with driver.session() as session:
    result = session.run('''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User)
        RETURN u1, u2, p, i
    }
    WITH u1, u2, i,
        CASE WHEN i.upvotes IS NULL
            THEN 0.0
            ELSE toFloat(i.upvotes)
        END AS upvotes,
        CASE WHEN i.downvotes IS NULL
            THEN 0.0
            ELSE toFloat(i.downvotes)
        END AS downvotes,
        CASE WHEN i.follows IS NULL
            THEN 0.0
            ELSE toFloat(i.follows)
        END AS follows,
        CASE WHEN i.ignores IS NULL
            THEN 0.0
            ELSE toFloat(i.ignores)
        END AS ignores
    WITH u1, u2, i,
        CASE WHEN upvotes + downvotes = 0.0
            THEN 0.0
            ELSE ((upvotes / (upvotes + downvotes)) * (0.5))
        END AS voting_sentiment,
        CASE WHEN follows + ignores = 0.0
            THEN 0.0
            ELSE ((follows / (follows + ignores)) * (0.5))
        END AS following_sentiment
    CALL {
        WITH u1, u2, i, voting_sentiment, following_sentiment
        SET i.sentiment = voting_sentiment + following_sentiment
    } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session so that a failure in any batch is raised
    # here rather than left to the implicit cleanup at session close.
    result.consume()

Closeness:

In [None]:
# Closeness per INTERACTION relationship: the number of distinct users `un` that are
# linked to both endpoints by INTERACTION relationships with `follows = 2`.
# Only relationships that already carry `neighbourhoodOverlap` are processed.
# NOTE(review): `follows = 2` presumably encodes a mutual follow (the reciprocity cell
# uses the same test) — confirm against the data model.
with driver.session() as session:
    result = session.run('''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User)
        WHERE i.neighbourhoodOverlap IS NOT NULL
        RETURN u1, u2, p, i
    }
    WITH u1, u2, i
    CALL {
        WITH u1, u2, i
        MATCH pn=(u1)-[i1:INTERACTION]-(un:User)-[i2:INTERACTION]-(u2)
        WHERE i1.follows = 2 AND i2.follows = 2
        RETURN COUNT(DISTINCT un) as common_friends
    }
    WITH u1, u2, i, common_friends
    CALL {
        WITH u1, u2, i, common_friends
        SET i.closeness = common_friends
    } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session so that a failure in any batch is raised
    # here rather than left to the implicit cleanup at session close.
    result.consume()

Reciprocity:

In [None]:
# Reciprocity per INTERACTION relationship: 1 when `follows` equals 2, otherwise 0
# (a missing `follows` is treated as 0). Written in batches of 1000 rows.
with driver.session() as session:
    result = session.run('''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User)
        RETURN u1, u2, p, i
    }
    WITH u1, u2, i,
        CASE WHEN i.follows IS NULL
            THEN 0
            ELSE i.follows
        END AS follows
    WITH u1, u2, i,
        CASE WHEN follows = 2
            THEN 1
            ELSE 0
        END AS reciprocity
    CALL {
        WITH u1, u2, i, reciprocity
        SET i.reciprocity = reciprocity
    } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session so that a failure in any batch is raised
    # here rather than left to the implicit cleanup at session close.
    result.consume()

Multiplexity:

In [None]:
# Multiplexity per INTERACTION relationship: a copy of `channels`, with a missing
# value stored as 0. Written in batches of 1000 rows.
with driver.session() as session:
    result = session.run('''
    CALL {
        MATCH p=(u1:User)-[i:INTERACTION]-(u2:User)
        RETURN u1, u2, p, i
    }
    WITH u1, u2, i,
        CASE WHEN i.channels IS NULL
            THEN 0
            ELSE i.channels
        END AS channels
    CALL {
        WITH u1, u2, i, channels
        SET i.multiplexity = channels
    } IN TRANSACTIONS OF 1000 ROWS
''')
    # Drain the result inside the session so that a failure in any batch is raised
    # here rather than left to the implicit cleanup at session close.
    result.consume()

In [None]:
# Final export of the interactions to "interaction.csv" through APOC: raw counters,
# ground-truth columns (neighbourhoodOverlap, minLocalClusterinCoefficient, tieStrength)
# and the generated features (reciprocity, multiplexity, closeness, sentiment,
# interactionFrequency). The pattern is undirected, so the `u1.id < u2.id` filter keeps
# a single orientation of each match.
with driver.session() as session:
    result = session.run('''
    WITH "MATCH (u1:User) -[i:INTERACTION]- (u2:User)
        WHERE u1.id < u2.id
        RETURN 'u' + u1.id AS `:START_ID`, 'u' + u2.id AS `:END_ID`, i.postings AS `postings:long`, i.channels AS `channels:long`, i.upvotes AS `upvotes:long`, i.downvotes AS `downvotes:long`, i.follows AS `follows:long`, i.ignores AS `ignores:long`, i.neighbourhoodOverlap AS `neighbourhoodOverlap:float`, i.minLocalClusterinCoefficient AS `minLocalClusterinCoefficient:float`, i.tieStrength AS `tieStrength:float`, i.reciprocity AS `reciprocity:float`, i.multiplexity AS `multiplexity:long`, i.closeness AS `closeness:long`, i.sentiment AS `sentiment:float`, i.interactionFrequency AS `interactionFrequency:long`
        ORDER BY u1.id, u2.id" AS query
    CALL apoc.export.csv.query(query, "interaction.csv", {quotes: "ifNeeded"})
    YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
    RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
''')
    # Drain the result inside the session so that an export failure is raised here;
    # otherwise a stale CSV from an earlier export could be copied unnoticed.
    result.consume()

In [None]:
# Copy the final interaction.csv from the container's /var/lib/neo4j/import directory into the local graph/ folder.
!docker cp tie-strength-prediction-neo4j-1:/var/lib/neo4j/import/interaction.csv graph/

Sample Connected Nodes:

## Split Generation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

Read the data from the csv file

In [None]:
# Load the interaction table; comma separator and a header in the first row are the
# pandas defaults, so they are not spelled out.
# NOTE(review): the cells above copy the export to graph/interaction.csv; nothing in
# the visible notebook creates interaction-final.csv — confirm how this file is produced.
df = pd.read_csv('graph/interaction-final.csv')

display(df)

Remove interactions without tie strength

In [None]:
# Keep only the interactions that have a ground-truth tie strength.
df = df.dropna(subset=['tieStrength:float'])

display(df)

Remove unused columns

In [None]:
# Identifier and raw counter columns that are removed before the split.
raw_columns = [
    ':START_ID',
    ':END_ID',
    'postings:long',
    'channels:long',
    'upvotes:long',
    'downvotes:long',
    'follows:long',
    'ignores:long',
]
df = df.drop(columns=raw_columns)

display(df)

In [None]:
# Hold out 20% of the rows as the test set.
train_val_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Carve the validation set out of the remainder: 25% of the remaining 80%
# is 20% of all rows, giving a 60/20/20 train/validation/test split.
train_set, val_set = train_test_split(train_val_set, test_size=0.25, random_state=42)


In [None]:
# Write each split to its own CSV file, without the DataFrame index.
for frame, path in [
    (train_set, 'graph/interaction_train_set.csv'),
    (val_set, 'graph/interaction_val_set.csv'),
    (test_set, 'graph/interaction_test_set.csv'),
]:
    frame.to_csv(path, index=False)

## Model Creation