In [1]:
import networkx as nx
import csv
from collections import defaultdict
from itertools import product

# Upper bound on the number of adjacency-list rows read from the training file.
NUM_ROWS = 20000
# Maps a source node id to the list of sink ids found on that source's row.
following = defaultdict(list)
# Undirected graph that the (source, sink) pairs are loaded into as edges.
G = nx.Graph()

In [2]:
# Read up to NUM_ROWS tab-separated adjacency rows: the first field is the
# source node id, every remaining field is a node that source is linked to.
with open('data/01_raw/train.txt', 'r') as f:
    # zip() stops at whichever runs out first, so a file shorter than
    # NUM_ROWS ends the loop cleanly instead of raising StopIteration.
    for _, line in zip(range(NUM_ROWS), f):
        fields = line.rstrip('\n').split('\t')
        following[fields[0]] = fields[1:]
# No explicit close(): the `with` block already closes the file.

In [3]:
# Add one undirected edge to G for every (source, sink) relation that was read
# into `following`.
for follower, followees in list(following.items()):
    G.add_edges_from((follower, followee) for followee in followees)

In [4]:
# Sanity check: how many edges G holds after loading the training rows.
G.number_of_edges()

23416061

In [6]:
# Collect the (Source, Sink) columns of the tab-separated public test file,
# in file order, as a list of tuples.
test_pairs = []
with open('data/01_raw/test-public.txt', 'r') as f:
    test_pairs.extend(
        (record['Source'], record['Sink'])
        for record in csv.DictReader(f, delimiter='\t')
    )

In [10]:
# Peek at the first five (source, sink) pairs read from the test file.
test_pairs[:5]

[('2184483', '1300190'),
 ('3151356', '1452193'),
 ('1579396', '193159'),
 ('1406432', '2481036'),
 ('2389638', '593017')]

In [7]:
from networkx.algorithms import link_prediction as lp

# Each networkx link-prediction function returns a single-use iterator of
# (u, v, score) triples. Materialise them as lists so the scores are computed
# here, and so any cell that consumes them can be re-run without silently
# seeing exhausted (empty) iterators.
ra = list(lp.resource_allocation_index(G, test_pairs))
jc = list(lp.jaccard_coefficient(G, test_pairs))
aa = list(lp.adamic_adar_index(G, test_pairs))
pa = list(lp.preferential_attachment(G, test_pairs))

In [8]:
# Index the four link-prediction scores by (u, v) pair. The four score
# sequences are walked in lockstep and the pair is taken from the `ra` triple.
missing = {
    (ra_triple[0], ra_triple[1]): {
        'ra': ra_triple[2],
        'jc': jc_triple[2],
        'aa': aa_triple[2],
        'pa': pa_triple[2],
    }
    for ra_triple, jc_triple, aa_triple, pa_triple in zip(ra, jc, aa, pa)
}

In [11]:
import numpy as np

# Gather every pair's score per metric, then take the 60th percentile of each
# metric as its voting cut-off.
vals_ra, vals_jc, vals_aa, vals_pa = (
    [scores[metric] for scores in missing.values()]
    for metric in ('ra', 'jc', 'aa', 'pa')
)
mid_ra, mid_jc, mid_aa, mid_pa = (
    np.percentile(metric_vals, 60)
    for metric_vals in (vals_ra, vals_jc, vals_aa, vals_pa)
)

In [12]:
# Four-metric vote: a metric votes 1 for a pair when that pair's score is
# strictly above the metric's cut-off, otherwise 0. The prediction stored for
# the pair is the mean of the votes (0, 0.25, 0.5, 0.75 or 1).
#
# The per-pair scores are read through `scores[...]` rather than unpacked into
# module-level names, so the `ra` / `jc` / `aa` / `pa` score sequences are not
# overwritten with single floats by this cell.
vote_cutoffs = {'ra': mid_ra, 'jc': mid_jc, 'aa': mid_aa, 'pa': mid_pa}

preds = {}
for pair, scores in missing.items():
    votes = [
        1 if scores[metric] > cutoff else 0
        for metric, cutoff in vote_cutoffs.items()
    ]
    preds[pair] = np.mean(votes)

In [13]:
# Number of pairs whose mean vote is strictly above one half.
print(len([score for score in preds.values() if score > 0.5]))

735


In [14]:
import pandas as pd

# One output row per test pair, with Ids numbered from 1 in test-file order.
# Iterating test_pairs directly (instead of a hard-coded range(1, 2001)) keeps
# the output in step with however many pairs were loaded: no IndexError on a
# shorter file and no silent truncation on a longer one.
row_list = [
    {'Id': row_id, 'Predictions': preds[pair]}
    for row_id, pair in enumerate(test_pairs, start=1)
]

# Explicit columns keep the CSV header present even when there are no rows.
predictions = pd.DataFrame(row_list, columns=['Id', 'Predictions'])
predictions.to_csv('naive_predictions.csv', index=False)

In [15]:
# Inverse shortest-path length in G for every test pair; larger means closer.
inv_distance = {}

for source, sink in test_pairs:
    try:
        distance = nx.shortest_path_length(G, source=source, target=sink)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Disconnected pair, or an endpoint that is not in G at all:
        # score it as infinitely far apart instead of aborting the loop.
        inv_distance[(source, sink)] = 0.0
        continue
    # distance == 0 only when source == sink; use the maximum score any
    # distinct pair can reach (1.0) rather than dividing by zero.
    inv_distance[(source, sink)] = 1 / distance if distance else 1.0

In [16]:
# Cut-off for the distance feature: 25th percentile of all inverse distances.
vals_dis = [inv_distance[pair] for pair in inv_distance]
mid_dis = np.percentile(vals_dis, 25)

In [17]:
# Five-feature vote: inverse shortest-path distance plus the four
# link-prediction metrics. A feature votes 1 for a pair when its value is
# strictly above that feature's cut-off, otherwise 0. The prediction stored
# for the pair is the mean of the five votes.
#
# Values are read from per-pair dicts rather than unpacked into module-level
# names, so the `ra` / `jc` / `aa` / `pa` score sequences are not overwritten
# with single floats by this cell.
vote_cutoffs_with_distance = {
    'dis': mid_dis,
    'ra': mid_ra,
    'jc': mid_jc,
    'aa': mid_aa,
    'pa': mid_pa,
}

preds = {}
for pair, scores in missing.items():
    # Build a fresh dict so the entries of `missing` are left untouched.
    features = {'dis': inv_distance[pair], **scores}
    votes = [
        1 if features[name] > cutoff else 0
        for name, cutoff in vote_cutoffs_with_distance.items()
    ]
    preds[pair] = np.mean(votes)

In [18]:
# Number of pairs whose mean vote is strictly above one half.
print(len([score for score in preds.values() if score > 0.5]))

899


In [19]:
import pandas as pd

# One output row per test pair, with Ids numbered from 1 in test-file order.
# Iterating test_pairs directly (instead of a hard-coded range(1, 2001)) keeps
# the output in step with however many pairs were loaded: no IndexError on a
# shorter file and no silent truncation on a longer one.
row_list = [
    {'Id': row_id, 'Predictions': preds[pair]}
    for row_id, pair in enumerate(test_pairs, start=1)
]

# Explicit columns keep the CSV header present even when there are no rows.
predictions = pd.DataFrame(row_list, columns=['Id', 'Predictions'])
predictions.to_csv('naive5_predictions.csv', index=False)