In [1]:
import pandas as pd
import numpy as np
import re
import networkx as nx
from datetime import datetime
import matplotlib.pyplot as plt
import itertools
import time
pd.set_option('display.max_rows', 1500)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)


# Read Edge List

In [2]:
df = pd.read_csv('out.tsv', delim_whitespace=True)

#add prefixes to distinguish escort and buyer nodes
df['FromNode'] = df['FromNode'].apply(lambda x: "{}{}".format('Buyer', x))
df['ToNode'] = df['ToNode'].apply(lambda x: "{}{}".format('Escort', x))

In [3]:
#convert time
year = []; month = []; day= []
i = 0
for t in df['time']:
    t = int(t)
    if t < 0: 
        time.append(np.nan)
        print(datetime.utcfromtimestamp(-t).year)
        i+=1
    else: 
        time = datetime.utcfromtimestamp(t)
        year.append(time.year)
        month.append(time.month)
        day.append(time.day)
        
df['year'] = year
df['month'] = month
df['day'] = day

df['year'].value_counts()

2007    14604
2006    12035
2008    11989
2005     8036
2004     3526
2003      434
2002        8
Name: year, dtype: int64

In [4]:
#split dataframe into train and test period
train_df = df.loc[df['year'] > 2005]
train_df = df.loc[df['year'] < 2008]
test_df = df.loc[df['year'] == 2008]


In [5]:
#change ratings to occurances with average rating
edges_df = train_df.groupby(['FromNode', 'ToNode'],as_index=False).mean()[['FromNode', 'ToNode','weight']]

edges_df['Occurences'] = train_df.groupby(['FromNode', 'ToNode'],as_index=False).count()['weight']

edges_df['year'] = df.groupby(['FromNode', 'ToNode'],as_index=False).min()['year']


# Read labelled data
How the labelled data is gathered is mentioned in some cells below

In [7]:
test_edges = pd.read_csv('safety.csv')

In [8]:
test_edges.drop(columns = ['Unnamed: 0'], inplace = True)
test_edges.head()

Unnamed: 0,FromNode,ToNode,y,shortest distance,Sum of Papers,Sum of Neighbours
0,Buyer1002,Escort1426,True,3.0,160,103
1,Buyer1002,Escort4403,True,3.0,18,18
2,Buyer1007,Escort3873,True,5.0,29,17
3,Buyer1010,Escort5000,True,3.0,22,22
4,Buyer1013,Escort5178,True,7.0,9,8


In [None]:
# THIS IS HOW TEST EDGES WAS CREATED 

from networkx.algorithms import bipartite
TG = nx.Graph()
# Add nodes with the node attribute "bipartite"
TG.add_nodes_from(edges['FromNode'].unique().tolist(), bipartite=0)
TG.add_nodes_from(edges['ToNode'].unique().tolist(),  bipartite=1)
# Add edges only between nodes of opposite node sets
r = edges[['FromNode', 'ToNode']].to_records(index=False)
TG.add_edges_from(list(r)) 

buyer_nodes = {n for n, d in TG.nodes(data=True) if d["bipartite"] == 0}
escort_nodes = set(TG) - buyer_nodes

f = []; t = []
i = 0
for node in buyer_nodes: 
    i+=1
    if i %1000 == 0:
        print(i, len(buyer_nodes))
    no_neighbours = nx.non_neighbors(TG, node)
    for n in no_neighbours:
        if n in escort_nodes:
            f.append(node)
            t.append(n)

temp = pd.DataFrame()
temp['FromNode'] = f
temp['ToNode'] = t
temp['y'] = False

test_edges = pd.concat([test_edges,temp])

In [9]:
#make graph at present phase
g = nx.Graph()
# Add nodes with the node attribute "bipartite"
g.add_nodes_from(edges_df['FromNode'].unique().tolist(), bipartite=0)
g.add_nodes_from(edges_df['ToNode'].unique().tolist(),  bipartite=1)
# Add edges only between nodes of opposite node sets
r = edges_df[['FromNode', 'ToNode']].to_records(index=False)
g.add_edges_from(list(r)) 


In [10]:
len(g.edges)

30098

In [11]:
buyer_nodes = {n for n, d in g.nodes(data=True) if d["bipartite"] == 0}
escort_nodes = set(g) - buyer_nodes

degreees = [t[1] for t in g.degree]
np.mean(degreees)

4.404800234157764

In [12]:

print("Unique Buyers", len(edges_df['FromNode'].unique()), 'Unique escorts', len(edges_df['ToNode'].unique()))
unique_nodes = edges_df['FromNode'].unique().tolist()
unique_nodes.extend(edges_df['ToNode'].unique())


nodes = pd.DataFrame()
nodes['Node'] = list(unique_nodes)

Unique Buyers 8386 Unique escorts 5280


In [None]:
##DONE 
# sum of papers 
sum_papers = []; sum_papers2 = []
for i, row in nodes.iterrows():
    #get all edges in which row[node] is in
    temp = edges_df[(edges_df['FromNode'] == row['Node']) | (edges_df['ToNode'] == row['Node'])]
    sum_papers.append(sum(temp['Occurences']))
    if sum(temp['Occurences']) == 0: 
        sum_papers2.append(0)
    else:
        sum_papers2.append(sum(temp['Occurences'])/temp.shape[0])#only if weights differ from one

#%%

nodes['Sum of Papers'] = sum_papers


In [None]:
#DONE
### SUm of Neighbours
from networkx.classes.function import neighbors
temp = nx.Graph(g)
sum_neighbours = []

for i, row in nodes.iterrows():
    sum_neighbours.append(temp.degree(row['Node'])) #len([n for n in temp.neighbors(row['Node'])])) # degree


#%%

nodes['Sum of Neighbours'] = sum_neighbours


In [None]:
#DONE
### WEIGHTED NEIGHBOURS

from networkx.classes.function import neighbors
temp = nx.Graph(g)
sum_neighbours = []

for i, row in nodes.iterrows():
    sum_neighbours.append(temp.degree(row['Node'],weight = 'Occurences')) #len([n for n in temp.neighbors(row['Node'])])) # degree
nodes['Weighted Sum of Neighbours'] = sum_neighbours


In [None]:
# DONE
### SHortest Distance
shortest_distance = [] 
j = 0
start = True
for i, row in test_edges.iterrows():
    if j %1000000 == 0: 
        print (j, test_edges.shape, 'index', i, 'shortest distance', len(shortest_distance))
    j+=1
    try:
        X = nx.shortest_path_length(g, row['FromNode'], row['ToNode'])
        shortest_distance.append(X)
    except nx.NetworkXNoPath:
        shortest_distance.append(np.inf)


#%%

len(shortest_distance)

#%%


test_edges['shortest distance'] = shortest_distance


In [None]:
#DONE
papers_dict = pd.Series(nodes['Sum of Papers'].values,index=nodes['Node']).to_dict()

#%%

neighbours_dict = pd.Series(nodes['Sum of Neighbours'].values,index=nodes['Node']).to_dict()

#%%

papers = []; neighbours = [] 
j = 0
for i, row in test_edges.iterrows():
    j+=1
    if j %1000000 == 0: 
        print (j, test_edges.shape, 'index', i, 'length', len(papers))
    papers.append(papers_dict[row['FromNode']] + papers_dict[row['ToNode']])
    neighbours.append(neighbours_dict[row['FromNode']] + neighbours_dict[row['ToNode']])

#%%

test_edges['Sum of Papers'] = papers
test_edges['Sum of Neighbours'] = neighbours


# Sampling

In [None]:
#random undersampling
import random
random.seed(42)
neg_indices = test_edges.loc[test_edges['y']== False].index.to_list()
neg_sample = random.sample(neg_indices, 60)
test_edges_sample = test_edges.loc[neg_sample]
test_edges_sample = pd.concat([test_edges_sample, test_edges.loc[test_edges['y']== True]])

In [None]:
#neighbourhood sample
test_edges_neighbourhood = test_edges.loc[test_edges['shortest distance'] == 3]

# METHODS

### Dummy Classifier

In [14]:


from sklearn import model_selection
from sklearn import metrics


def dummy(x):
    avg_acc = []; avg_precision =[]; avg_recall = []; avg_f1 = []; avg_auc = []
    for seed in [0,1,2,3,4]:
        acc =[]; precision = []; recall = []; f1 = []; auc = []
        cv = model_selection.StratifiedKFold(n_splits=5, shuffle = True, random_state = seed)
        for train_index, test_index in cv.split(x, x['y']):
            x_tr, x_te = x.iloc[train_index], x.iloc[test_index]
            if x_tr.loc[x_tr['y']==1,'y'].shape[0] <=  x_tr.loc[x_tr['y']==0,'y'].shape[0]:
                majority_class = 0
            else: 
                majority_class = 1
            print('majority class: ', majority_class)
            y_pred = [majority_class for i in range(len(x_te['y']))]
            y_te = x_te['y'].tolist()


            acc.append(metrics.accuracy_score(y_te, y_pred))
            precision.append(metrics.precision_score(y_te, y_pred))
            recall.append(metrics.recall_score(y_te, y_pred))
            f1.append(metrics.f1_score(y_te, y_pred))
            auc.append(metrics.roc_auc_score(y_te, y_pred))
        avg_acc.append(np.mean(acc))
        avg_precision.append(np.mean(precision))
        avg_recall.append(np.mean(recall))
        avg_f1.append(np.mean(f1))
        avg_auc.append(np.mean(auc))
    print('acc', avg_acc, np.mean(avg_acc))
    print('precision',avg_precision,  np.mean(avg_precision))
    print('recall',avg_recall, np.mean(avg_recall))
    print('f1', avg_f1,  np.mean(avg_f1))
    print('AUC',avg_auc, np.mean(avg_auc))

#%%


In [14]:
dummy(test_edges[['FromNode','ToNode', 'y']])

majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


majority class:  0


  _warn_prf(average, modifier, msg_start, len(result))


acc [0.9999326974956754, 0.9999326974956754, 0.9999326974956754, 0.9999326974956754, 0.9999326974956754] 0.9999326974956754
precision [0.0, 0.0, 0.0, 0.0, 0.0] 0.0
recall [0.0, 0.0, 0.0, 0.0, 0.0] 0.0
f1 [0.0, 0.0, 0.0, 0.0, 0.0] 0.0
AUC [0.5, 0.5, 0.5, 0.5, 0.5] 0.5


### Jaccard

In [16]:
def jaccard(x, threshold = 0.5, jaccard = None, verbose = True):
    avg_acc = []; avg_precision =[]; avg_recall = []; avg_f1 = []; avg_auc = []

    for seed in [0,1,2,3,4]:
        #print('seed', seed)
        acc =[]; precision = []; recall = []; f1 = []; auc = []
        cv = model_selection.StratifiedKFold(n_splits=5, shuffle = True, random_state = seed)
        for train_index, test_index in cv.split(x,x['y']):
            x_tr, x_te = x.iloc[train_index], x.iloc[test_index]
            y_pred = []
            y_te = []
            for i, row in x_te.iterrows(): 
                if (row['FromNode'], row['ToNode']) in jaccard.keys():
                    key = (row['FromNode'], row['ToNode'])
                else: 
                    key = (row['ToNode'], row['FromNode'])
                if jaccard[key] < threshold:
                    y_pred.append(0)
                else: 
                    y_pred.append(1)
                y_te.append(row['y'])
            acc.append(metrics.accuracy_score(y_te, y_pred))
            precision.append(metrics.precision_score(y_te, y_pred))
            recall.append(metrics.recall_score(y_te, y_pred))
            f1.append(metrics.f1_score(y_te, y_pred))
            auc.append(metrics.roc_auc_score(y_te, y_pred))
        avg_acc.append(np.mean(acc))
        avg_precision.append(np.mean(precision))
        avg_recall.append(np.mean(recall))
        avg_f1.append(np.mean(f1))
        avg_auc.append(np.mean(auc))
    if verbose:
        print('acc', avg_acc, np.mean(avg_acc))
        print('precision',avg_precision,  np.mean(avg_precision))
        print('recall',avg_recall, np.mean(avg_recall))
        print('f1', avg_f1,  np.mean(avg_f1))
        print('AUC',avg_auc, np.mean(avg_auc))
    else:
        print('acc',np.mean(avg_acc))
        print('precision',np.mean(avg_precision))
        print('recall',np.mean(avg_recall))
        print('f1', np.mean(avg_f1))
        print('AUC',np.mean(avg_auc))  


In [15]:
test_edges['jtuple'] = list(zip(test_edges.FromNode, test_edges.ToNode))
jaccard_similarity = list(nx.jaccard_coefficient(g, test_edges['jtuple'].tolist()))

In [17]:
jaccard_dict= {}
for j in list(jaccard_similarity):
    jaccard_dict[(j[0], j[1])] = j[2]

In [18]:
jaccard(test_edges[['FromNode','ToNode', 'y']],jaccard= jaccard_dict,threshold = 0.1)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

acc [0.9999326974956754, 0.9999326974956754, 0.9999326974956754, 0.9999326974956754, 0.9999326974956754] 0.9999326974956754
precision [0.0, 0.0, 0.0, 0.0, 0.0] 0.0
recall [0.0, 0.0, 0.0, 0.0, 0.0] 0.0
f1 [0.0, 0.0, 0.0, 0.0, 0.0] 0.0
AUC [0.5, 0.5, 0.5, 0.5, 0.5] 0.5


### Node2Vec
Implementation from https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/node2vec-link-prediction.html. This code is merely altered for my purpose.


In [15]:

from stellargraph import StellarGraph, datasets
from stellargraph.data import EdgeSplitter
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import multiprocessing

In [20]:

def node2vec_embedding(graph, name):
    p = 1.0
    q = 1.0
    dimensions = 128
    num_walks = 10
    walk_length = 80
    window_size = 10
    num_iter = 1
    workers = multiprocessing.cpu_count()
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks for '{name}': {len(walks)}")

    model = Word2Vec(
        walks,
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,
        workers=workers,
        iter=num_iter,
    )

    def get_embedding(u):
        return model.wv[u]

    return get_embedding


# 1. link embeddings
def link_examples_to_features(link_examples, transform_node, binary_operator):
    return [
        binary_operator(transform_node(src), transform_node(dst))
        for src, dst in link_examples
    ]


# 2. training classifier
def train_link_prediction_model(
    link_examples, link_labels, get_embedding, binary_operator, seed
):
    clf = link_prediction_classifier(seed)
    link_features = link_examples_to_features(
        link_examples, get_embedding, binary_operator
    )
    clf.fit(link_features, link_labels)
    return clf


def link_prediction_classifier(seed, max_iter=2000):
    lr_clf = LogisticRegression(C=10, max_iter=max_iter, random_state = seed)
    #LogisticRegressionCV(Cs=10, cv=3, scoring="roc_auc", max_iter=max_iter, random_state = seed)
    return Pipeline(steps=[("sc", StandardScaler()), ("clf", lr_clf)])


# 3. and 4. evaluate classifier
def evaluate_link_prediction_model(
    clf, link_examples_test, link_labels_test, get_embedding, binary_operator
):
    link_features_test = link_examples_to_features(
        link_examples_test, get_embedding, binary_operator
    )
    score = evaluate_roc_auc(clf, link_features_test, link_labels_test)
    return score


def evaluate_roc_auc(clf, link_features, link_labels):
    predicted = clf.predict(link_features)
    return metrics.accuracy_score(link_labels, predicted), metrics.precision_score(link_labels, predicted), metrics.recall_score(link_labels, predicted),metrics.f1_score(link_labels, predicted), metrics.roc_auc_score(link_labels, predicted)


#%%

def operator_hadamard(u, v):
    return u * v


def operator_l1(u, v):
    return np.abs(u - v)


def operator_l2(u, v):
    return (u - v) ** 2


def operator_avg(u, v):
    return (u + v) / 2.0


#%%

def node2vec(x, seed = 42): 
    # get training
    edges = edges_df[['FromNode', 'ToNode', 'Occurences']]
    edges = edges.rename(columns = {'FromNode':'source', 'ToNode':'target', 'Occurences': 'weight'})
    train_graph = StellarGraph(edges=edges)
    print(train_graph.info())
    
    # test graph: ever needed? 
    #edges = edges_df[['Node1', 'Node2', 'Occurances']]
    #edges = edges.rename(columns = {'Node1':'source', 'Node2':'target', 'Occurances': 'weight'})
    #test_graph = StellarGraph(edges=edges)
    #print(test_graph.info())
    


    embedding_train = node2vec_embedding(train_graph, "train")
    
    
    y = test_edges['y']
    
    
    
           
    for binary_operator in  [operator_hadamard, operator_l1, operator_l2, operator_avg]: 
        avg_acc = []; avg_precision =[]; avg_recall = []; avg_f1 = []; avg_auc = []
        for seed in [0]:#,1,2,3,4]:
            cv = model_selection.KFold(n_splits=2, shuffle = True, random_state = seed) 
            for train_index, test_index in cv.split(x):
                acc =[]; precision =[]; recall = []; f1 =[]; auc = []
                index = random.sample(x.index.to_list(), int(len(x)/5))
                y_tr = y.iloc[train_index].to_numpy()
                x_tr = x.iloc[train_index].to_numpy()
                y_te = y.iloc[test_index].to_numpy()
                x_te = x.iloc[test_index].to_numpy()

                clf = train_link_prediction_model(x_tr,y_tr, embedding_train, binary_operator,seed)
                a, p, r, f, ac = evaluate_link_prediction_model(clf,x_te,y_te,embedding_train,binary_operator)

                acc.append(a)
                precision.append(p)
                recall.append(r)
                f1.append(f)
                auc.append(ac)
            avg_acc.append(np.mean(acc))
            avg_precision.append(np.mean(precision))
            avg_recall.append(np.mean(recall))
            avg_f1.append(np.mean(f1))
            avg_auc.append(np.mean(auc))
            
        print('---')
        print(binary_operator.__name__)
        print('acc', avg_acc, np.mean(avg_acc))
        print('precision',avg_precision,  np.mean(avg_precision))
        print('recall',avg_recall, np.mean(avg_recall))
        print('f1', avg_f1,  np.mean(avg_f1))
        print('AUC',avg_auc, np.mean(avg_auc))



In [21]:
node2vec(test_edges[['FromNode', 'ToNode']], 42)

StellarGraph: Undirected multigraph
 Nodes: 13666, Edges: 30098

 Node types:
  default: [13666]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [30098]
        Weights: range=[1, 36], mean=1.28391, std=1.18903
        Features: none
Number of random walks for 'train': 136660


MemoryError: Unable to allocate 21.1 GiB for an array with shape (22123991, 128) and data type float64

### Bagging

In [15]:
### Bagging
from sklearn.ensemble import BaggingClassifier


from sklearn import model_selection
from sklearn import metrics

#%%

def bagging(x,y):
    avg_acc = []; avg_precision =[]; avg_recall = []; avg_f1 = []; avg_auc = []
    
    x = x.replace(np.inf, -1)
    for seed in [0,1,2,3,4]:
        print(seed)
        acc =[]; precision = []; recall = []; f1 = []; auc = []
        cv = model_selection.StratifiedKFold(n_splits=5, shuffle = True, random_state = seed)
        clf =BaggingClassifier( n_estimators=10, random_state=42)
        for train_index, test_index in cv.split(x,y):
            x_tr, x_te = x.iloc[train_index], x.iloc[test_index]
            y_tr, y_te = y.iloc[train_index], y.iloc[test_index]
            clf.fit(x_tr, y_tr)
            y_pred = clf.predict(x_te)
            acc.append(metrics.accuracy_score(y_te, y_pred))
            precision.append(metrics.precision_score(y_te, y_pred))
            recall.append(metrics.recall_score(y_te, y_pred))
            f1.append(metrics.f1_score(y_te, y_pred))
            auc.append(metrics.roc_auc_score(y_te, y_pred))
            
            print(acc)
            print(precision)
            print(recall)
            print(f1)
            print(auc)
        avg_acc.append(np.mean(acc))
        avg_precision.append(np.mean(precision))
        avg_recall.append(np.mean(recall))
        avg_f1.append(np.mean(f1))
        avg_auc.append(np.mean(auc))
        
    print('acc', avg_acc, np.mean(avg_acc))
    print('precision',avg_precision,  np.mean(avg_precision))
    print('recall',avg_recall, np.mean(avg_recall))
    print('f1', avg_f1,  np.mean(avg_f1))
    print('AUC',avg_auc, np.mean(avg_auc))


In [16]:
x = test_edges[['Sum of Neighbours', 'Sum of Papers', 'shortest distance']]
y = test_edges['y']

bagging(x,y)

0
[0.9999310703074954]
[0.0]
[0.0]
[0.0]
[0.49999920895025324]
[0.9999310703074954, 0.9999314093059831]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[0.49999920895025324, 0.49999937846091325]
[0.9999310703074954, 0.9999314093059831, 0.9999317482967585]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.49999920895025324, 0.49999937846091325, 0.4999995479715222]
[0.9999310703074954, 0.9999314093059831, 0.9999317482967585, 0.9999316352972497]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.49999920895025324, 0.49999937846091325, 0.4999995479715222, 0.49999943496446664]
[0.9999310703074954, 0.9999314093059831, 0.9999317482967585, 0.9999316352972497, 0.9999322002947931]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.49999920895025324, 0.49999937846091325, 0.4999995479715222, 0.49999943496446664, 0.4999997174822333]
acc [0.999931612700456] 0.999931612700456
precision [0.0] 0.0
recall [0.0] 0.0
f1 [0.0] 0.0
AUC [0.49999945756587777] 0.49999945756587777

### SVD

In [17]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold

#%%

def svd(x):
    avg_acc = []; avg_precision =[]; avg_recall = []; avg_f1 = []; avg_auc = []
    
    
    algo = SVD()
    reader = Reader(rating_scale=(0,1))
    for seed in [0]:
        acc =[]; precision = []; recall = []; f1 = []; auc = []
        cv = model_selection.KFold(n_splits=5, shuffle = True, random_state = seed)
        accuracy = []
        # define a cross-validation iterator
        #kf = KFold(n_splits=5, random_state =42)
        cv = model_selection.StratifiedKFold(n_splits=5, shuffle = True, random_state = seed)
        algo = SVD()
        for train_index, test_index in cv.split(x,x['y']):
        # The columns must correspond to user id, item id and ratings (in that order).
            x_tr, x_te = x.iloc[train_index], x.iloc[test_index]
            trainset = Dataset.load_from_df(x_tr, reader)
            testset = Dataset.load_from_df(x_te, reader)
        #for trainset, testset in kf.split(data):

            # train and test algorithm.
            trainset = trainset.build_full_trainset()
            algo.fit(trainset)
            testset = testset.build_full_trainset().build_testset()
            predictions = algo.test(testset)
            y_te = []; y_pred = []
            for p in range(len(predictions)):
                y_te.append(testset[p][2])
                if predictions[p].est <= 0.5:
                    y_pred.append(0)
                else:
                    y_pred.append(1)
            acc.append(metrics.accuracy_score(y_te, y_pred))
            precision.append(metrics.precision_score(y_te, y_pred))
            recall.append(metrics.recall_score(y_te, y_pred))
            f1.append(metrics.f1_score(y_te, y_pred))
            auc.append(metrics.roc_auc_score(y_te, y_pred))
        avg_acc.append(np.mean(acc))
        avg_precision.append(np.mean(precision))
        avg_recall.append(np.mean(recall))
        avg_f1.append(np.mean(f1))
        avg_auc.append(np.mean(auc))
        
    print('acc', avg_acc, np.mean(avg_acc))
    print('precision',avg_precision,  np.mean(avg_precision))
    print('recall',avg_recall, np.mean(avg_recall))
    print('f1', avg_f1,  np.mean(avg_f1))
    print('AUC',avg_auc, np.mean(avg_auc))

In [18]:
svd(test_edges[['FromNode', 'ToNode', 'y']])


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


acc [0.9999326974956754] 0.9999326974956754
precision [0.0] 0.0
recall [0.0] 0.0
f1 [0.0] 0.0
AUC [0.5] 0.5
