In [1]:
import networkx as nx
import torch
import numpy as np
import pandas as pd
import random
import community as community_louvain
from sklearn.model_selection import train_test_split
from itertools import product

# allCategories = pd.read_csv("categories.csv", index_col=0)

class Feedforward(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Feedforward, self).__init__()

        self.input_size = input_size
        self.hidden_size  = hidden_size

        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.fc3 = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.fc4 = torch.nn.Linear(self.hidden_size, 1)

        self.relu = torch.nn.ReLU()
        self.out_act = torch.nn.Sigmoid()

    def forward(self, x):
        output = self.fc1(x)
        output = self.relu(output)

        output = self.fc2(output)
        output = self.relu(output)

        output = self.fc3(output)
        output = self.relu(output)

        output = self.fc4(output)

        output = self.out_act(output)

        return output


In [2]:
#Network to evaluate links
GMissingEdges = nx.read_gml("GraphMissingEdges.gml")

# To local test -----------------------------------------
# Remove 20% das arestas
proportion_edges = 0.2
edge_subset = random.sample(GMissingEdges.edges(), int(proportion_edges * GMissingEdges.number_of_edges()))

# Cria uma cópia do grafo e remove arestas
GMissingEdgesTrain = GMissingEdges.copy()
GMissingEdgesTrain.remove_edges_from(edge_subset)
# print(edge_subset)
# To local test -----------------------------------------

dfGraphEdges = pd.DataFrame.from_dict(dict(GMissingEdgesTrain.edges()), orient='index')
dfGraphEdges = dfGraphEdges.reset_index()
dfGraphEdges = dfGraphEdges.rename(columns={'level_0': 'idx', 'level_1':'edge'})

print(dfGraphEdges)
print(dfGraphEdges.index)
print(dfGraphEdges.columns)

                          idx                    edge  weight
0      --DaPTJW3-tB1vP-PfdTEg  EL-iUP2pr6aJE2ZRVyNwyA       1
1      --DaPTJW3-tB1vP-PfdTEg  MhiBpIBNTCAm1Xd3WzRzjQ       1
2      --DaPTJW3-tB1vP-PfdTEg  dT70QOjn-o9pkdSAAPdSWQ       1
3      --DaPTJW3-tB1vP-PfdTEg  mzREMIknfmagJugibXrCsQ       1
4      --DaPTJW3-tB1vP-PfdTEg  zUgEycv_0a6hKu0nIkH1rA       1
...                       ...                     ...     ...
42387  zzUj3ej4vm_DtvRxNvWDEw  tJcpzXzykNSLuzWwa1JQUw       1
42388  zzUj3ej4vm_DtvRxNvWDEw  trzuDWvJqEIxtqjsKHCrhg       1
42389  zzf3RkMI1Y2E1QaZqeU8yA  mZRKH9ngRY92bI_irrHq6w       1
42390  zzvlwkcNR1CCqOPXwuvz2A  4Jscimulh38Rq2hOgjb2Hg       1
42391  zzvlwkcNR1CCqOPXwuvz2A  _xAJZOKBMPOe47p1MphB2w       1

[42392 rows x 3 columns]
RangeIndex(start=0, stop=42392, step=1)
Index(['idx', 'edge', 'weight'], dtype='object')


In [3]:
# Create file to test using data removed
dfToEvaluate = pd.DataFrame(edge_subset)
dfToEvaluate = dfToEvaluate.rename(columns={0:'venue1',1:'venue2'})
dfToEvaluate.to_csv("edgesToEvaluateTest.csv", index_label='linkID')
dfToEvaluate

Unnamed: 0,venue1,venue2
0,Qme4ZWzZEdZUwnpo-sPPIw,IE1lzZvdD9UnGeB1kXjuOQ
1,K2CdjES-IZDDEgetOnf0Vw,Rl_l6TWkBzr6Op7egdZ-YA
2,g3t2AQQVT2tWD64Huu7xVA,-TjzQsimpriXVYaH3HdtQA
3,wVc3g-YfyDvkOLDecpr4DA,schqgX7r7lBKxZF6MBqTUA
4,ErnZ6XLbX3aG9jz3OcyzLg,n2NRR4N-kmeDvhHJivHGkA
...,...,...
10593,sEAKw3MZkER1u_1fzIeD3g,Ki4aSG30_W_2KPjqT50o1w
10594,nb2Bpt5uGGZvNhnkEGMnpw,GGfavkYjctCjBpz3Zun4RA
10595,L-Yj3Y1bYKTSg4uE9KTVBw,LBHbyID7tcaGszznHtnCrw
10596,xrA53WYhRCml2bMfpQ_Hig,cQK9M2JAwETQnnBoYyua5A


In [4]:

GMissingEdges.nodes(data=True)
# GMissingEdges.edges(data=True)

# https://stackoverflow.com/questions/35046087/make-networkx-node-attributes-into-pandas-dataframe-columns
dfGraphNodes = pd.DataFrame.from_dict(dict(GMissingEdgesTrain.nodes(data=True)), orient='index')
# dfGraphNodes.drop(['name','categories'],axis=1,inplace=True)

# Calculate louvain communities
# louvainPartition = community_louvain.best_partition(GMissingEdges)
# dfLouvainPartition = pd.DataFrame.from_dict(louvainPartition, orient='index')
# dfLouvainPartition = dfLouvainPartition.rename(columns={0: 'cluster'})

# Add Louvain clusterization
# dfGraphNodes = pd.concat([dfGraphNodes, dfLouvainPartition], axis=1)
dfGraphNodes = dfGraphNodes.reset_index()

# print(dfGraphNodes, dfGraphEdges)
print(dfGraphNodes)

                        index  longitude   latitude               categories  \
0      DHCdMpffUncZWxaiYNHSZw -79.434315  43.646220          140,321,616,754   
1      huCf4kwsoGl1YUHCjMJG5A -79.397848  43.631814                  872,828   
2      a6FJ9HcERvtGF4PYILF_fA -79.378986  43.654590       609,73,570,872,625   
3      b8cwL5L3241tOcqXywEfLw -79.353691  43.683799          616,778,754,245   
4      YQ_z9iDgdNjwJhZ-owHSjA -79.396689  43.674244  562,402,578,743,288,668   
...                       ...        ...        ...                      ...   
11075  q3bkTWv854XTLXq1F4pnKg -79.409645  43.645954              650,284,743   
11076  8pGD3zt6HEL2xzaT3lqMFQ -79.408460  43.642893       124,66,323,680,750   
11077  iByQmTmTdO7hP4n1grSSWQ -79.422871  43.662417                  151,124   
11078  lkM72Y21bjBqUGaW7iL7tQ -79.293509  43.803568          457,277,621,124   
11079  vUef2kuyYWG7phLySoRJGw -79.357862  43.676379              611,230,359   

      stars                           n

In [5]:
dfGraph = pd.merge(dfGraphNodes, dfGraphEdges, how="right", right_on=["idx"], left_on=["index"])
dfGraph.drop(['idx'],axis=1,inplace=True)
# dfGraph = dfGraph.set_index("index")

print(dfGraph)
print(dfGraph.info())
print(dfGraph.index)

# dfGraph.to_csv("dfGraph.csv")

                        index  longitude   latitude              categories  \
0      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807                 124,850   
1      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807                 124,850   
2      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807                 124,850   
3      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807                 124,850   
4      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807                 124,850   
...                       ...        ...        ...                     ...   
42387  zzUj3ej4vm_DtvRxNvWDEw -79.402828  43.643715             631,256,444   
42388  zzUj3ej4vm_DtvRxNvWDEw -79.402828  43.643715             631,256,444   
42389  zzf3RkMI1Y2E1QaZqeU8yA -79.370983  43.651883  570,112,224,800,389,73   
42390  zzvlwkcNR1CCqOPXwuvz2A -79.393727  43.655822     277,124,803,621,192   
42391  zzvlwkcNR1CCqOPXwuvz2A -79.393727  43.655822     277,124,803,621,192   

      stars                name reviewCount        

In [6]:
dfEdgesToEvaluate = pd.read_csv('edgesToEvaluate.csv')
dfEdgesToEvaluate.drop(['linkID'], axis=1, inplace=True)
dfEdgesToEvaluate.rename(columns={'venue1': 'index', 'venue2':'edge'},inplace=True)

# Generate zero cases to help test
combinedList = list(product(dfEdgesToEvaluate['index'], dfEdgesToEvaluate['edge']))
# print(combinedList)

dfT = pd.DataFrame(combinedList)
dfT.rename(columns={0: 'index', 1:'edge'},inplace=True)

dfGraph = pd.merge(dfT, dfGraph, how="outer", on=["index", "edge"])
dfGraph.drop(['longitude','latitude','stars','reviewCount','cluster', 'categories', 'name'], axis=1, inplace=True, errors='ignore')

# Remove edges to evaluate
dfGraph = pd.concat([dfGraph, dfEdgesToEvaluate])
dfGraph.drop_duplicates(subset=["index","edge"],keep=False, inplace=True)

dfGraph = pd.merge(dfGraph, dfGraphNodes, how="inner", on="index")

# # weight = 0 and existEdge = False
dfGraph['weight'] = dfGraph['weight'].fillna(0)
dfGraph['existEdge'] = np.where((dfGraph.weight > 0 ), 'True', 'False')
# print(dfGraph.info())
print(dfGraph)

dfGraph = dfGraph.astype({"stars": float, "reviewCount": int, "existEdge": bool})

dfGraph.to_csv("resultTest.csv")

                         index                    edge  weight  longitude  \
0       klu0zF1rWAoNAhKPsFyUog  oQFMJqDwNXbNMRbcmIYRYg     0.0 -79.363541   
1       klu0zF1rWAoNAhKPsFyUog  egLYFnycp8ktxMCvilFdLw     0.0 -79.363541   
2       klu0zF1rWAoNAhKPsFyUog  Nxg73OigmRQQq0d1pKtkUQ     0.0 -79.363541   
3       klu0zF1rWAoNAhKPsFyUog  hyXNS3tSmi6njhBjgo8eGw     0.0 -79.363541   
4       klu0zF1rWAoNAhKPsFyUog  Sflaxtv6SR0lgbL7-pIGPQ     0.0 -79.363541   
...                        ...                     ...     ...        ...   
246776  zzUj3ej4vm_DtvRxNvWDEw  tJcpzXzykNSLuzWwa1JQUw     1.0 -79.402828   
246777  zzUj3ej4vm_DtvRxNvWDEw  trzuDWvJqEIxtqjsKHCrhg     1.0 -79.402828   
246778  zzf3RkMI1Y2E1QaZqeU8yA  mZRKH9ngRY92bI_irrHq6w     1.0 -79.370983   
246779  zzvlwkcNR1CCqOPXwuvz2A  4Jscimulh38Rq2hOgjb2Hg     1.0 -79.393727   
246780  zzvlwkcNR1CCqOPXwuvz2A  _xAJZOKBMPOe47p1MphB2w     1.0 -79.393727   

         latitude              categories stars                name  \
0   

In [7]:
# Delete random ids because the structure is too heavy
print("Size of dataframe before deletion: %s" % (len(dfGraph)))
delSize = int(len(dfGraph)*0.8) # Come back to original size of dataframe +-

deletedItems = random.sample(range(len(dfGraph)), delSize)


dfGraph.drop(dfGraph.index[deletedItems], inplace=True)

print("Elements deleted %s" % (delSize))
print("Size of dataframe after deletion: %s" % (len(dfGraph)))

dfGraph = dfGraph.reset_index()
dfGraph.drop('level_0',axis=1,inplace=True)
# print (deletedItems)
print(dfGraph)


Size of dataframe before deletion: 246781
Elements deleted 197424
Size of dataframe after deletion: 49357
                        index                    edge  weight  longitude  \
0      klu0zF1rWAoNAhKPsFyUog  eSp5ge9VAwTywZKlJ_LBvA     0.0 -79.363541   
1      klu0zF1rWAoNAhKPsFyUog  jqvVB8AezYhCuVinJf5fWA     0.0 -79.363541   
2      klu0zF1rWAoNAhKPsFyUog  YipJWgvwnCwYd59IlssR7Q     0.0 -79.363541   
3      klu0zF1rWAoNAhKPsFyUog  39nPYUxkRn8SQY29C322UQ     0.0 -79.363541   
4      klu0zF1rWAoNAhKPsFyUog  bYGn37k0KgleABKMWgjnMw     0.0 -79.363541   
...                       ...                     ...     ...        ...   
49352  zzUj3ej4vm_DtvRxNvWDEw  7peyM7sZuXQPZCFewwyPDA     1.0 -79.402828   
49353  zzUj3ej4vm_DtvRxNvWDEw  EA7NU-fwH9VtCCVd3yO1fg     1.0 -79.402828   
49354  zzUj3ej4vm_DtvRxNvWDEw  L-Rhai11kLsL6tKvOGNG8Q     1.0 -79.402828   
49355  zzUj3ej4vm_DtvRxNvWDEw  TdI6Suf6NuT50IJD1dTqmQ     1.0 -79.402828   
49356  zzf3RkMI1Y2E1QaZqeU8yA  mZRKH9ngRY92bI_irrHq6w     

In [8]:
# Replace place ids to integers ids
placesId = dfGraphNodes['index'].to_dict()
# print(placesId)

# for index,place in placesId.items():
#     dfGraph.replace({'index':{place:index}},inplace=True)
#     dfGraph.replace({'edge':{place:index}},inplace=True)

dfGraph


Unnamed: 0,index,edge,weight,longitude,latitude,categories,stars,name,reviewCount,existEdge
0,klu0zF1rWAoNAhKPsFyUog,eSp5ge9VAwTywZKlJ_LBvA,0.0,-79.363541,43.709978,12425644432517631,3.5,Local Leaside,104,True
1,klu0zF1rWAoNAhKPsFyUog,jqvVB8AezYhCuVinJf5fWA,0.0,-79.363541,43.709978,12425644432517631,3.5,Local Leaside,104,True
2,klu0zF1rWAoNAhKPsFyUog,YipJWgvwnCwYd59IlssR7Q,0.0,-79.363541,43.709978,12425644432517631,3.5,Local Leaside,104,True
3,klu0zF1rWAoNAhKPsFyUog,39nPYUxkRn8SQY29C322UQ,0.0,-79.363541,43.709978,12425644432517631,3.5,Local Leaside,104,True
4,klu0zF1rWAoNAhKPsFyUog,bYGn37k0KgleABKMWgjnMw,0.0,-79.363541,43.709978,12425644432517631,3.5,Local Leaside,104,True
...,...,...,...,...,...,...,...,...,...,...
49352,zzUj3ej4vm_DtvRxNvWDEw,7peyM7sZuXQPZCFewwyPDA,1.0,-79.402828,43.643715,631256444,3.0,Wheat Sheaf Tavern,114,True
49353,zzUj3ej4vm_DtvRxNvWDEw,EA7NU-fwH9VtCCVd3yO1fg,1.0,-79.402828,43.643715,631256444,3.0,Wheat Sheaf Tavern,114,True
49354,zzUj3ej4vm_DtvRxNvWDEw,L-Rhai11kLsL6tKvOGNG8Q,1.0,-79.402828,43.643715,631256444,3.0,Wheat Sheaf Tavern,114,True
49355,zzUj3ej4vm_DtvRxNvWDEw,TdI6Suf6NuT50IJD1dTqmQ,1.0,-79.402828,43.643715,631256444,3.0,Wheat Sheaf Tavern,114,True


In [10]:
# Generate data without edges to help in test cases

device = "cuda" if torch.cuda.is_available() else "cpu"
device = 'cpu'
print("Using {}".format(device))

focus = dfGraph['existEdge']
data = dfGraph.iloc[:,[3,4]]
# data = data.astype({"stars": float, "reviewCount": int})

# print(focus)
# print(data.info())
# print(dfGraph.columns)

Y_tensor = torch.tensor(focus)
X_tensor = torch.tensor(data.to_numpy())
print(Y_tensor.shape)
print(X_tensor.shape)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, Y_tensor, test_size = 0.20, random_state=5)

print("Training data:")
print(X_train.shape)
print(y_train.shape)

print("Test data:")
print(X_test.shape)
print(y_test.shape)

# Cast fields to float to avoid compatibility problems
X_train = X_train.float().to(device)
y_train = y_train.float().to(device)
X_test = X_test.float().to(device)
y_test = y_test.float().to(device)

Using cpu
torch.Size([49357])
torch.Size([49357, 2])
Training data:
torch.Size([39485, 2])
torch.Size([39485])
Test data:
torch.Size([9872, 2])
torch.Size([9872])


In [11]:
#input = 8 (número de features), e hidden size = 20 (número de neurôneos na camada escondida)
num_features = X_train.shape[1]
print("num_features: "+str(num_features))

model = Feedforward(num_features, 20).to(device)
print(model)

criterion = torch.nn.MSELoss()
# criterion = torch.nn.BCEWithLogitsLoss()

# lr = Learning rate
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
# optimizer = torch.optim.SGD(model.parameters(), lr = 0.001)

num_features: 2
Feedforward(
  (fc1): Linear(in_features=2, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=20, bias=True)
  (fc3): Linear(in_features=20, out_features=20, bias=True)
  (fc4): Linear(in_features=20, out_features=1, bias=True)
  (relu): ReLU()
  (out_act): Sigmoid()
)


In [12]:
model.eval()
y_pred = model(X_test)
before_train = criterion(y_pred, y_test) 
print('Teste - perda antes do treinamento' , before_train.item())

model.train()
epoch = 15

for epoch in range(epoch):
    # Zero gradients
    optimizer.zero_grad()
    
    # Forward pass
    y_pred = model(X_train)
    
    # Calculate loss
    loss = criterion(y_pred, y_train)
    
    print('Epoch {}: perda treino: {}'.format(epoch, loss.item()))
    
    # Backward pass
    loss.backward()
    # update pass
    optimizer.step()

model.eval()
y_pred = model(X_test)
after_train = criterion(y_pred, y_test)
print('Teste - perda depois do treinamento' , after_train.item())

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Teste - perda antes do treinamento 0.73652583360672
Epoch 0: perda treino: 0.7365222573280334
Epoch 1: perda treino: 0.6476241946220398
Epoch 2: perda treino: 0.5449137091636658
Epoch 3: perda treino: 0.434844434261322
Epoch 4: perda treino: 0.3267446458339691
Epoch 5: perda treino: 0.2304207980632782
Epoch 6: perda treino: 0.1605946272611618
Epoch 7: perda treino: 0.11058821529150009
Epoch 8: perda treino: 0.07330168783664703
Epoch 9: perda treino: 0.04740724712610245
Epoch 10: perda treino: 0.030328018590807915
Epoch 11: perda treino: 0.019424108788371086
Epoch 12: perda treino: 0.012571590021252632
Epoch 13: perda treino: 0.008275724947452545
Epoch 14: perda treino: 0.005563391838222742
Teste - perda depois do treinamento 0.0038277404382824898


In [13]:

print(X_test)
print(y_pred)
print(after_train)

#Links to be evaluated
dfEdgesToEvaluate = pd.read_csv('edgesToEvaluateTest.csv')
dfEdgesToEvaluate = dfEdgesToEvaluate.rename(columns={'venue1': 'index', 'venue2':'edge'})
dfEdgesToEvaluate.set_index('index',inplace=True)

# print(dfEdgesToEvaluate)
dfNodesToCopy = dfGraphNodes.copy()
dfNodesToCopy.set_index('index',inplace=True)
# print(dfNodesToCopy)

dfToTest = pd.merge(dfEdgesToEvaluate, dfNodesToCopy, on='index')
dfToTest.reset_index(inplace=True)
dfToTest.drop('linkID',axis=1,inplace=True)

# for index,place in placesId.items():
#     dfToTest.replace({'index':{place:index}},inplace=True)
#     dfToTest.replace({'edge':{place:index}},inplace=True)

dfToTest['weight'] = 0

dfToTest.info()

tensor([[-79.3812,  43.6654],
        [-79.3798,  43.6519],
        [-79.3825,  43.6493],
        ...,
        [-79.4706,  43.6740],
        [-79.3984,  43.6545],
        [-79.3279,  43.6709]])
tensor([[0.9381],
        [0.9381],
        [0.9381],
        ...,
        [0.9382],
        [0.9381],
        [0.9380]], grad_fn=<SigmoidBackward0>)
tensor(0.0038, grad_fn=<MseLossBackward0>)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10598 entries, 0 to 10597
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        10598 non-null  object 
 1   edge         10598 non-null  object 
 2   longitude    10598 non-null  float64
 3   latitude     10598 non-null  float64
 4   categories   10598 non-null  object 
 5   stars        10598 non-null  object 
 6   name         10598 non-null  object 
 7   reviewCount  10598 non-null  object 
 8   weight       10598 non-null  int64  
dtypes: float64(2), int64(1), object(6)
mem

In [19]:
# focus = dfToTest['existEdge']
# Same column order
data = dfToTest.iloc[:,[2,3]]
# data = data.astype({"stars": float, "reviewCount": int})

# print(focus)
# print(data.info())
# print(dfGraph.columns)

Y_tensor_eval = torch.zeros(len(dfEdgesToEvaluate),1)
X_tensor_eval = torch.tensor(data.to_numpy())

print("Test data:")
print(X_tensor_eval.shape)
print(Y_tensor_eval.shape)

# Cast fields to float to avoid compatibility problems
X_tensor_eval = X_tensor_eval.float().to(device)
Y_tensor_eval = Y_tensor_eval.float().to(device)


torch.Size([49357])
torch.Size([49357, 2])
Training data:
Test data:
torch.Size([10598, 2])


In [22]:
model.eval()
y_pred = model(X_tensor_eval)
after_train = criterion(y_pred, Y_tensor_eval) 
print('Teste - usando dados do treinamento' , after_train.item())

# print(len(y_pred))
# print(y_pred)

exitData = y_pred.detach().numpy()
print(len(exitData))
print(exitData)

binaryExit = np.where(exitData > 0, 1, 0)
print(len(binaryExit))
print(binaryExit)
dfEdgesToEvaluate['link'] = binaryExit

# dfEdgesToEvaluate.set_index('linkID',inplace=True)
dfEdgesToEvaluate.drop(['edge'],axis=1,inplace=True,errors='ignore')

dfEdgesToEvaluate.to_csv("edgesToEvaluateAnswers.csv", columns=['linkID','link'],index=False) 
dfEdgesToEvaluate


Teste - usando dados do treinamento 0.8800757527351379
10598
[[0.9381275]
 [0.9381502]
 [0.9381502]
 ...
 [0.9384969]
 [0.9381043]
 [0.9382892]]
10598
[[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [1]]


Unnamed: 0_level_0,linkID,link
index,Unnamed: 1_level_1,Unnamed: 2_level_1
Qme4ZWzZEdZUwnpo-sPPIw,0,1
K2CdjES-IZDDEgetOnf0Vw,1,1
g3t2AQQVT2tWD64Huu7xVA,2,1
wVc3g-YfyDvkOLDecpr4DA,3,1
ErnZ6XLbX3aG9jz3OcyzLg,4,1
...,...,...
sEAKw3MZkER1u_1fzIeD3g,10593,1
nb2Bpt5uGGZvNhnkEGMnpw,10594,1
L-Yj3Y1bYKTSg4uE9KTVBw,10595,1
xrA53WYhRCml2bMfpQ_Hig,10596,1
