In [1]:
import networkx as nx
import torch
import numpy as np
import pandas as pd
import community as community_louvain
from sklearn.model_selection import train_test_split
from itertools import product

# allCategories = pd.read_csv("categories.csv", index_col=0)

class Feedforward(torch.nn.Module):
    
        def __init__(self, input_size, hidden_size):
            super(Feedforward, self).__init__()

            self.input_size = input_size
            self.hidden_size  = hidden_size

            self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
            self.fc2 = torch.nn.Linear(self.hidden_size, self.hidden_size)
            self.fc3 = torch.nn.Linear(self.hidden_size, self.hidden_size)
            self.fc4 = torch.nn.Linear(self.hidden_size, 1)

            self.relu = torch.nn.ReLU()
            self.out_act = torch.nn.Sigmoid()


        def forward(self, x):
            output = self.fc1(x)
            output = self.relu(output)

            output = self.fc2(output)
            output = self.relu(output)

            output = self.fc3(output)
            output = self.relu(output)

            output = self.fc4(output)

            output = self.out_act(output)

            return output


In [2]:
#Network to evaluate links
GMissingEdges = nx.read_gml("GraphMissingEdges.gml")

dfGraphEdges = pd.DataFrame.from_dict(dict(GMissingEdges.edges()), orient='index')
dfGraphEdges = dfGraphEdges.reset_index()
dfGraphEdges = dfGraphEdges.rename(columns={'level_0': 'idx', 'level_1':'edge'})

print(dfGraphEdges)
print(dfGraphEdges.index)
print(dfGraphEdges.columns)

                          idx                    edge  weight
0      --DaPTJW3-tB1vP-PfdTEg  EL-iUP2pr6aJE2ZRVyNwyA       1
1      --DaPTJW3-tB1vP-PfdTEg  MhiBpIBNTCAm1Xd3WzRzjQ       1
2      --DaPTJW3-tB1vP-PfdTEg  dT70QOjn-o9pkdSAAPdSWQ       1
3      --DaPTJW3-tB1vP-PfdTEg  mzREMIknfmagJugibXrCsQ       1
4      --DaPTJW3-tB1vP-PfdTEg  zUgEycv_0a6hKu0nIkH1rA       1
...                       ...                     ...     ...
52985  zzUj3ej4vm_DtvRxNvWDEw  trzuDWvJqEIxtqjsKHCrhg       1
52986  zzf3RkMI1Y2E1QaZqeU8yA  mZRKH9ngRY92bI_irrHq6w       1
52987  zzf3RkMI1Y2E1QaZqeU8yA  s7Pj1mNYqRTGNOXLOiBafw       1
52988  zzvlwkcNR1CCqOPXwuvz2A  4Jscimulh38Rq2hOgjb2Hg       1
52989  zzvlwkcNR1CCqOPXwuvz2A  _xAJZOKBMPOe47p1MphB2w       1

[52990 rows x 3 columns]
RangeIndex(start=0, stop=52990, step=1)
Index(['idx', 'edge', 'weight'], dtype='object')


In [3]:

GMissingEdges.nodes(data=True)
# GMissingEdges.edges(data=True)

# https://stackoverflow.com/questions/35046087/make-networkx-node-attributes-into-pandas-dataframe-columns
dfGraphNodes = pd.DataFrame.from_dict(dict(GMissingEdges.nodes(data=True)), orient='index')
dfGraphNodes.drop(['name','categories'],axis=1,inplace=True)

# Calculate louvain communities
# louvainPartition = community_louvain.best_partition(GMissingEdges)
# dfLouvainPartition = pd.DataFrame.from_dict(louvainPartition, orient='index')
# dfLouvainPartition = dfLouvainPartition.rename(columns={0: 'cluster'})

# Add Louvain clusterization
# dfGraphNodes = pd.concat([dfGraphNodes, dfLouvainPartition], axis=1)
dfGraphNodes = dfGraphNodes.reset_index()

print(dfGraphNodes, dfGraphEdges)

                        index  longitude   latitude stars reviewCount
0      DHCdMpffUncZWxaiYNHSZw -79.434315  43.646220   4.0           4
1      huCf4kwsoGl1YUHCjMJG5A -79.397848  43.631814   2.5           3
2      a6FJ9HcERvtGF4PYILF_fA -79.378986  43.654590   2.5          63
3      b8cwL5L3241tOcqXywEfLw -79.353691  43.683799   3.0           8
4      YQ_z9iDgdNjwJhZ-owHSjA -79.396689  43.674244   2.5           7
...                       ...        ...        ...   ...         ...
11075  q3bkTWv854XTLXq1F4pnKg -79.409645  43.645954   4.0          61
11076  8pGD3zt6HEL2xzaT3lqMFQ -79.408460  43.642893   4.5          12
11077  iByQmTmTdO7hP4n1grSSWQ -79.422871  43.662417   4.0          37
11078  lkM72Y21bjBqUGaW7iL7tQ -79.293509  43.803568   3.0          83
11079  vUef2kuyYWG7phLySoRJGw -79.357862  43.676379   4.0          22

[11080 rows x 5 columns]                           idx                    edge  weight
0      --DaPTJW3-tB1vP-PfdTEg  EL-iUP2pr6aJE2ZRVyNwyA       1
1      --D

In [4]:
dfGraph = pd.merge(dfGraphNodes, dfGraphEdges, how="right", right_on=["idx"], left_on=["index"])
dfGraph.drop(['idx'],axis=1,inplace=True)
# dfGraph = dfGraph.set_index("index")

print(dfGraph)
print(dfGraph.info())
print(dfGraph.index)

# dfGraph.to_csv("dfGraph.csv")

                        index  longitude   latitude stars reviewCount  \
0      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807   3.5          49   
1      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807   3.5          49   
2      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807   3.5          49   
3      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807   3.5          49   
4      --DaPTJW3-tB1vP-PfdTEg -79.444674  43.677807   3.5          49   
...                       ...        ...        ...   ...         ...   
52985  zzUj3ej4vm_DtvRxNvWDEw -79.402828  43.643715   3.0         114   
52986  zzf3RkMI1Y2E1QaZqeU8yA -79.370983  43.651883   4.5          33   
52987  zzf3RkMI1Y2E1QaZqeU8yA -79.370983  43.651883   4.5          33   
52988  zzvlwkcNR1CCqOPXwuvz2A -79.393727  43.655822   3.5           7   
52989  zzvlwkcNR1CCqOPXwuvz2A -79.393727  43.655822   3.5           7   

                         edge  weight  
0      EL-iUP2pr6aJE2ZRVyNwyA       1  
1      MhiBpIBNTCAm1Xd3WzRzjQ       1  
2  

In [5]:
dfEdgesToEvaluate = pd.read_csv('edgesToEvaluate.csv')
dfEdgesToEvaluate.drop(['linkID'], axis=1, inplace=True)
dfEdgesToEvaluate.rename(columns={'venue1': 'index', 'venue2':'edge'},inplace=True)

# Generate zero cases to help test
combinedList = list(product(dfEdgesToEvaluate['index'], dfEdgesToEvaluate['edge']))
# print(combinedList)

dfT = pd.DataFrame(combinedList)
dfT.rename(columns={0: 'index', 1:'edge'},inplace=True)

dfGraph = pd.merge(dfT, dfGraph, how="outer", on=["index", "edge"])
dfGraph.drop(['longitude','latitude','stars','reviewCount'], axis=1, inplace=True)

# Remove edges to evaluate
dfGraph = pd.concat([dfGraph, dfEdgesToEvaluate])
dfGraph.drop_duplicates(subset=["index","edge"],keep=False, inplace=True)

dfGraph = pd.merge(dfGraph, dfGraphNodes, how="inner", on="index")

# weight = 0 and existEdge = False
dfGraph['weight'] = dfGraph['weight'].fillna(0)
dfGraph['existEdge'] = np.where((dfGraph.weight > 0 ), 'True', 'False')
# print(dfGraph.info())
print(dfGraph)

dfGraph = dfGraph.astype({"stars": float, "reviewCount": int, "existEdge": bool})

dfGraph.to_csv("resultTest.csv")

                         index                    edge  weight  longitude  \
0       klu0zF1rWAoNAhKPsFyUog  oQFMJqDwNXbNMRbcmIYRYg     0.0 -79.363541   
1       klu0zF1rWAoNAhKPsFyUog  egLYFnycp8ktxMCvilFdLw     0.0 -79.363541   
2       klu0zF1rWAoNAhKPsFyUog  Nxg73OigmRQQq0d1pKtkUQ     0.0 -79.363541   
3       klu0zF1rWAoNAhKPsFyUog  hyXNS3tSmi6njhBjgo8eGw     0.0 -79.363541   
4       klu0zF1rWAoNAhKPsFyUog  Sflaxtv6SR0lgbL7-pIGPQ     0.0 -79.363541   
...                        ...                     ...     ...        ...   
257159  zzUj3ej4vm_DtvRxNvWDEw  trzuDWvJqEIxtqjsKHCrhg     1.0 -79.402828   
257160  zzf3RkMI1Y2E1QaZqeU8yA  mZRKH9ngRY92bI_irrHq6w     1.0 -79.370983   
257161  zzf3RkMI1Y2E1QaZqeU8yA  s7Pj1mNYqRTGNOXLOiBafw     1.0 -79.370983   
257162  zzvlwkcNR1CCqOPXwuvz2A  4Jscimulh38Rq2hOgjb2Hg     1.0 -79.393727   
257163  zzvlwkcNR1CCqOPXwuvz2A  _xAJZOKBMPOe47p1MphB2w     1.0 -79.393727   

         latitude stars reviewCount existEdge  
0       43.709978   3.5    

In [6]:
# Replace place ids to integers ids
placesId = dfGraphNodes['index'].to_dict()
# print(placesId)

# for index,place in placesId.items():
#     dfGraph.replace({'index':{place:index}},inplace=True)
#     dfGraph.replace({'edge':{place:index}},inplace=True)

dfGraph


Unnamed: 0,index,edge,weight,longitude,latitude,stars,reviewCount,existEdge
0,klu0zF1rWAoNAhKPsFyUog,oQFMJqDwNXbNMRbcmIYRYg,0.0,-79.363541,43.709978,3.5,104,True
1,klu0zF1rWAoNAhKPsFyUog,egLYFnycp8ktxMCvilFdLw,0.0,-79.363541,43.709978,3.5,104,True
2,klu0zF1rWAoNAhKPsFyUog,Nxg73OigmRQQq0d1pKtkUQ,0.0,-79.363541,43.709978,3.5,104,True
3,klu0zF1rWAoNAhKPsFyUog,hyXNS3tSmi6njhBjgo8eGw,0.0,-79.363541,43.709978,3.5,104,True
4,klu0zF1rWAoNAhKPsFyUog,Sflaxtv6SR0lgbL7-pIGPQ,0.0,-79.363541,43.709978,3.5,104,True
...,...,...,...,...,...,...,...,...
257159,zzUj3ej4vm_DtvRxNvWDEw,trzuDWvJqEIxtqjsKHCrhg,1.0,-79.402828,43.643715,3.0,114,True
257160,zzf3RkMI1Y2E1QaZqeU8yA,mZRKH9ngRY92bI_irrHq6w,1.0,-79.370983,43.651883,4.5,33,True
257161,zzf3RkMI1Y2E1QaZqeU8yA,s7Pj1mNYqRTGNOXLOiBafw,1.0,-79.370983,43.651883,4.5,33,True
257162,zzvlwkcNR1CCqOPXwuvz2A,4Jscimulh38Rq2hOgjb2Hg,1.0,-79.393727,43.655822,3.5,7,True


In [7]:
# Generate data without edges to help in test cases

device = "cuda" if torch.cuda.is_available() else "cpu"
device = 'cpu'
print("Using {}".format(device))

focus = dfGraph['existEdge']
data = dfGraph.iloc[:,3:6]
# data = data.astype({"stars": float, "reviewCount": int})

# print(focus)
# print(data.info())
# print(dfGraph.columns)

Y_tensor = torch.tensor(focus)
X_tensor = torch.tensor(data.to_numpy())
print(Y_tensor.shape)
print(X_tensor.shape)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, Y_tensor, test_size = 0.20, random_state=5)

print("Training data:")
print(X_train.shape)
print(y_train.shape)

print("Test data:")
print(X_test.shape)
print(y_test.shape)

# Cast fields to float to avoid compatibility problems
X_train = X_train.float().to(device)
y_train = y_train.float().to(device)
X_test = X_test.float().to(device)
y_test = y_test.float().to(device)

Using cpu
torch.Size([257164])
torch.Size([257164, 3])
Training data:
torch.Size([205731, 3])
torch.Size([205731])
Test data:
torch.Size([51433, 3])
torch.Size([51433])


In [8]:
#input = 8 (número de features), e hidden size = 20 (número de neurôneos na camada escondida)
num_features = X_train.shape[1]
print("num_features: "+str(num_features))

model = Feedforward(num_features, 20).to(device)
print(model)

criterion = torch.nn.MSELoss()
# criterion = torch.nn.BCEWithLogitsLoss()

# lr = Learning rate
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
# optimizer = torch.optim.SGD(model.parameters(), lr = 0.001)

num_features: 3
Feedforward(
  (fc1): Linear(in_features=3, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=20, bias=True)
  (fc3): Linear(in_features=20, out_features=20, bias=True)
  (fc4): Linear(in_features=20, out_features=1, bias=True)
  (relu): ReLU()
  (out_act): Sigmoid()
)


In [9]:
model.eval()
y_pred = model(X_test)
before_train = criterion(y_pred, y_test) 
print('Teste - perda antes do treinamento' , before_train.item())

model.train()
epoch = 50

for epoch in range(epoch):
    # Zero gradients
    optimizer.zero_grad()
    
    # Forward pass
    y_pred = model(X_train)
    
    # Calculate loss
    loss = criterion(y_pred, y_train)
    
    print('Epoch {}: perda treino: {}'.format(epoch, loss.item()))
    
    # Backward pass
    loss.backward()
    # update pass
    optimizer.step()

model.eval()
y_pred = model(X_test)
after_train = criterion(y_pred, y_test)
print('Teste - perda depois do treinamento' , after_train.item())

  return F.mse_loss(input, target, reduction=self.reduction)


Teste - perda antes do treinamento 0.03223619610071182


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: [enforce fail at CPUAllocator.cpp:68] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 169300977444 bytes. Error code 12 (Cannot allocate memory)

In [None]:

print(X_test)
print(y_pred)
print(after_train)

#Links to be evaluated
dfEdgesToEvaluate = pd.read_csv('edgesToEvaluate.csv')
dfEdgesToEvaluate = dfEdgesToEvaluate.rename(columns={'venue1': 'index', 'venue2':'edge'})
dfEdgesToEvaluate.set_index('index',inplace=True)

# print(dfEdgesToEvaluate)
dfNodesToCopy = dfGraphNodes.copy()
dfNodesToCopy.set_index('index',inplace=True)
# print(dfNodesToCopy)

dfToTest = pd.merge(dfEdgesToEvaluate, dfNodesToCopy, on='index')
dfToTest.reset_index(inplace=True)
dfToTest.drop('linkID',axis=1,inplace=True)

for index,place in placesId.items():
    dfToTest.replace({'index':{place:index}},inplace=True)
    dfToTest.replace({'edge':{place:index}},inplace=True)

dfToTest['weight'] = 0

dfToTest.info()

In [None]:
# focus = dfToTest['existEdge']
# Same column order
data = dfToTest.iloc[:,2:6]
data = data.astype({"stars": float, "reviewCount": int})

# print(focus)
# print(data.info())
# print(dfGraph.columns)

Y_tensor_eval = torch.zeros(500,1)
X_tensor_eval = torch.tensor(data.to_numpy())
print(Y_tensor.shape)
print(X_tensor.shape)

# X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(X_tensor_eval, Y_tensor_eval, test_size = 0.9, random_state=5)

print("Training data:")
# print(X_train_eval.shape)
# print(y_train_eval.shape)

print("Test data:")
print(X_tensor_eval.shape)
# print(y_test_eval.shape)

# Cast fields to float to avoid compatibility problems
# X_train_eval = X_train_eval.float().to(device)
# y_train_eval = y_train_eval.float().to(device)
X_tensor_eval = X_tensor_eval.float().to(device)
Y_tensor_eval = Y_tensor_eval.float().to(device)


In [None]:
model.eval()
y_pred = model(X_tensor_eval)
after_train = criterion(y_pred, Y_tensor_eval) 
print('Teste - usando dados do treinamento' , after_train.item())

# print(len(y_pred))
# print(y_pred)

exitData = y_pred.detach().numpy()
print(len(exitData))
print(exitData)

binaryExit = np.where(exitData > 0.1, 1, 0)
print(len(binaryExit))
print(binaryExit)
dfEdgesToEvaluate['link'] = binaryExit

# dfEdgesToEvaluate.set_index('linkID',inplace=True)
dfEdgesToEvaluate.drop(['edge'],axis=1,inplace=True,errors='ignore')

dfEdgesToEvaluate.to_csv("edgesToEvaluateAnswers.csv", columns=['linkID','link'],index=False) 
dfEdgesToEvaluate
