In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
from scipy import stats

In [16]:
root = "C:/users/yabec/desktop/dependencynet/"
nodes = pd.read_csv(root+'githubcode/testdata_nodes.csv')
network = pd.read_csv(root+'githubcode/testdata.xz')

In [17]:
network.head()

Unnamed: 0,poi_a,poi_b,dep,uid_a,uid_b,poilon_a,poilat_a,Taxonomy_a,cat_a,poilon_b,poilat_b,Taxonomy_b,cat_b,distance
0,38150,10998,0.004141,4951.707804,6021.219315,-1.014391,1.735001,Food,Full-Service Restaurants,-1.092607,1.692342,Entertainment,Casinos (except Casino Hotels),9.800212
1,38150,93026,0.004767,4951.707804,1789.646276,-1.014391,1.735001,Food,Full-Service Restaurants,-1.023338,1.76779,Shopping,"Beer, Wine, and Liquor Stores",3.738653
2,38150,21954,0.003133,4951.707804,826.494375,-1.014391,1.735001,Food,Full-Service Restaurants,-1.218628,1.769983,Food,Full-Service Restaurants,22.793235
3,38150,12616,0.002829,4951.707804,9950.603032,-1.014391,1.735001,Food,Full-Service Restaurants,-1.116715,1.775394,Sports,Fitness and Recreational Sports Centers,12.100898
4,38150,7470,0.009958,4951.707804,4310.283458,-1.014391,1.735001,Food,Full-Service Restaurants,-1.011438,1.717937,Food,Full-Service Restaurants,1.904939


In [18]:
nodes

Unnamed: 0,id,poilon,poilat,Taxonomy,cat
0,5,-1.049098,1.539614,City / Outdoors,Nature Parks and Other Similar Institutions
1,8,-1.123863,1.679403,Service,Offices of Real Estate Agents and Brokers
2,9,-1.133338,1.649368,Food,Full-Service Restaurants
3,26,-1.104900,1.815933,Service,Commercial Banking
4,28,-1.139759,1.654785,Service,Commercial Banking
...,...,...,...,...,...
16462,106790,-1.245000,1.773242,Office,Office
16463,106791,-1.244367,1.773475,Office,Office
16464,106825,-1.139273,1.653300,Office,Office
16465,106826,-1.139538,1.655312,Office,Office


In [12]:
def get_candidates(this,btree,nodes,node_mj_info):
    ### create candidate dataframe for 'this' nodes  
    nA = np.array(list(this[["poilon_b","poilat_b"]].values))
    dist, idx = btree.query(nA, k=tosearch, distance_upper_bound=1)
    dist[dist == np.inf] = np.nan
    dist_flat = dist.flatten()
    idx_flat  = idx.flatten()[~np.isnan(dist_flat)]
    B_flat = np.repeat(this['id'].values, tosearch)[~np.isnan(dist_flat)]
    A_flat = nodes['id'].values[idx_flat]
    candidates = pd.DataFrame({'poi_a':A_flat, 'poi_b':B_flat, 'dist':dist_flat[~np.isnan(dist_flat)]*110})
    candidates = candidates[candidates['poi_a']!=candidates['poi_b']]
    candidates = candidates[candidates['dist']>0]
    candidates = candidates.merge(node_mj_info, left_on='poi_a', right_on='id', how='left').dropna()
    candidates['log_grav'] = np.log10(candidates['num']/((l+candidates['dist'])**gamma))
    return candidates

In [30]:
tosearch = 1000
l = 0.2
gamma = 1.5

In [31]:
### make KDtree
nB = np.array(list(nodes[["poilon","poilat"]].values))
btree = cKDTree(nB)

### real network 
network = network.sort_values(by='poi_b').reset_index(drop=True)
node_deg_info = (network.groupby(['poi_b','poilon_b','poilat_b'])
                 ['poi_a'].nunique().reset_index().rename(columns={'poi_b':'id','poi_a':'degree'}))
node_totalinw = (network.groupby('poi_b')['dep'].sum().reset_index().rename(columns={'poi_b':'id','dep':'totalinw'}))
node_mj_info_a = network.groupby('poi_a')['uid_a'].first().reset_index().rename(columns={'poi_a':'id','uid_a':'num'})
node_mj_info_b = network.groupby('poi_b')['uid_b'].first().reset_index().rename(columns={'poi_b':'id','uid_b':'num'})
node_mj_info = node_mj_info_a.append(node_mj_info_b, ignore_index=True)
node_mj_info = node_mj_info.drop_duplicates(subset='id')

### get relationship between wij and grav (global dataframe)
network['loggrav'] = np.log10((network['uid_b'])/((l+network['distance'])**gamma))
network['dep_r'] = network['dep'].apply(lambda x: round(10*np.log10(x))/10)
wij_grav = network.groupby('dep_r').agg({'loggrav':['mean','std']}).reset_index()
wij_grav.columns = ['dep_r','loggrav_mean','loggrav_std']

In [38]:
cands = get_candidates(node_deg_info,btree,nodes,node_mj_info)

destnodes = []
realnodes = []
nullnodes = []
for xx,thisnode in enumerate(node_deg_info['id'].values):

    if xx%500==0:
        print(xx)

    cands_this = cands[cands['poi_b']==thisnode]    
    dep_real_thisnode = network[network['poi_b'] == thisnode]
    deprs_count = dep_real_thisnode['dep_r'].value_counts()
    for v in deprs_count.keys():
        c = deprs_count[v]
        dep_real_thisnode_thisr = dep_real_thisnode[dep_real_thisnode['dep_r']==v]
        loggrav_mean = wij_grav[wij_grav['dep_r']==v]['loggrav_mean']
        cands_this['prob'] = stats.norm.pdf(cands_this['log_grav'], loggrav_mean, 0.3)
        cands_this['prob_norm'] = cands_this['prob']/np.sum(cands_this['prob'])
        thissample = cands_this.sample(c, weights='prob_norm', axis=0)['poi_a'].values
        destnodes = np.concatenate((destnodes,dep_real_thisnode_thisr['poi_b'].values))
        realnodes = np.concatenate((realnodes,dep_real_thisnode_thisr['poi_a'].values))
        nullnodes = np.concatenate((nullnodes,thissample))
        # print('done',v,thisnode)
        
### 'poi_a' is the actual connection, 'poi_a_sample' is the null connection
df_result = pd.DataFrame({'poi_a':realnodes, 'poi_b':destnodes, 'poi_a_sample':nullnodes})

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cands_this['prob'] = stats.norm.pdf(cands_this['log_grav'], loggrav_mean, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cands_this['prob_norm'] = cands_this['prob']/np.sum(cands_this['prob'])


500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500


In [39]:
len(network),len(df_result)

(649450, 649958)

In [48]:
null = df_result.merge(network[['poi_a','poi_b','dep']], on=['poi_a','poi_b'], how='left')

In [50]:
null.to_csv(root+'githubcode/testdata_null.csv.gz', compression='xz', index=False)