# Network Creation

In [None]:
%load_ext autoreload
%autoreload

#%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

import networkx as nx
from networkTrips import organizeTrips
from timeUtils import clock, elapsed, getDateTime
from ioUtils import loadJoblib, saveFile, getFile
from fsUtils import mkDir, mkSubDir, setFile, setSubDir
from pandasUtils import getRowData, getColData, dropColumns
from geoUtils import convertMetersToLat, convertLatToMeters, convertMetersToLong, convertLongToMeters
from geocluster import geoClusters, geoCluster
from geoclusterUtils import genCenters, genCluster, genClusters, genTripsBetweenClusters
from networkClusterMaps import foliumMap
from networkOutput import printNetwork
from driverNetwork import driverNetwork
from networkFeatures import networkFeatures

import pandas as pd

# Show up to 1000 rows before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 1000)
# Use the fully qualified option key: the bare 'precision' pattern is ambiguous
# on recent pandas (it also matches the styler precision option) and raises
# OptionError there, while 'display.precision' works on old and new versions.
pd.set_option("display.precision", 3)

import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings raised by later cells -- confirm that a blanket ignore is intended.
warnings.filterwarnings('ignore')

# Print a "Last Run" timestamp; both values returned by clock() are discarded.
_, _ = clock("Last Run")

In [2]:
# Root directory under which per-device results are stored by later cells.
# NOTE(review): hard-coded absolute, user-specific path -- the notebook will not
# run unchanged on another machine; consider making this configurable.
savedir = "/Users/tgadf/Downloads/network"
# mkDir is the last expression of the cell, so its return value is echoed as output.
mkDir(savedir)

'/Users/tgadf/Downloads/network'

# Load/Generate Data

In [3]:
#######################################################################################
# Generate Clustered Data
#
# When genData is True, synthetic trips between generated clusters are built and
# stored in `gpsdata`; otherwise `gpsdata` is loaded from a joblib file on disk.
#######################################################################################
genData = False
if genData:
    # Generation parameters, passed to the geocluster helpers below.
    # NOTE(review): exact meaning/units of genMax and distMax are defined by
    # genClusters / geoClusters -- confirm against those modules.
    cls     = 20
    total   = 500
    genMax  = 75
    distMax = 500
    # NOTE(review): the second positional argument (250) is a magic number -- confirm its meaning.
    raw  = genClusters(cls, 250, latRange=[29.8, 30.2], lngRange=[49.8, 50.2], dist="gauss", maxrad=genMax)
    gc   = geoClusters(key="dummy", points=raw, distMax=distMax, debug=False)
    gc.findClusters(seedMin=2, debug=False)
    df   = genTripsBetweenClusters(n=total, gc=gc, returnDF=True)
    df["device"] = "dummy"    
    
    # NOTE(review): this path lives under /Users/tgadfort while every other path
    # in the notebook uses /Users/tgadf -- confirm which one is correct.
    # Both sample() calls are unseeded, so the generated data differs between runs.
    tmpdf = loadJoblib("/Users/tgadfort/Downloads/r4hIDs.p").sample(n=total, replace=True)
    tojoin = tmpdf.sample(cls)
    # Label the sampled rows cl0 .. cl{cls-1} so they can be joined on cluster name.
    tojoin["cl"] = ["cl{0}".format(x) for x in range(cls)]

    # Join on the trip's cl0 column, after dropping the columns whose names start with "Geo1".
    df['cl'] = df['cl0']
    drops = [x for x in tojoin.columns if x.startswith("Geo1")]
    tojoinCL0 = dropColumns(tojoin, columns=drops, inplace=False)
    test = df.merge(tojoinCL0, on='cl')

    # Same join keyed on the trip's cl1 column, this time dropping the "Geo0" columns.
    test['cl'] = test['cl1']
    drops = [x for x in tojoin.columns if x.startswith("Geo0")]
    tojoinCL1 = dropColumns(tojoin, columns=drops, inplace=False)
    test = test.merge(tojoinCL1, on='cl')

    gpsdata = test
    # The return value is not used here.
    # NOTE(review): presumably dropColumns modifies `gpsdata` in place by default -- confirm.
    dropColumns(gpsdata, columns=["cl", "cl0", "cl1"])
    # Replace the literal string 'nan' with 0 in place.
    gpsdata.replace('nan', 0, inplace=True)
else:
    # NOTE(review): hard-coded absolute, user-specific input path.
    fname = "/Users/tgadf/Downloads/gpsTripsOakRidge.p"
    print("Loading {0}".format(fname))
    gpsdata = loadJoblib(fname)    

# Print a "Last Run" timestamp for this cell.
_, _ = clock("Last Run")

Loading /Users/tgadf/Downloads/gpsTripsOakRidge.p
Current Time is Wed Dec 12, 2018 10:24:11 for Last Run


## Show Data (if needed)

## Subselect (if needed)

In [4]:
# Keep only the trips recorded by one device; `debug` switches on the timing
# printouts in the clustering cell below.
device  = '352252060173789'
debug   = True

# Boolean-mask selection on the 'device' column (overwrites `gpsdata`).
gpsdata = gpsdata.loc[gpsdata['device'] == device]
print("Keeping {0} rows".format(len(gpsdata.index)))

_, _ = clock("Last Run")

Keeping 3066 rows
Current Time is Wed Dec 12, 2018 10:24:11 for Last Run


# Cluster and Sort Trips

In [5]:
# For every device: pool the trip start/end coordinates, build geo clusters from
# them, tag each trip with its nearest start/end cluster, and organize the trips.
#
# NOTE: `gc`, `df`, `device` and `trips` are rebound on every iteration, so after
# the loop they hold the values for the LAST device only. Later cells rely on that.
i  = 0  # stays 0 (and defined) when there are no devices at all
nd = gpsdata['device'].nunique()
# Count runs from 1 so the progress line reads 1/nd .. nd/nd rather than 0/nd.
for i, (device, df) in enumerate(gpsdata.groupby('device'), start=1):
    print('Key = {0}'.format(device),'\tRun = {0}/{1}'.format(i,nd),'\tTrips = {0}'.format(df.shape[0]))

    # Work on an independent copy: the geo0/geo1 columns are added below, and
    # assigning into a groupby slice is a chained-assignment hazard.
    df = df.copy()

    #######################################################################################
    # Cluster Geo Data (Lat, Long)
    #######################################################################################
    # Stack trip start points and end points into one (lat, long) frame.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
    # append, it keeps the original row index of both pieces.
    points = pd.concat([
        df[["lat0", "long0"]].rename(columns={"lat0": "lat", "long0": "long"}),
        df[["lat1", "long1"]].rename(columns={"lat1": "lat", "long1": "long"}),
    ])



    #######################################################################################
    # Create Clusters
    #######################################################################################
    gc   = geoClusters(key="dummy", points=points, distMax=200, mergeFactor=2.5, debug=False)
    gc.createCells(debug=False)
    gc.createProtoClusters(seedMin=4, debug=False)
    gc.createSeedlessClusters(seedMin=2, debug=False)
    gc.mergeClusters(debug=False)
    print("Found {0} geo clusters".format(gc.getNClusters()))



    #######################################################################################
    # Set Nearest Clusters
    #######################################################################################
    # getNearestClusters is applied row by row; only the first element of each
    # result is stored as the trip's start (geo0) / end (geo1) cluster.
    if debug:
        start, cmt = clock("Finding Nearest Clusters for Start of Trips")
    geoResults = df[['lat0', 'long0']].apply(gc.getNearestClusters, axis=1).values
    df["geo0"] = [x[0] for x in geoResults]
    if debug:
        elapsed(start, cmt)
        start, cmt = clock("Finding Nearest Clusters for End of Trips")
    geoResults = df[['lat1', 'long1']].apply(gc.getNearestClusters, axis=1).values
    df["geo1"] = [x[0] for x in geoResults]
    if debug:
        elapsed(start, cmt)


    #######################################################################################
    # Organize Trips for Network
    #######################################################################################
    trips = organizeTrips(df=df, gc=gc, debug=True, requireGood=False)

Key = 352252060173789 	Run = 0/1 	Trips = 3066
Found 375 geo clusters
Current Time is Wed Dec 12, 2018 10:24:14 for Finding Nearest Clusters for Start of Trips
Current Time is Wed Dec 12, 2018 10:24:21 for Done with Finding Nearest Clusters for Start of Trips
Process [Done with Finding Nearest Clusters for Start of Trips] took 7 seconds.
Current Time is Wed Dec 12, 2018 10:24:21 for Finding Nearest Clusters for End of Trips
Current Time is Wed Dec 12, 2018 10:24:29 for Done with Finding Nearest Clusters for End of Trips
Process [Done with Finding Nearest Clusters for End of Trips] took 7 seconds.
All Trips:     3066
Deriving Home From Daily Visits, Overnight Stays, Dwell Times, and Common Location
There are 376 possible home clusters
There are 54 possible home clusters with at least two hours of dwell time
There are 38 possible home clusters with at least ten daily visits
There are 5 possible home clusters with overnight stays
Selecting cl0 as the home cluster with significance 109.8, 

# Saved Data

In [None]:
# Persist the clustering results for this device, or restore them from disk.
# `device`, `gc` and `trips` are the values left behind by the clustering loop.
deviceDir = mkSubDir(savedir, device)
tripsfile = setFile(deviceDir, "trips.p")
gcfile    = setFile(deviceDir, "gc.p")

# Set to True to reload previously saved results instead of writing new ones.
loadTrips = False
if not loadTrips:
    print("Saving to {0}".format(deviceDir))
    saveFile(ifile=gcfile, idata=gc)
    saveFile(ifile=tripsfile, idata=trips)
else:
    trips = getFile(tripsfile)
    gc    = getFile(gcfile)

_, _ = clock("Last Run")

In [None]:
# Show data if needed.
# `df` is the per-device frame left over from the clustering loop (last device
# processed), including the geo0/geo1 columns added there.
df.head()

# Driver Network

In [7]:
%load_ext autoreload
%autoreload

from driverNetwork import driverNetwork
from edgeInfo import edgeInfo
from vertexInfo import vertexInfo
from networkCategories import categories
from networkAlgos import networkAlgos

dn = driverNetwork(trips)
dn.create(debug=True)
if True:
    dn.computeNetworkAttrs(debug=True, level=1)
    dn.fillVertexCensusData(debug=True)
    dn.fillVertexGeospatialData(debug=True)
    dn.fillVertexInternalData(debug=True)
    dn.fillVertexNetworkData(debug=True)
    dn.fillEdgeInternalData(debug=True)
    dn.fillEdgeVertexData(debug=True)
    dn.fillEdgeNetworkData(debug=True)
g = dn.getNetwork()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Creating a driver network with 376 vertices and 1031 edges.
Creating Network Attributes
Updating Vertex Attributes
Updating Vertices/Edges
Ordering Edges by Weight
Ordering Vertices by Centrality
Flattening Vertices/Edges
Collecting Edge Attributes
Flattening Vertex Attributes
Collecting Vertices/Edges
Collecting Edge Attributes
Collecting Vertex Attributes
Creating Vertex Attrs DataFrame
Cleaning Vertex Attribute Names
Creating Edge Attrs DataFrame
Cleaning Edge Attribute Names
Computing Network Attributes (simple)
Computing Network Algorithms
Running network algorithms
Creating Algorithm Results DataFrame for Vertices
Creating Algorithm Results DataFrame for Edges
  Created 30 attributes for 376 vertices
  Created 2 attributes for 1031 edges
  Created 46 attributes for the network
Filling Vertex Census Data
Filling Vertex Geospatial Data
Filling Vertex Internal Data
Filling Vertex Network Data
Fil

In [8]:
if True:
    %load_ext autoreload
    %autoreload

    from networkOutput import printNetwork
    pn = printNetwork(dn)
    pn.printVertices(minN=40)
    pn.printEdges(minW=20)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



#   Cl   N    Home Active DayWeek Dwell    Place               State               Cliques Cluster Degree  DCentral ECentral ShortPath PageRank
--- ---  ---  ---  ---    ---     ---      ---                 ---                 ---     ---     ---     ---      ---      ---       ---     
0   cl0  1721 ***  Daily  Week    VeryHigh Loudon              Tennessee           2.0     0.667   79.0    0.011    0.045    3.707     0.002   
1   cl1  587       Daily  Week    VeryLow  Loudon              Tennessee           3.0     0.167   3.25    0.011    0.002    4.641     0.003   
2   cl4  236       Weekly Week    High     Oak Ridge           Tennessee           2.0     0.0     3.0     0.005    0.001    5.301     0.002   
3   cl7  246       Weekly Week    Low      Loudon              Tennessee           1.0     1.0     145.0   0.005    0.037    3.718     0.001   
4   cl3  89        MonthlyWeek    Mid      Le

# Network Features

In [13]:
%load_ext autoreload
%autoreload

from networkFeatures import networkFeatures

nf = networkFeatures(dn)

## Vertex Counts
nf.fillVertexCensusCounts(debug=True)
nf.fillVertexInternalCounts(debug=True)
nf.fillVertexGeoSpatialCounts(debug=True)
nf.fillVertexProperties(debug=True)

## Edge Counts
nf.fillEdgeInternalCounts(debug=True)
nf.fillEdgeCensusCounts(debug=True)
nf.fillEdgeGeoSpatialCounts(debug=True)
nf.fillEdgeProperties(debug=True)

## Network Counts
nf.fillNetworkFeatures(debug=True)

## Home Counts
nf.fillHomeFeatures(debug=True)

## Indiv Vertex/Edge Values
#nf.fillIndividualVertexFeatures(debug=True)
#nf.fillIndividualEdgeFeatures(debug=True)

## Vertex/Edge Correlations
##nf.fillVertexFeatureCorrelations(debug=True)
#nf.fillEdgeFeatureCorrelations(debug=True)
_,_ = clock("Last Run")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Filling Vertex Census Counts
  Filling Vertex Census Counts
Filling Vertex Internal Counts
  Filling Vertex Internal Counts for 34 Cutoff Values
Filling Vertex GeoSpatial Counts
  Filling Vertex GeoSpatial Counts for 0 Cutoff Values
Filling Vertex Properties
  Filled Vertex Properties for 43 Attributes
Filling Edge Internal Counts
  Filling edge Internal Counts for 29 Cutoff Values
Filling Edge Census Counts
  Filling Edge Census Counts
Filling edge GeoSpatial Counts
  Filling edge GeoSpatial Counts for 106 Cutoff Values
Filling Edge Properties
  Filled Edge Properties for 11 Attributes
Filling Network Features
  Filled 46 Network Features
Filling Home Vertex Features
  Filled 178 Home Vertex Features
Current Time is Wed Dec 12, 2018 10:36:13 for Last Run


In [None]:
# Echo whatever nf.getFeatures() returns as the cell output.
# NOTE(review): presumably the raw feature collection filled by the cell above -- confirm.
nf.getFeatures()

In [14]:
# Display the feature DataFrame transposed, so each feature name is a row.
nf.getFeatureDataFrame().T

Unnamed: 0,0
VertexCensusCountsCbsaN,23
VertexCensusCountsCbsaMostCommon,"Knoxville, TN"
VertexCensusCountsCbsaMostCommonFraction,0.636
VertexCensusCountsCbsaTypeN,3
VertexCensusCountsCbsaTypeMostCommon,Metro
VertexCensusCountsCbsaTypeMostCommonFraction,0.71
VertexCensusCountsCbsaPopN,5
VertexCensusCountsCbsaPopMostCommon,Big
VertexCensusCountsCbsaPopMostCommonFraction,0.644
VertexCensusCountsCbsaHousingN,5
