In [1]:
import pandas as pd
import numpy as np
import os
import networkx as nx
import netcomp as nc
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import mean_squared_error
import pycombo
# from netAPI import visualizePartitionShape
# from netAPI import getComboPartition
# from netAPI import getComboSeries
import geopandas as gpd
import matplotlib.pyplot as plt

## Community detection

In [2]:
def makeGraphfromDf(df,oCol, dCol, weight):
    """Build a weighted directed graph from an origin-destination table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per edge.
    oCol, dCol : str
        Names of the columns holding the source and target node of each edge.
    weight : str
        Name of the column whose value is stored as the edge's ``weight``
        attribute.

    Returns
    -------
    networkx.DiGraph
        Graph with one edge per distinct (origin, destination) pair. When a
        pair occurs in several rows, the weight of the last such row is kept.
    """
    G = nx.DiGraph()
    # Iterate the three columns positionally instead of looking rows up by
    # index label: this is row-order equivalent, avoids chained indexing, and
    # still works when the DataFrame index contains duplicate labels.
    for origin, destination, edgeWeight in zip(df[oCol], df[dCol], df[weight]):
        G.add_edge(origin, destination, weight=edgeWeight)
    return G

In [3]:
def reClass(commDict):
    """Relabel community ids so that they run consecutively from 0.

    Parameters
    ----------
    commDict : dict
        Maps each zone to an integer community label.

    Returns
    -------
    dict
        Same keys as ``commDict``; each value is the rank of the zone's
        original label among the sorted distinct labels (0 for the smallest
        label, 1 for the next, ...). An empty input yields an empty dict.

    Notes
    -----
    NOTE(review): the previous body read from the empty output dict (raising
    KeyError on any non-empty input) and returned the function object itself.
    Consecutive relabelling is the presumed intent; confirm with the author.
    """
    zones = list(commDict.keys())
    labels = np.fromiter((commDict[zone] for zone in zones), dtype=int,
                         count=len(zones))
    # ``inverse[i]`` is the position of labels[i] within the sorted unique
    # labels, i.e. the new consecutive community id.
    _, inverse = np.unique(labels, return_inverse=True)
    return {zone: int(newLabel) for zone, newLabel in zip(zones, inverse)}

In [4]:
def fixedARI(group1, group2):
    """Adjusted Rand index of two labelings, computed in floating point.

    The pair confusion matrix of ``group1`` versus ``group2`` is obtained from
    scikit-learn and its four counts are converted to Python floats before any
    products are formed (presumably to avoid integer overflow on very large
    inputs; TODO confirm).

    Returns 1.0 when there are no disagreeing pairs (fn == fp == 0); otherwise
    2 * (tp*tn - fn*fp) / ((tp + fn)*(fn + tn) + (tp + fp)*(fp + tn)).
    """
    from sklearn.metrics import pair_confusion_matrix

    confusion = pair_confusion_matrix(group1, group2)
    # Row-major layout of the 2x2 matrix is [[tn, fp], [fn, tp]].
    tn, fp, fn, tp = (float(count) for count in confusion.ravel())

    # No disagreeing pairs at all: the two labelings are identical partitions.
    if fp == 0 and fn == 0:
        return 1.0

    numerator = 2. * (tp * tn - fn * fp)
    denominator = (tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)
    return numerator / denominator

In [5]:
def weightedAdjustedRandIndex(df,oCol, dCol,groundTruthCol, predCol,maxcom):
    """Compare the community structure of observed and predicted flow networks.

    Two directed graphs are built from ``df`` (one weighted by
    ``groundTruthCol``, one by ``predCol``) and each is partitioned with
    ``pycombo.execute``. Every zone's community label is then repeated once
    per unit of outgoing flow from that zone, so zones with larger outflow
    count proportionally more in the agreement scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Origin-destination table, one row per (origin, destination) pair.
    oCol, dCol : str
        Origin and destination column names.
    groundTruthCol, predCol : str
        Columns holding the observed and the predicted flow.
    maxcom : int
        Forwarded to ``pycombo.execute`` as ``max_communities``. Callers in
        this notebook pass 0 (presumably "no limit"; confirm against the
        pycombo documentation).

    Returns
    -------
    tuple
        ``(mse, ari, nmi)``: mean squared error between the two flow columns,
        the flow-weighted adjusted Rand index, and the flow-weighted
        normalized mutual information of the two partitions.

    Notes
    -----
    Zone weights are the per-origin sum of the ``'flowReal'`` column when the
    frame has one (the historical behaviour), otherwise of ``groundTruthCol``.
    Zones that never appear as an origin have no weight and are left out of
    the ARI/NMI computation.
    """
    GTruth = makeGraphfromDf(df, oCol, dCol, groundTruthCol)
    GPred = makeGraphfromDf(df, oCol, dCol, predCol)

    # Group by the caller-supplied origin column (previously hard-coded to
    # 'origin') and truncate the summed flow to whole units, since it is used
    # as a repetition count below.
    weightCol = 'flowReal' if 'flowReal' in df.columns else groundTruthCol
    weight = df.groupby(oCol)[weightCol].sum().astype(int).to_dict()

    groundTruthComm, _ = pycombo.execute(GTruth, max_communities = maxcom)
    predictComm, _ = pycombo.execute(GPred, max_communities = maxcom)

    zones = [zone for zone in sorted(groundTruthComm.keys()) if zone in weight]
    # Negative totals contribute nothing, matching list replication semantics.
    repeats = np.array([max(weight[zone], 0) for zone in zones], dtype=np.int64)
    groundTruthCommWeighted = np.repeat(
        np.array([groundTruthComm[zone] for zone in zones], dtype=np.float64),
        repeats)
    predictCommWeighted = np.repeat(
        np.array([predictComm[zone] for zone in zones], dtype=np.float64),
        repeats)

    mse = mean_squared_error(df[groundTruthCol],df[predCol])
    ari = fixedARI(groundTruthCommWeighted,predictCommWeighted)
    nmi = normalized_mutual_info_score(groundTruthCommWeighted,predictCommWeighted)
    return(mse,ari,nmi)

In [6]:
def plotMetrics(cities,resultList,modelList,metricLoc=0):
    """Draw a grouped bar chart of one metric per city, one bar per model.

    Parameters
    ----------
    cities : sequence of str
        Tick labels for the x axis, one per group of bars.
    resultList : sequence of dict
        One dict per model. Its values, taken in the dict's own order, are
        per-city metric sequences; they are assumed to be in the same order
        as ``cities``.
    modelList : sequence of str
        Legend label for each entry of ``resultList``.
    metricLoc : int, default 0
        Position of the metric inside each per-city sequence:
        0 = MSE, 1 = ARI, 2 = NMI. Any other value raises ``KeyError``.

    Returns
    -------
    None
        The figure is left as the current matplotlib figure.
    """
    modelAmount = len(modelList)
    # Index 0 is the mean squared error: results are stored as [mse, ari, nmi].
    metric = {0:'MSE',1:'ARI',2:'NMI'}
    metricName = metric[metricLoc]

    x = np.arange(len(cities))  # the label locations
    width = 0.2  # the width of the bars

    fig, ax = plt.subplots(figsize=(16,8))

    for i in range(modelAmount):
        heights = np.array(list(resultList[i].values()))[:,metricLoc]
        # Offset each model's bars so that the whole group is centred on x.
        ax.bar(x-(modelAmount-1)/2*width+i*width, heights, width=width,
               label=modelList[i])

    ax.set_xticks(x)
    ax.set_xticklabels(cities, rotation=70)
    ax.set_ylabel(metricName)
    ax.legend(loc='best',bbox_to_anchor=(0.5, 0., 0.5, 0.5))

In [7]:
# Cities covered by the analysis. Each name is also used as the file stem of
# the per-city CSV inputs ('<folder>/<city>.csv').
cities = [
    'New York City',
    'Los Angeles',
    'Chicago',
    'Houston',
    'Boston',
    'Phoenix',
    'Philadelphia',
    'San Antonio',
    'San Diego',
    'Dallas',
    'San Jose',
    'Austin',
]

In [11]:
seed = 2017
np.random.seed(seed)

# One result dict per model variant, keyed by city name. Each value is the
# [mse, ari, nmi] list produced by weightedAdjustedRandIndex for that city.
NoconstrainPowerLaw = {}
NoconstrainFullPowerLaw = {}
NoconstrainBuckedPowerLaw = {}
NoconstrainBuckedFullPowerLaw = {}
classicConstrainDist = {}

# (input folder, result dict) pairs, listed in the order they are evaluated.
# Each folder holds one '<city>.csv' of observed and predicted flows.
modelOutputs = [
    ('unconstrainCTPowerlaw', NoconstrainPowerLaw),
    ('unconstrainCTFullPowerlaw', NoconstrainFullPowerLaw),
    ('constrainCTdistbinsAB', classicConstrainDist),
    ('unconstrainCTBuckedPowerlaw', NoconstrainBuckedPowerLaw),
    ('unconstrainCTBuckedFullPowerlaw', NoconstrainBuckedFullPowerLaw),
]

for city in cities:
    print(city)
    for modelFolder, modelResults in modelOutputs:
        total = pd.read_csv('../%s/%s.csv'%(modelFolder, city))
        total['flowPred'] = total['S000pred']
        total['flowReal'] = total['S000flow']
        # Collapse duplicate origin-destination rows into a single flow.
        total = total.groupby(['origin','destination']).sum().reset_index()
        mse,ari,nmi = weightedAdjustedRandIndex(total,'origin','destination','S000flow','S000pred',0)
        modelResults[city] = [mse,ari,nmi]


New York City
Los Angeles
Chicago
Houston
Boston
Phoenix
Philadelphia
San Antonio
San Diego
Dallas
San Jose
Austin


In [18]:
# Mean squared error (position 0 of each [mse, ari, nmi] result) per city.
# Values are looked up by city name so that every row is guaranteed to line
# up with the index, whatever order or extra keys the result dicts hold.
mseByModel = {
    'NoconstrainPowerLaw': NoconstrainPowerLaw,
    'NoconstrainBuckedPowerLaw': NoconstrainBuckedPowerLaw,
    'NoconstrainFullPowerLaw': NoconstrainFullPowerLaw,
    'NoconstrainBuckedFullPowerLaw': NoconstrainBuckedFullPowerLaw,
    'DoublyConstrainBucked': classicConstrainDist,
}
df = pd.DataFrame(
    {column: [modelResult[cityName][0] for cityName in cities]
     for column, modelResult in mseByModel.items()},
    index=cities,
)
df

Unnamed: 0,NoconstrainPowerLaw,NoconstrainBuckedPowerLaw,NoconstrainFullPowerLaw,NoconstrainBuckedFullPowerLaw,DoublyConstrainBucked
New York City,4.475363,4.768019,4.449022,4.767617,4.838656
Los Angeles,5.764469,6.338955,5.198298,5.975948,5.603167
Chicago,15.598198,15.230296,15.350196,14.998132,16.290335
Houston,37.226388,38.259448,35.273201,34.67133,33.811145
Boston,49.549755,53.565826,46.427578,51.942475,46.95711
Phoenix,15.694504,16.837985,15.345424,16.374498,14.315659
Philadelphia,20.21531,20.322334,20.016042,20.220293,20.646556
San Antonio,40.873446,42.477519,39.059529,39.824743,36.168107
San Diego,44.483853,45.191492,42.125531,42.258244,35.774731
Dallas,27.117573,27.636378,26.464631,26.80231,25.070868


In [19]:
# Normalized mutual information (position 2 of each [mse, ari, nmi] result)
# per city. Values are looked up by city name so that every row is guaranteed
# to line up with the index, whatever order or extra keys the dicts hold.
nmiByModel = {
    'NoconstrainPowerLaw': NoconstrainPowerLaw,
    'NoconstrainBuckedPowerLaw': NoconstrainBuckedPowerLaw,
    'NoconstrainFullPowerLaw': NoconstrainFullPowerLaw,
    'NoconstrainBuckedFullPowerLaw': NoconstrainBuckedFullPowerLaw,
    'DoublyConstrainBucked': classicConstrainDist,
}
df = pd.DataFrame(
    {column: [modelResult[cityName][2] for cityName in cities]
     for column, modelResult in nmiByModel.items()},
    index=cities,
)
df

Unnamed: 0,NoconstrainPowerLaw,NoconstrainBuckedPowerLaw,NoconstrainFullPowerLaw,NoconstrainBuckedFullPowerLaw,DoublyConstrainBucked
New York City,0.716754,0.720915,0.709365,0.719658,0.714424
Los Angeles,0.737167,0.743097,0.761449,0.750571,0.723255
Chicago,0.592389,0.530346,0.581617,0.570161,0.592265
Houston,0.624485,0.600703,0.621707,0.606105,0.55169
Boston,0.559127,0.499997,0.650125,0.545818,0.676648
Phoenix,0.541096,0.542281,0.54167,0.537903,0.665939
Philadelphia,0.535641,0.593108,0.530895,0.590652,0.488246
San Antonio,0.737338,0.728133,0.7409,0.727681,0.728133
San Diego,0.530143,0.653863,0.527629,0.652911,0.628607
Dallas,0.523532,0.531328,0.535669,0.531328,0.523089


In [21]:
# Mean squared error (position 0 of each [mse, ari, nmi] result) per city for
# a subset of the models. Values are looked up by city name so that every row
# is guaranteed to line up with the index.
mseSubsetByModel = {
    'NoconstrainPowerLaw': NoconstrainPowerLaw,
    'NoconstrainFullPowerLaw': NoconstrainFullPowerLaw,
    'DoublyConstrain': classicConstrainDist,
}
df = pd.DataFrame(
    {column: [modelResult[cityName][0] for cityName in cities]
     for column, modelResult in mseSubsetByModel.items()},
    index=cities,
)
df