In [None]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 5.0 MB/s 
Collecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 247 kB/s 
[?25hCollecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 39.6 MB/s 
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1


In [None]:
#import the necessary libraries 
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
import folium
import statsmodels.api as sm
import scipy.stats
import numpy as np
from math import sqrt
import statsmodels.formula.api as smf

  import pandas.util.testing as tm


In [None]:
#set up the metric calculations
def CalcRSqaured(observed, estimated):
    """Return the squared Pearson correlation (r^2) of two value series.

    Inputs:
    observed: series of actual observed target values
    estimated: series of predicted target values

    Returns:
    The square of the Pearson correlation coefficient computed by
    scipy.stats.pearsonr(observed, estimated).
    """
    # pearsonr also returns a p-value; only the coefficient is needed here.
    correlation, _ = scipy.stats.pearsonr(observed, estimated)
    return correlation ** 2

def CalcRMSE(observed, estimated):
    """Return the Root Mean Square Error between observed and estimated values.

    Inputs:
    observed: series of actual observed values
    estimated: series of predicted values

    Both inputs must support element-wise subtraction, and the result must
    expose a .mean() method (as the subtraction result is used that way below).

    Returns:
    The RMSE, rounded to 3 decimal places.
    """
    squared_errors = (observed - estimated) ** 2
    mean_squared_error = squared_errors.mean()
    return round(sqrt(mean_squared_error), 3)

In [None]:
# Read in the cdatasub flows table from the first week; the first CSV column
# becomes the index.
cdatasub = pd.read_csv("london_flows_index.csv", index_col=0)

# Keep only rows where distance, population, jobs and flows are all non-zero.
# A single vectorised mask replaces the previous iterrows() + drop(inplace=True)
# loop, which modified the frame while iterating over it and did one full drop
# per offending row. The .copy() gives an independent frame so later column
# assignments do not raise SettingWithCopyWarning.
nonzero_columns = ["distance", "population", "jobs", "flows"]
cdatasub = cdatasub[(cdatasub[nonzero_columns] != 0).all(axis=1)].copy()

In [None]:
cdatasub.head(10)

Unnamed: 0_level_0,station_origin,station_destination,flows,population,jobs,distance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,Abbey Road,Beckton,1,599,442,8510.121774
3,Abbey Road,Blackwall,3,599,665,3775.448872
4,Abbey Road,Canary Wharf,1,599,58772,5086.51422
5,Abbey Road,Canning Town,37,599,15428,2228.923167
6,Abbey Road,Crossharbour,1,599,1208,6686.47556
8,Abbey Road,Cutty Sark,2,599,1748,8503.898909
9,Abbey Road,Cyprus,7,599,850,6532.099618
10,Abbey Road,Devons Road,1,599,611,3958.324171
11,Abbey Road,East India,2,599,1522,3384.141666
12,Abbey Road,Island Gardens,2,599,691,7706.29637


In [None]:
# Remove any rows with zero flows or zero distance before modelling
# (the log of distance is taken in a later cell, so zero distances must go).
# Vectorised mask instead of dropping rows one at a time inside iterrows(),
# which modified the frame while it was being iterated. The .copy() yields an
# independent frame so later column assignments are unambiguous.
has_flow_and_distance = (cdatasub["flows"] != 0) & (cdatasub["distance"] != 0)
cdatasub = cdatasub[has_flow_and_distance].copy()

In [None]:
#### classical model ####
# Add a natural-log column ("log_<name>") for every variable listed in
# x_variables; the formula below uses log_distance.
x_variables = ["distance"]
for x in x_variables:
    cdatasub["log_" + x] = np.log(cdatasub[x])

# Create the formula (the "-1" indicates no intercept in the regression model).
dbl_form = "flows ~ station_destination + station_origin + log_distance-1"

# Run a doubly constrained SIM as a Poisson GLM.
doubSim = smf.glm(
    formula=dbl_form,
    data=cdatasub,
    family=sm.families.Poisson(),
).fit()

# Let's have a look at its summary.
print(doubSim.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  flows   No. Observations:                43945
Model:                            GLM   Df Residuals:                    43149
Model Family:                 Poisson   Df Model:                          795
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -8.6312e+05
Date:                Mon, 09 May 2022   Deviance:                   1.5540e+06
Time:                        09:56:42   Pearson chi2:                 2.10e+06
No. Iterations:                     7   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                                                       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------

And the various flows and goodness-of-fit statistics?

In [None]:
# Get the estimates: store the fitted values of the log-distance model
# (doubSim.mu), rounded to whole numbers, in a new column next to the
# observed flows.
cdatasub["doubsimfitted"] = np.round(doubSim.mu)


In [None]:
# r^2 between the observed flows and the rounded estimates of the log-distance model.
CalcRSqaured(cdatasub["flows"],cdatasub["doubsimfitted"])

0.41893772760573356

In [None]:
# RMSE between the observed flows and the rounded estimates of the log-distance model.
CalcRMSE(cdatasub["flows"],cdatasub["doubsimfitted"])

117.622

In [None]:
# Run a doubly constrained SIM with a negative exponential cost function:
# distance enters the formula untransformed, and the trailing "-1" removes
# the intercept.
doubsim_form = "flows ~ station_origin + station_destination + distance -1"
doubsim1 = smf.glm(
    formula=doubsim_form,
    data=cdatasub,
    family=sm.families.Poisson(),
).fit()
print(doubsim1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  flows   No. Observations:                43945
Model:                            GLM   Df Residuals:                    43149
Model Family:                 Poisson   Df Model:                          795
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -7.6208e+05
Date:                Mon, 09 May 2022   Deviance:                   1.3520e+06
Time:                        10:00:56   Pearson chi2:                 1.74e+06
No. Iterations:                     7   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

In [None]:
# Store the fitted values of the untransformed-distance model (doubsim1.mu),
# rounded to 0 decimal places, in a new column next to the observed flows.
cdatasub["doubsimfitted1"] = np.round(doubsim1.mu,0)

In [None]:
# r^2 between the observed flows and the rounded estimates of the untransformed-distance model.
CalcRSqaured(cdatasub["flows"],cdatasub["doubsimfitted1"])

0.49977786985381284

In [None]:
# RMSE between the observed flows and the rounded estimates of the untransformed-distance model.
CalcRMSE(cdatasub["flows"],cdatasub["doubsimfitted1"])

109.598