In [None]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 5.1 MB/s 
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 255 kB/s 
Collecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 12.8 MB/s 
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
import folium
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats
import numpy as np
from math import sqrt

  import pandas.util.testing as tm


In [None]:
def CalcRSquared(observed, estimated):
    """Return the squared Pearson correlation between two sequences.

    Parameters:
        observed: sequence of actual observed values
        estimated: sequence of predicted values, same length as ``observed``

    Returns:
        The square of the Pearson correlation coefficient (r^2).
    """
    # pearsonr yields (correlation, p-value); only the correlation is needed.
    correlation, _p_value = scipy.stats.pearsonr(observed, estimated)
    return correlation ** 2

def CalcRMSE(observed, estimated):
    """Return the root mean square error between observed and estimated values.

    Parameters:
        observed: actual observed values; must support element-wise
            subtraction and the result must expose ``.mean()``
            (e.g. a pandas Series or a numpy array)
        estimated: predicted values, compatible with ``observed``

    Returns:
        The RMSE rounded to three decimal places.
    """
    squared_errors = (observed - estimated) ** 2
    mean_squared_error = squared_errors.mean()
    return round(sqrt(mean_squared_error), 3)

In [None]:
# Load the origin-destination flow table; the first CSV column becomes the index.
cdatasub = pd.read_csv("/london_flows_index.csv", index_col=0)

# Discard every row in which distance, population, jobs or flows is exactly zero.
# (Later cells take np.log of jobs, distance and population, so zeros there
# would produce -inf.)  A single boolean mask replaces the former row-by-row
# `drop(..., inplace=True)` inside `iterrows()`, which copied the frame on
# every dropped row and would fail on duplicate index labels.
zero_mask = (cdatasub[["distance", "population", "jobs", "flows"]] == 0).any(axis=1)
# .copy() so later column assignments operate on an independent frame.
cdatasub = cdatasub.loc[~zero_mask].copy()


In [None]:
# Columns that enter the first model on a natural-log scale.
x_variables = ["jobs", "distance"]

# Store log(<column>) next to each source column as "log_<column>".
for x in x_variables:
    cdatasub["log_" + x] = np.log(cdatasub[x])

# Preview the first five rows, including the new log columns.
cdatasub.head()

Unnamed: 0_level_0,station_origin,station_destination,flows,population,jobs,distance,log_jobs,log_distance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,Abbey Road,Beckton,1,599,442,8510.121774,6.09131,9.049012
3,Abbey Road,Blackwall,3,599,665,3775.448872,6.499787,8.236275
4,Abbey Road,Canary Wharf,1,599,58772,5086.51422,10.981421,8.534348
5,Abbey Road,Canning Town,37,599,15428,2228.923167,9.643939,7.709274
6,Abbey Road,Crossharbour,1,599,1208,6686.47556,7.096721,8.807842


In [None]:
############# origin (production) constrained model ###########
# NOTE: the formula below carries one categorical term per station_origin, so
# this is the origin-constrained form (consistent with the name prodSim), not
# a destination-constrained one.

#create the formula (the "-1" indicates no intercept in the regression model).
formula = 'flows ~ station_origin+ log_jobs + log_distance-1'
#fit the origin-constrained model as a Poisson GLM
prodSim = smf.glm(formula = formula, data=cdatasub, family=sm.families.Poisson()).fit()
#print the fitted model's summary table
print(prodSim.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  flows   No. Observations:                43945
Model:                            GLM   Df Residuals:                    43545
Model Family:                 Poisson   Df Model:                          399
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -9.1409e+05
Date:                Mon, 09 May 2022   Deviance:                   1.6560e+06
Time:                        01:55:06   Pearson chi2:                 2.41e+06
No. Iterations:                     7                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

In [None]:
# Pull the fitted parameters into a two-column frame:
#   "coef"    - the parameter label produced by the formula
#   "alpha_i" - the fitted value
coefs = prodSim.params.rename("alpha_i").rename_axis("coef").reset_index()

# Reduce labels of the form "station_origin[<name>]" to the bare "<name>" so
# they can be matched against cdatasub["station_origin"].  The pattern is
# anchored and passed with an explicit regex=True: the previous code relied on
# the old implicit regex default and on non-raw "\[" escapes, and it also
# stripped any bracket that happened to be part of a station name.
# Labels that do not match (log_jobs, log_distance) are left unchanged.
coefs["coef"] = coefs["coef"].str.replace(
    r"^station_origin\[(.*)\]$", r"\1", regex=True
)

# Attach each row's origin coefficient.  Any alpha_i column left over from a
# previous run of this cell is dropped first so that re-running the cell does
# not create alpha_i_x / alpha_i_y duplicates.
cdatasub = (
    cdatasub
    .drop(columns=["alpha_i"], errors="ignore")
    .merge(coefs, left_on="station_origin", right_on="coef", how="left")
    .drop(columns=["coef"])
)

#check this has worked
cdatasub.head(2)


  import sys


Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance,log_jobs,log_distance,alpha_i
0,Abbey Road,Beckton,1,599,442,8510.121774,6.09131,9.049012,3.270351
1,Abbey Road,Blackwall,3,599,665,3775.448872,6.499787,8.236275,3.270351


In [None]:
# Total observed flow per destination station, joined onto every row as D_j.
D_j = cdatasub.groupby("station_destination")["flows"].sum().to_frame(name="D_j")
cdatasub = cdatasub.merge(D_j, how="left", on="station_destination")

# Total observed flow per origin station, joined onto every row as O_i.
O_i = cdatasub.groupby("station_origin")["flows"].sum().to_frame(name="O_i")
cdatasub = cdatasub.merge(O_i, how="left", on="station_origin")
#cdatasub.head(10)


In [None]:
# Split the fitted parameter vector by label rather than by integer position:
# integer keys such as params[-2] on a string-labelled Series are deprecated
# positional lookups in recent pandas, and labels stay correct even if the
# order of the formula terms changes.
# alpha_i: every parameter except the two slope terms (the station_origin terms).
alpha_i = prodSim.params.drop(["log_jobs", "log_distance"])
# gamma: coefficient on log_jobs.
gamma = prodSim.params["log_jobs"]
# beta: negated coefficient on log_distance, so later code can write "- beta * log_distance".
beta = -prodSim.params["log_distance"]

In [None]:
# Display the origin-specific parameters extracted from prodSim in the previous cell.
alpha_i

station_origin[Abbey Road]          3.270351
station_origin[Acton Central]       5.008886
station_origin[Acton Town]          4.397394
station_origin[Aldgate]             3.361125
station_origin[Aldgate East]        3.408728
                                      ...   
station_origin[Wood Street]         5.672160
station_origin[Woodford]            4.955425
station_origin[Woodgrange Park]     5.320215
station_origin[Woodside Park]       4.496709
station_origin[Woolwich Arsenal]    6.701868
Length: 398, dtype: float64

In [None]:
# Display gamma, the fitted parameter that multiplies log_jobs in the estimate below.
gamma

0.7301699265801939

In [None]:
# Rebuild the model's estimate by hand: exponentiate the linear predictor
# alpha_i + gamma * log(jobs) - beta * log(distance).
# The fitted values from the model object would be an equivalent shortcut.
cdatasub["prodsimest1"] = np.exp(
    cdatasub["alpha_i"] + gamma * cdatasub["log_jobs"] - beta * cdatasub["log_distance"]
)

In [None]:
# Round the estimates to the nearest whole number before scoring them.
cdatasub["prodsimest1"] = cdatasub["prodsimest1"].round(0)


In [None]:
# R^2 of the origin-constrained estimates against the observed flows.
RSquared = CalcRSquared(
    observed=cdatasub["flows"],
    estimated=cdatasub["prodsimest1"],
)
print("RSquared is", RSquared)

RSquared is 0.3937259232863417


In [None]:

# RMSE of the origin-constrained estimates against the observed flows.
RMSE = CalcRMSE(
    observed=cdatasub["flows"],
    estimated=cdatasub["prodsimest1"],
)
print(RMSE)

120.147


In [None]:
############# destination (attraction) constrained model ###########
# NOTE: the formula below carries one categorical term per station_destination,
# so this is the destination-constrained form (consistent with the name
# attrSim), not an origin-constrained one.

# Add the natural log of population as "log_population" for use in the formula.
x_variables = ["population",]
log_x_vars = []  # names of the log columns created below (not used in the visible cells)
for x in x_variables:
    cdatasub[f"log_{x}"] = np.log(cdatasub[x])
    log_x_vars.append(f"log_{x}")


#create the formula (the "-1" indicates no intercept in the regression model).
attr_form = 'flows ~ station_destination + log_population + log_distance-1'
#fit the destination (attraction) constrained model as a Poisson GLM
attrSim = smf.glm(formula = attr_form, data=cdatasub, family=sm.families.Poisson()).fit()
#print the fitted model's summary table
print(attrSim.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  flows   No. Observations:                43945
Model:                            GLM   Df Residuals:                    43545
Model Family:                 Poisson   Df Model:                          399
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.0404e+06
Date:                Mon, 09 May 2022   Deviance:                   1.9085e+06
Time:                        01:52:19   Pearson chi2:                 3.05e+06
No. Iterations:                     7                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------

We can examine how the constraints hold for destinations this time:

In [None]:
# Ask the fitted attrSim model for predictions on the same rows it was fitted to.
predictions = attrSim.get_prediction(
    cdatasub[["station_destination", "log_population", "log_distance"]]
)
predictions_summary_frame = predictions.summary_frame()
# Keep the predicted mean, rounded to the nearest whole number.
cdatasub["attrsimFitted"] = predictions_summary_frame["mean"].round(0)

In [None]:

# Goodness of fit of the attrSim estimates against the observed flows.
RSquared = CalcRSquared(cdatasub["flows"], cdatasub["attrsimFitted"])
print("RSquared is", RSquared)
RMSE = CalcRMSE(cdatasub["flows"], cdatasub["attrsimFitted"])
# Label the value as what it is: it was previously printed under the column
# name "attrsimFitted", which made the RMSE look like a fitted value.
print("RMSE is", RMSE)

RSquared is 0.34823921191707785
attrsimFitted is 124.587
