In [None]:
# This notebook loads responses from the Vaximap Impact Google Form 

# We intend to show the following:
    # Time spent planning the route, as a function of route size
    # Using the submitted routes to determine time to complete the route
    # Qualitative data on ease of planning routes and comparative ease of using Vaximap
    
# We will then use these results to estimate the time savings using Vaximap

In [None]:
# To load the Google Forms data

# Go to the responses tab for the Impact assessment on Google Forms
# (https://docs.google.com/forms/d/1ZR8qlDd8TmFP8ELsby0fWtbJ_1w8k8TwHCazUbKWGLE/edit#responses)

# Click 'Download Responses (.csv)'
# Unzip the downloaded folder and copy the .csv file to the analysis/ subfolder

In [None]:
# Load libraries
import pandas as pd
from io import BytesIO
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
import descartes
import geopandas as gpd


In [None]:
# Make the parent directory importable so that the project-level
# `load_dataset` module can be found, then pull in `dataset` from it.
import sys
sys.path.append('../')
from load_dataset import dataset

# Read the survey responses exported from Google Forms.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv('Vaximap Impact Survey.csv')

In [None]:
# Postcodes visited in each of the three route-finding tasks (3, 5 and 9 stops)
route1 = ['OX1 4LB', 'OX5 1PJ', 'OX4 2HH']
route2 = ['OX1 4LB', 'OX1 4EH', 'OX4 2HH', 'OX5 1PJ', 'OX3 8HH']
route3 = ['OX1 4EH', 'OX1 4LB', 'OX14 3DP', 'OX5 1PJ', 'OX4 2HH',
          'OX2 9QN', 'OX4 2JT', 'OX3 8HH', 'OX44 9NG']

# Give the first 26 survey columns short, readable labels, matched by position:
#   3 respondent columns, then for each route one column per postcode followed
#   by its completion-time column, then 3 qualitative columns.
# NOTE: several postcodes occur in more than one route (e.g. 'OX1 4LB' is in
# all three), so the renamed frame carries repeated column labels. Only the
# uniquely named columns can be selected unambiguously by label.
data = data.rename(columns={
    str(data.columns[position]): label
    for position, label in enumerate(
        ["timestamp", "occupation", "technicality"]
        + route1 + ["complete1"]
        + route2 + ["complete2"]
        + route3 + ["complete3"]
        + ["difficulty", "time_consumption", "excel_ease"]
    )
})

# Inspect data
data.head()

In [None]:
# Map the textual visit-order answers ('First', 'Second', ...) onto integers.
# Positions are indexed from 1, to avoid confusion.
order_dict = {
    "First": 1,
    "Second": 2,
    "Third": 3,
    "Fourth": 4,
    "Fifth": 5,
    "Sixth": 6,
    # Cardinal spellings for positions 7-9 (kept exactly as before).
    "Seven": 7,
    "Eight": 8,
    "Nine": 9,
    # Ordinal spellings for positions 7-9, so that these positions are handled
    # consistently with 1-6 if the form labels them 'Seventh', 'Eighth', 'Ninth'.
    # NOTE(review): confirm the exact option labels used in the form.
    "Seventh": 7,
    "Eighth": 8,
    "Ninth": 9,
}

# Replace every cell whose whole value matches a key, across all columns.
# Reassigning (rather than inplace=True) keeps the cell a plain
# "inputs -> new frame" step.
data = data.replace(order_dict)


# And convert completion times to floats (in minutes)
def to_minutes(x):
    """Convert a colon-separated duration string to a number of minutes.

    The first field is multiplied by 60 and the second is added as-is, i.e.
    the string is read as ``hours:minutes``. If a third field is present it is
    read as seconds and contributes ``seconds / 60`` instead of being silently
    discarded.

    Parameters
    ----------
    x : str
        Duration such as ``"0:05"`` or ``"0:05:30"``.

    Returns
    -------
    float
        Duration in minutes.

    Raises
    ------
    IndexError
        If ``x`` contains fewer than two colon-separated fields.
    ValueError
        If a field cannot be parsed as a number.
    """
    # Split once and convert every field up front.
    fields = [float(field) for field in x.split(':')]
    minutes = (60 * fields[0]) + fields[1]
    if len(fields) > 2:
        # Include the seconds component when the export provides one.
        minutes += fields[2] / 60
    return minutes

# Convert each completion-time column from a duration string to minutes.
# Series.map applies to_minutes element-wise to the one column being
# converted; unlike a row-wise DataFrame.apply(axis=1) it does not build a
# Series per row and it still returns a Series when the frame is empty.
# NOTE: this cell is not re-runnable on its own output, because to_minutes
# expects strings and the columns hold floats after the first run.
for completion_column in ('complete1', 'complete2', 'complete3'):
    data[completion_column] = data[completion_column].map(to_minutes)

# Inspect head
data.head()

In [None]:
# Headline numbers: how many responses there are, and the mean completion
# time (in minutes, rounded to 2 d.p.) for each of the three routes.

print("Total number of completions = ",len(data))

for route_label, completion_column in (
    ("Average time to complete route 1 = ", 'complete1'),
    ("Average time to complete route 2 = ", 'complete2'),
    ("Average time to complete route 3 = ", 'complete3'),
):
    mean_minutes = np.mean(data[completion_column])
    print(route_label,round(mean_minutes,2)," minutes.")

In [None]:
##### Planning a route #####

In [None]:
# Plot completion time vs. route length

# Stack the three completion-time columns into a single response vector.
# pd.concat is used because Series.append was removed in pandas 2.0.
y = pd.concat([data['complete1'], data['complete2'], data['complete3']])

# Design-matrix columns, in the same order as y:
#   x0 - constant 1 for every observation (explicit intercept term)
#   x1 - number of sites in the route the observation belongs to (3, 5 or 9,
#        matching the lengths of route1, route2 and route3)
n_responses = len(data)
x1 = [3]*n_responses + [5]*n_responses + [9]*n_responses
x0 = [1]*len(x1)

# Generate model params.
# fit_intercept=False because the intercept is already carried by the x0
# column, so reg.coef_[0] is the intercept and reg.coef_[1] is the slope.
lr = LinearRegression(fit_intercept=False)
X = np.array([x0, x1], dtype="float32").transpose()
reg = lr.fit(X, y.values)

# Report the coefficient of determination instead of discarding it.
print("R^2 of linear fit: ", round(reg.score(X, y), 3))

# Make prediction on 10 evenly spaced route sizes from 0 to the largest value in X
Xpred1 = np.linspace(0, X.max(), num=10).tolist()
Xpred0 = [1]*len(Xpred1)
Xpred = np.array([Xpred0, Xpred1], dtype="float32").transpose()

# Plot the raw observations and the fitted line
fig, ax = plt.subplots()
ax.scatter(x1, y, alpha=0.5)
ax.plot(Xpred[:, 1], reg.predict(Xpred), color='k')
ax.set_ylabel('Completion time (minutes)')
ax.set_xlabel('# of sites')
ax.set_title('Planning time vs. route size')
plt.show()

In [None]:
# Slope of the fitted line: the coefficient on the number-of-sites column,
# i.e. the extra planning minutes per additional site.
minutes_per_site = reg.coef_[1]
print("Approx. minute per site: ",round(minutes_per_site,2))

In [None]:
# Calculate overall time saving
# Extrapolate the per-site planning time (regression slope only; the intercept
# is not included) to every patient recorded in `dataset`.
total_patients = sum(dataset.n_patients)
total_minutes = reg.coef_[1]*total_patients

# Whole hours of planning time (truncated, not rounded)
total_hours = int(total_minutes/60)

print("Total number of patients: ",total_patients)
print("Total planning time: ",total_hours," hours.")
print("Total planning time: ",int(total_minutes/(60*24))," days.")
# A "worker day" is taken to be 8 hours
print("Total planning time: ",int(total_minutes/(60*8))," worker days.")

# Map time saving into cost, using an assumed average hourly rate
# applied to the truncated whole-hour total
avg_hourly_rate = 15
print("Total cost saving: £",total_hours*avg_hourly_rate,".")


In [None]:
##### Illustrating location of sites around UK #####

In [None]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Constants added to every stored coordinate to shift it to the centre of the UK.
# NOTE(review): presumably the stored values are offsets relative to this
# centre point - confirm against how `dataset['latlong']` is produced.
uk_centre_lat = 53
uk_centre_long = -1.2

# Collapse the per-entry coordinate lists into one flat list of points,
# then shift each point: element 0 is treated as latitude, element 1 as longitude.
latlongs = flatten(dataset['latlong'])
lats = []
longs = []
for point in latlongs:
    lats.append(uk_centre_lat + point[0])
    longs.append(uk_centre_long + point[1])

# Make bespoke dataframe with one row per point
df = pd.DataFrame({'lats': lats, 'longs': longs})

# Sanity checks: both lists should have the same length, and the ranges
# show how far the points spread.
print("Number of points: ",len(lats))
print("Number of points: ",len(longs))
print("Max lat: ",max(lats),". Min lat: ",min(lats))
print("Max long: ",max(longs),". Min long: ",min(longs))

In [None]:
# Plot for whole world

# Begin geo plot: one point geometry per site, with x = longitude, y = latitude
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(longs, lats))

# Country-level base map to plot the coordinates over.
# NOTE(review): the bundled `geopandas.datasets` module was deprecated in
# geopandas 0.14 and removed in 1.0 - confirm the installed version provides it.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Draw the country outlines for the whole world
ax = world.plot(
    color='white', edgecolor='black')

# then plot the geodataframe on top of this
gdf.plot(ax=ax, color='red')
ax.set_xlabel('Longitude')
# The y coordinate of each point is its latitude
ax.set_ylabel('Latitude')
plt.show()

In [None]:
# Plot for UK only

# Define bounds of the latitude/longitude box used to select points
max_uk_lat = 60
min_uk_lat  = 50
max_uk_long = 4
min_uk_long = -10

# Keep only points strictly inside the box (bounds are exclusive)
in_lat_band = (df["lats"] > min_uk_lat) & (df["lats"] < max_uk_lat)
in_long_band = (df["longs"] > min_uk_long) & (df["longs"] < max_uk_long)
df2 = df[in_lat_band & in_long_band]

# Begin geo plot: one point geometry per remaining site, x = longitude, y = latitude
gdf = gpd.GeoDataFrame(
    df2, geometry=gpd.points_from_xy(df2.longs.values, df2.lats.values))

# Country-level base map to plot the coordinates over.
# NOTE(review): the bundled `geopandas.datasets` module was deprecated in
# geopandas 0.14 and removed in 1.0 - confirm the installed version provides it.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Draw the outline of the United Kingdom only
ax = world[world.name == 'United Kingdom'].plot(
    color='white', edgecolor='black')

# then plot the geodataframe on top of this
gdf.plot(ax=ax, color='red')
ax.set_xlabel('Longitude')
# The y coordinate of each point is its latitude
ax.set_ylabel('Latitude')
plt.show()