In [1]:
import sqlite3
from tqdm import tqdm_notebook as tqdm
from os import *
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from gurobipy import *
from itertools import *


In [2]:
conn = sqlite3.connect("resources/data.db")
cur = conn.cursor()
cur.execute('''
    SELECT name, entries FROM Subways
    WHERE 7 < CAST(STRFTIME("%H", time) AS INT) 
    AND CAST(STRFTIME("%H", time) AS INT) < 11
    AND STRFTIME("%Y", time) = 2015
''')

data = cur.fetchall()
stations_am = {}
for name, count in tqdm(list(data)):
    if name not in stations_am:
        stations_am[name] = 0
    stations_am[name] += count





In [3]:
conn = sqlite3.connect("resources/data.db")
cur = conn.cursor()
cur.execute(''' 
    SELECT name, entries FROM Subways
    WHERE 15 < CAST(STRFTIME("%H", time) AS INT) AND CAST(STRFTIME("%H", time) AS INT) < 19
''')

data = cur.fetchall()
stations_pm = {}
for name, count in tqdm(list(data)):
    if name not in stations_pm:
        stations_pm[name] = 0
    stations_pm[name] += count





In [4]:
def fuzzy_match(a, b):
    i = 0
    j = 0
    count = 0
    match = -1
    while True:
        if i >= len(a):
            break
        elif j >= len(b):
            i += 1
            j = match + 1
        elif a[i] == b[j]:
            match = j 
            i, j, count = [x + 1 for x in (i, j, count)]
        else:
            j += 1
    return count / min(len(a), len(b))


In [5]:
import io
import re

long_scale = 0.758
cur.execute(f'''
    SELECT name, longitude * {long_scale}, latitude FROM Stations
''')


def convert_name(string, searchlist):
    string = string.upper()
    m1 = re.search('([0-9]+)', string)
    get_group = lambda m: m if not m else m.group(1)
    candidates = [name.upper() for name in searchlist 
                  if (not m1 or m1.group(1) == get_group(re.search('([0-9]+)', name)))
                     and (m1 or not re.search('([0-9]+)', name))]
    found = max(candidates, key=lambda x: fuzzy_match(string, x)) if candidates else string
    return found

location_names = {a.upper(): (b, c) for a, b, c in list(cur.fetchall())}
name_to_location = {}
for k in tqdm(stations_am):
    turnstile_label = convert_name(k, location_names)
    if turnstile_label in location_names:
        name_to_location[k] = location_names[convert_name(k, location_names)]
        
print(f'{sum(a in name_to_location for a in stations_am) / len(stations_am)*100:.1f}%')





ZeroDivisionError: division by zero

In [None]:
from math import sqrt
from scipy.spatial import KDTree

cur.execute(f'''
    SELECT pickup_longitude * {long_scale}, pickup_latitude,
           dropoff_longitude * {long_scale}, dropoff_latitude 
    FROM Taxis
    WHERE 7 < CAST(STRFTIME("%H", pickup_datetime) AS INT) < 11
    LIMIT 2000000
''')

def dist(a, b):
    return sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)

taxis = list(cur.fetchall())
traffic_matrix = np.zeros([len(name_to_location), len(name_to_location)])
for a, b, c, d in tqdm(taxis):
    start = np.array([1 / dist([a, b], name_to_location[x]) for x in name_to_location])
    start = start / start.sum()
    end = np.array([1 / dist([c, d], name_to_location[x]) for x in name_to_location])
    end = end / end.sum()
    update = start[:, np.newaxis] @ end[np.newaxis, :]
    traffic_matrix += update

taxi_traffic = {k: {v: traffic_matrix[i][j] for j, v in enumerate(name_to_location)} for i, k in enumerate(name_to_location)}

In [None]:
'''
Using normalized rows in y and x.
y=∑t1'*t2/1•(t1'*t2) where t1 is taxi start and t2 taxi end
s= pm subway entries per station
t= am subway entries per station

'''
m = Model()
station_tuples = [(i, stations_am[i], stations_pm[i], name_to_location[i]) 
                  for i in stations_am if i in name_to_location]
station_to_traffic = {a: (b, c) for a, b, c, d in station_tuples}
station_traffic_am = np.array([v1 for i, (v1, v2) in station_to_traffic.items()])
station_traffic_am = station_traffic_am / station_traffic_am.sum()
station_traffic_pm = np.array([v2 for i, (v1, v2) in station_to_traffic.items()])
station_traffic_pm = station_traffic_pm / station_traffic_pm.sum()
x_matrix = np.array([[None for i in station_to_traffic] for j in station_to_traffic])
target_matrix = np.array([[taxi_traffic[j][i] for i in station_to_traffic] 
                          for j in station_to_traffic]).astype(np.float64)

for i, v in enumerate(target_matrix):
    if v.sum() == 0:
        target_matrix[i] = np.ones(v.shape)

for i, row in enumerate(target_matrix):
    target_matrix[i] = row / row.sum()

for (i, a), (j, b) in product(enumerate(station_to_traffic), enumerate(station_to_traffic)):
    x_matrix[i][j] = m.addVar(vtype=GRB.CONTINUOUS, name=f'{a}->{b}')

p_matrix = np.array([x_matrix[i] / station_traffic_pm[i] for i in range(len(x_matrix))])
objective = ((p_matrix - target_matrix) * (p_matrix - target_matrix)).sum()

m.setObjective(objective, GRB.MINIMIZE)
m.addConstrs(x_matrix[i, :].sum() == station_traffic_pm[i] for i in range(len(x_matrix)))
m.addConstrs(x_matrix[:, j].sum() == station_traffic_am[j] for j in range(len(x_matrix)))
m.addConstrs(x_matrix[k][k] == 0 for k in range(len(x_matrix)))

m.optimize()
solution = np.vectorize(lambda x: x.getValue())(p_matrix)

print(solution)
print(objective.getValue())

In [None]:
'''
Using non-normalized xmat
'''
m = Model()
station_tuples = [(i, stations_am[i], stations_pm[i], name_to_location[i]) 
                  for i in stations_am if i in name_to_location]
station_to_traffic = {a: (b, c) for a, b, c, d in station_tuples}
station_traffic_am = np.array([v1 for i, (v1, v2) in station_to_traffic.items()])
station_traffic_pm = np.array([v2 for i, (v1, v2) in station_to_traffic.items()])
station_traffic_pm = station_traffic_pm / station_traffic_pm.sum() * station_traffic_am.sum()
x_matrix = np.array([[None for i in station_to_traffic] for j in station_to_traffic])
target_matrix_nn = np.array([[taxi_traffic[j][i] for i in station_to_traffic] 
                          for j in station_to_traffic]).astype(np.float64)

target_matrix_nn = target_matrix_nn / target_matrix_nn.sum() * station_traffic_am.sum()

for (i, a), (j, b) in product(enumerate(station_to_traffic), enumerate(station_to_traffic)):
    x_matrix[i][j] = m.addVar(vtype=GRB.CONTINUOUS, name=f'{a}->{b}')

objective = ((x_matrix - target_matrix_nn) * (x_matrix - target_matrix_nn)).sum()

m.setObjective(objective, GRB.MINIMIZE)
m.addConstrs(x_matrix[i, :].sum() == station_traffic_pm[i] for i in range(len(x_matrix)))
m.addConstrs(x_matrix[:, j].sum() == station_traffic_am[j] for j in range(len(x_matrix)))
m.addConstrs(x_matrix[k][k] == 0 for k in range(len(x_matrix)))

m.optimize()
solution_x = np.vectorize(lambda x: x.x)(x_matrix)

print(solution_x)
print(objective.getValue())

In [None]:
from math import sqrt
import itertools
from matplotlib.patches import Rectangle
import matplotlib as mpl

def make_scatter(xmat, ymat, file):
    plt.close('all')
    mpl.rcParams.update(mpl.rcParamsDefault)
    %matplotlib inline

    def best_fit(X, Y):
        xbar = sum(X)/len(X)
        ybar = sum(Y)/len(Y)
        n = len(X) 
        numer = sum([xi*yi for xi,yi in zip(X, Y)]) - n * xbar * ybar
        denum = sum([xi**2 for xi in X]) - n * xbar**2
        b = numer / denum
        a = ybar - b * xbar
        return a, b

    eps = 1e-4
    scatpts = [[a, b] for a, b in zip(chain(*xmat), chain(*ymat))]
    norm = [[a, b] for a, b in scatpts if b > eps and a > eps]
    a, b = best_fit(*zip(*norm))
    r = np.corrcoef(*zip(*norm))[1, 0]
    extra = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0)
    fit = [a + b * i for i in list(zip(*scatpts))[0]]
    plt.axis('on')
    plt.legend([extra], [f"r = {r}"])
    plt.scatter(*zip(*scatpts), alpha=0.1, s=0.3)
    plt.xlabel('Taxi Flow')
    plt.ylabel('Projected Subway Flow')
    plt.plot(list(zip(*scatpts))[0], fit, color="r", alpha=0.7)
    #plt.xlim(0, 0.010)
    #plt.ylim(-0.005, 0.02)
    plt.savefig(f'images/subway_model/{file}.png')
    
make_scatter(target_matrix, solution, 'subway_scatter')
make_scatter(target_matrix_nn, solution_x, 'subway_nn_scatter')

In [None]:
import json
outline_points = []
with io.open('resources/roads.geojson') as f:
    data = json.load(f) 
    coo = [i['geometry']['coordinates'] for i in data['features']]
    for outline in tqdm(coo):
        for series in outline:
            for line in series:
                for (a, b), (c, d) in zip(line, line[1:]):
                    outline_points.append([a * 0.758, b, c * 0.758, d])


In [None]:
pts = []

def norm_loc(point):
    return min(name_to_location.values(), 
                  key=lambda x: (point[0] - x[0]) ** 2 + (point[1] - x[1]) ** 2)

with io.open('resources/lines.csv', 'r') as f:
    for line in tqdm(list(f)[1:]):
        line = re.sub(r'.*LINESTRING \((.*)\).*', r'\1', line)
        pts.append([norm_loc([float(i.split(" ")[0]) * long_scale, float(i.split(" ")[1])]) for i in line.split(', ')])

line_net = {k: set() for k in name_to_location.values()}
for line in tqdm(pts):
    for a, b in zip(line, line[1:]):
        line_net[a].add(b)
        line_net[b].add(a)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import pandas as pd
from math import inf

def draw(matrix, color, filename, lines=False):
    fig = plt.figure(figsize=[24, 24])
    plt.rcParams['figure.figsize'] = [24, 24]

    vor = Voronoi([name_to_location[i] for i in name_to_location])
    voronoi_plot_2d(vor, show_vertices=False, line_colors='lightgray', show_points=False,
                     line_width=0.2, line_alpha=0.8, point_size=2)

    city = nx.Graph()
    
    for i, (name, k, flow, location) in enumerate(station_tuples):
        city.add_node(name, pos=location, size=flow / 500000)
        
    def draw_outline():
        inc = 0
        for a, b, c, d in (outline_points):
            city.add_node(inc, pos=(a, b), size=0)
            inc -= 1
            city.add_node(inc, pos=(c, d), size=0)
            inc -= 1
            city.add_edge(inc + 1, inc + 2, weight='0.6', length=inf, color='orange')
        def dist(x, y):
            return sqrt((x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2)
        for i in tqdm(line_net):
            for j in line_net[i]:
                n1 = [n for n, d in city.nodes(data=True) if d['pos'] == i]
                n2 = [n for n, d in city.nodes(data=True) if d['pos'] == j]
                for n1v, n2v in product(n1, n2):
                    city.add_edge(n1v, n2v, length=dist(i, j), weight=0.5, color=color)
    draw_outline()

    for i1, (name1, k1, j1, location1) in enumerate(station_tuples):
        for i2, (name2, k2, j2, location2) in enumerate(station_tuples):
            if lines:
                path = nx.shortest_path(city, source=name1, target=name2)
                for a, b in zip(path, path[1:]):
                    weight = matrix[i1][i2] ** 0.4
                    edge = city.get_edge_data(a, b, default=None)
                    if edge:
                        edge['weight'] += weight / len(matrix) / 2
            else:
                city.add_edge(name1, name2, weight=matrix[i1][i2] ** 0.9, color=color, length=inf)
            

    edgewidth = np.array([d['weight']
                          for (u, v, d) in city.edges(data=True)])
    edgecolor = np.array([d['color'] 
                          for (u, v, d) in city.edges(data=True)])

    pos = nx.get_node_attributes(city, 'pos')
    attr = nx.get_node_attributes(city, 'size')
    nodesize = [attr[i] for i in city.nodes()]
    nx.draw_networkx_nodes(city, pos, node_size=nodesize, node_color='red')
    nx.draw_networkx_edges(city, pos, width=edgewidth, edge_color=edgecolor)
    plt.rcParams['figure.facecolor'] = 'black'
    fig.patch.set_facecolor('black')
    plt.axis('off')
    plt.savefig(f'images/subway_model/{filename}_2015.png', bbox_inches='tight', dpi=100, facecolor='black', pad_inches=0)

In [None]:
draw(solution, 'lightblue', 'subways')
draw(solution, 'lightblue', 'subways_lines', lines=True)

In [None]:
draw(solution_x / solution_x.sum() * len(solution_x), 'lightblue', 'subways_nn_lines', lines=True)
draw(solution_x / solution_x.sum() * len(solution_x), 'lightblue', 'subways_nn')

In [None]:
draw(target_matrix, 'lightyellow', 'taxis')

In [None]:
draw(abs(solution - target_matrix), 'violet', 'diff')