In [1]:
import os
import math
from datetime import datetime
import pandas as pd
import numpy as np
from shapely.geometry import LineString
from scipy.stats import entropy
from geopy import distance
import hausdorff
from fastdtw import fastdtw

import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme()

# 生成效果评估

In [2]:
def js_divergence(p, q):
    p = p / (np.sum(p) + 1e-14)
    q = q / (np.sum(q) + 1e-14)
    m = (p + q) / 2
    return 0.5 * entropy(p, m) + 0.5 * entropy(q, m)

In [3]:
real = pd.read_csv('../../traj_gen_exp_porto/data/Porto_Taxi/cleaned-data/traj_tes_10000_random.cleaned.csv')
geo = pd.read_csv('../../traj_gen_exp_porto/data/Porto_Taxi/cleaned-data/roadmap.cleaned.geo')

gene_data = dict()

for dir_name in os.listdir('./gene'):
    gene_data[dir_name] = []
    for dir_name_ in os.listdir(os.path.join('./gene', dir_name)):
        file_path = os.path.join('./gene', dir_name, dir_name_)
        gene_data[dir_name].append(file_path)

计算道路的中心点

In [4]:
road_gps = []
for _, row in geo.iterrows():
    coordinates = eval(row['coordinates'])
    road_line = LineString(coordinates=coordinates)
    center_coord = road_line.centroid
    center_lon, center_lat = center_coord.x, center_coord.y
    road_gps.append((center_lon, center_lat))

计算距离

In [5]:
distance_dict = dict()

real_distance = []
for _, row in real.iterrows():
    rid_list = eval(row['rid_list'])
    travel_distance = 0
    for i in range(1, len(rid_list)):
        travel_distance += distance.distance((road_gps[rid_list[i-1]][1], road_gps[rid_list[i-1]][0]), (road_gps[rid_list[i]][1], road_gps[rid_list[i]][0])).kilometers
    real_distance.append(travel_distance)

real_max_distance = 28.97
distance_bins = np.linspace(0, real_max_distance, 100).tolist()
distance_bins.append(float('inf'))
distance_bins = np.array(distance_bins)
real_distance_distribution, _ = np.histogram(real_distance, distance_bins)

distance_dict['real'] = real_distance

print(f'real average: {np.mean(real_distance):.3f}')
print('=' * 40)

for name, path_list in gene_data.items():
    value_list = []
    js_list = []

    for path in path_list:
        data = pd.read_csv(path)
        distance_list = []
        for _, row in data.iterrows():
            rid_list = eval(row['rid_list'])
            travel_distance = 0
            for i in range(1, len(rid_list)):
                travel_distance += distance.distance((road_gps[rid_list[i-1]][1], road_gps[rid_list[i-1]][0]), (road_gps[rid_list[i]][1], road_gps[rid_list[i]][0])).kilometers
            distance_list.append(travel_distance)

        value_list.append(np.mean(distance_list))

        distance_distribution, _ = np.histogram(distance_list, distance_bins)
        js_list.append(js_divergence(real_distance_distribution, distance_distribution))

        distance_dict[name] = distance_list

    value_list = np.array(value_list)
    js_list = np.array(js_list)

    print(f'{name} average: {np.mean(value_list):.3f}±{np.std(value_list):.3f}')
    print(f'{name} JS divergence: {np.mean(js_list):.3f}±{np.std(js_list):.3f}')
    print('=' * 40)

real average: 3.626
aaai average: 3.979±0.010
aaai JS divergence: 0.013±0.001
aaai_one_softmax average: 4.017±0.044
aaai_one_softmax JS divergence: 0.011±0.001
new_gene average: 3.916±0.012
new_gene JS divergence: 0.012±0.001
new_gene_one_softmax average: 3.853±0.024
new_gene_one_softmax JS divergence: 0.008±0.000
new_gene_exp_v1 average: 6.838±0.040
new_gene_exp_v1 JS divergence: 0.113±0.001
new_gene_exp_v2 average: 4.528±0.017
new_gene_exp_v2 JS divergence: 0.026±0.001
new_gene_exp_v3 average: 3.397±0.023
new_gene_exp_v3 JS divergence: 0.034±0.001
new_gene_exp_v4 average: 4.181±0.008
new_gene_exp_v4 JS divergence: 0.018±0.001
new_gene_exp_v5 average: 4.087±0.004
new_gene_exp_v5 JS divergence: 0.015±0.001
new_gene_exp_v6 average: 4.175±0.008
new_gene_exp_v6 JS divergence: 0.018±0.001
new_gene_exp_v7 average: 4.072±0.004
new_gene_exp_v7 JS divergence: 0.015±0.001
new_gene_exp_v1_one_softmax average: 6.954±0.060
new_gene_exp_v1_one_softmax JS divergence: 0.115±0.002
new_gene_exp_v2_one_

In [6]:
# for name, distance_list in distance_dict.items():
#     sns.kdeplot(distance_list, label=name, fill=True)

# plt.legend()
# plt.show()

计算行驶半径

In [7]:
radius_dict = dict()

real_radius = []
for _, row in real.iterrows():
    rid_list = eval(row['rid_list'])
    lon_mean = np.mean([road_gps[rid][0] for rid in rid_list])
    lat_mean = np.mean([road_gps[rid][1] for rid in rid_list])
    rad = []
    for rid in rid_list:
        lon = road_gps[rid][0]
        lat = road_gps[rid][1]
        dis = distance.distance((lat_mean, lon_mean), (lat, lon)).kilometers
        rad.append(dis)
    rad = np.mean(rad)
    real_radius.append(rad)

real_max_radius = 4.12
radius_bins = np.linspace(0, real_max_radius, 100).tolist()
radius_bins.append(float('inf'))
real_radius_distribution, _ = np.histogram(real_radius, radius_bins)

radius_dict['real'] = real_radius

print(f'real average: {np.mean(real_radius):.3f}')
print('=' * 40)

for name, path_list in gene_data.items():
    value_list = []
    js_list = []

    for path in path_list:
        data = pd.read_csv(path)
        radius_list = []
        for _, row in data.iterrows():
            rid_list = eval(row['rid_list'])
            if isinstance(rid_list, int):
                rid_list = [rid_list]
            lon_mean = np.mean([road_gps[rid][0] for rid in rid_list])
            lat_mean = np.mean([road_gps[rid][1] for rid in rid_list])
            rad = []
            for rid in rid_list:
                lon = road_gps[rid][0]
                lat = road_gps[rid][1]
                dis = distance.distance((lat_mean, lon_mean), (lat, lon)).kilometers
                rad.append(dis)
            rad = np.mean(rad)
            radius_list.append(rad)

        value_list.append(np.mean(radius_list))

        radius_distribution, _ = np.histogram(radius_list, radius_bins)
        js_list.append(js_divergence(real_radius_distribution, radius_distribution))

        radius_dict[name] = radius_list

    value_list = np.array(value_list)
    js_list = np.array(js_list)

    print(f'{name} average: {np.mean(value_list):.3f}±{np.std(value_list):.3f}')
    print(f'{name} JS divergence: {np.mean(js_list):.3f}±{np.std(js_list):.3f}')
    print('=' * 40)

real average: 0.764
aaai average: 0.806±0.002
aaai JS divergence: 0.012±0.001
aaai_one_softmax average: 0.795±0.003
aaai_one_softmax JS divergence: 0.011±0.001
new_gene average: 0.819±0.002
new_gene JS divergence: 0.011±0.001
new_gene_one_softmax average: 0.798±0.004
new_gene_one_softmax JS divergence: 0.009±0.001
new_gene_exp_v1 average: 0.945±0.004
new_gene_exp_v1 JS divergence: 0.037±0.001
new_gene_exp_v2 average: 0.856±0.002
new_gene_exp_v2 JS divergence: 0.014±0.000
new_gene_exp_v3 average: 0.749±0.005
new_gene_exp_v3 JS divergence: 0.031±0.001
new_gene_exp_v4 average: 0.809±0.001
new_gene_exp_v4 JS divergence: 0.013±0.001
new_gene_exp_v5 average: 0.789±0.001
new_gene_exp_v5 JS divergence: 0.013±0.001
new_gene_exp_v6 average: 0.807±0.001
new_gene_exp_v6 JS divergence: 0.014±0.001
new_gene_exp_v7 average: 0.788±0.001
new_gene_exp_v7 JS divergence: 0.013±0.001
new_gene_exp_v1_one_softmax average: 0.957±0.005
new_gene_exp_v1_one_softmax JS divergence: 0.038±0.001
new_gene_exp_v2_one_

In [8]:
# for name, radius_list in radius_dict.items():
#     sns.kdeplot(radius_list, label=name, fill=True)

# plt.legend()
# plt.show()

计算Location Frequency

In [9]:
real_rid_freq = np.zeros(len(geo), dtype=np.float32)
for _, row in real.iterrows():
    rid_list = eval(row['rid_list'])
    for rid in rid_list:
        real_rid_freq[rid] += 1
real_rid_freq /= np.sum(real_rid_freq)
real_rid_freq = np.sort(real_rid_freq)[::-1][:500]

real_max_rid_freq =  0.00051
rid_freq_bins = np.linspace(0, real_max_rid_freq, 100).tolist()
rid_freq_bins.append(float('inf'))
real_rid_freq_distribution, _ = np.histogram(real_rid_freq, rid_freq_bins)

print(f'real average: {np.mean(real_rid_freq):.3f}')
print('=' * 40)

for name, path_list in gene_data.items():
    value_list = []
    js_list = []

    for path in path_list:
        data = pd.read_csv(path)
        rid_freq = np.zeros(len(geo), dtype=np.float32)
        for _, row in data.iterrows():
            rid_list = eval(row['rid_list'])
            for rid in rid_list:
                rid_freq[rid] += 1
        rid_freq /= np.sum(rid_freq)
        rid_freq = np.sort(rid_freq)[::-1][:500]

        value_list.append(np.mean(rid_freq))

        rid_freq_distribution, _ = np.histogram(rid_freq, rid_freq_bins)
        js_list.append(js_divergence(real_rid_freq_distribution, rid_freq_distribution))

    value_list = np.array(value_list)
    js_list = np.array(js_list)

    print(f'{name} average: {np.mean(value_list):.3f}±{np.std(value_list):.3f}')
    print(f'{name} JS divergence: {np.mean(js_list):.3f}±{np.std(js_list):.3f}')
    print('=' * 40)

real average: 0.001
aaai average: 0.001±0.000
aaai JS divergence: 0.016±0.009
aaai_one_softmax average: 0.001±0.000
aaai_one_softmax JS divergence: 0.041±0.021
new_gene average: 0.001±0.000
new_gene JS divergence: 0.014±0.004
new_gene_one_softmax average: 0.001±0.000
new_gene_one_softmax JS divergence: 0.020±0.010
new_gene_exp_v1 average: 0.001±0.000
new_gene_exp_v1 JS divergence: 0.015±0.000
new_gene_exp_v2 average: 0.001±0.000
new_gene_exp_v2 JS divergence: 0.015±0.000
new_gene_exp_v3 average: 0.001±0.000
new_gene_exp_v3 JS divergence: 0.015±0.000
new_gene_exp_v4 average: 0.001±0.000
new_gene_exp_v4 JS divergence: 0.019±0.012
new_gene_exp_v5 average: 0.001±0.000
new_gene_exp_v5 JS divergence: 0.021±0.002
new_gene_exp_v6 average: 0.001±0.000
new_gene_exp_v6 JS divergence: 0.017±0.004
new_gene_exp_v7 average: 0.001±0.000
new_gene_exp_v7 JS divergence: 0.004±0.002
new_gene_exp_v1_one_softmax average: 0.001±0.000
new_gene_exp_v1_one_softmax JS divergence: 0.015±0.000
new_gene_exp_v2_one_

计算行驶总时间

In [10]:
time_cost_dict = dict()

real_time_cost = []
for _, row in real.iterrows():
    time_list = row['time_list'].split(',')
    start_time = datetime.strptime(time_list[0], '%Y-%m-%dT%H:%M:%SZ')
    end_time = datetime.strptime(time_list[-1], '%Y-%m-%dT%H:%M:%SZ')
    time_cost = (end_time-start_time).total_seconds() / 60
    real_time_cost.append(time_cost)

real_max_time_cost = 71.75
time_cost_bins = np.linspace(0, real_max_time_cost, 100).tolist()
time_cost_bins.append(float('inf'))
time_cost_bins = np.array(time_cost_bins)
real_time_cost_distribution, _ = np.histogram(real_time_cost, time_cost_bins)

time_cost_dict['real'] = real_time_cost

print(f'real average: {np.mean(real_time_cost):.3f}')
print('=' * 40)

for name, path_list in gene_data.items():
    value_list = []
    js_list = []

    for path in path_list:
        data = pd.read_csv(path)
        time_cost_list = []
        for _, row in data.iterrows():
            time_list = eval(row['time_list'])
            start_time = time_list[0]
            end_time = time_list[-1]
            time_cost = (end_time-start_time) % 2880
            time_cost_list.append(time_cost)

        value_list.append(np.mean(time_cost_list))

        time_cost_distribution, _ = np.histogram(time_cost_list, time_cost_bins)
        js_list.append(js_divergence(real_time_cost_distribution, time_cost_distribution))

        time_cost_dict[name] = time_cost_list

    value_list = np.array(value_list)
    js_list = np.array(js_list)

    print(f'{name} average: {np.mean(value_list):.3f}±{np.std(value_list):.3f}')
    print(f'{name} JS divergence: {np.mean(js_list):.3f}±{np.std(js_list):.3f}')
    print('=' * 40)

real average: 7.982
aaai average: 17.048±0.560
aaai JS divergence: 0.147±0.001
aaai_one_softmax average: 17.500±0.588
aaai_one_softmax JS divergence: 0.155±0.002
new_gene average: 15.111±0.443
new_gene JS divergence: 0.164±0.001
new_gene_one_softmax average: 14.460±0.353
new_gene_one_softmax JS divergence: 0.178±0.001
new_gene_exp_v1 average: 28.218±0.605
new_gene_exp_v1 JS divergence: 0.186±0.002
new_gene_exp_v2 average: 17.886±0.351
new_gene_exp_v2 JS divergence: 0.155±0.001
new_gene_exp_v3 average: 17.025±0.290
new_gene_exp_v3 JS divergence: 0.154±0.001
new_gene_exp_v4 average: 13.440±0.721
new_gene_exp_v4 JS divergence: 0.159±0.001
new_gene_exp_v5 average: 14.715±0.348
new_gene_exp_v5 JS divergence: 0.165±0.001
new_gene_exp_v6 average: 13.334±0.691
new_gene_exp_v6 JS divergence: 0.157±0.001
new_gene_exp_v7 average: 14.888±0.574
new_gene_exp_v7 JS divergence: 0.165±0.001
new_gene_exp_v1_one_softmax average: 27.521±0.317
new_gene_exp_v1_one_softmax JS divergence: 0.191±0.003
new_gene

In [11]:
# for name, time_cost_list in time_cost_dict.items():
#     sns.kdeplot(time_cost_list, label=name, fill=True)


# plt.legend()
# plt.show()

计算Hausdorff

In [12]:
for name, path_list in gene_data.items():
    value_list = []

    for path in path_list:
        data = pd.read_csv(path)
        hausdorff_list = []
        for i in range(data.shape[0]):
            real_rid_list = eval(real.loc[i, 'rid_list'])
            real_gps_list = [road_gps[rid][::-1] for rid in real_rid_list]
            real_gps_list = np.array(real_gps_list)

            gene_rid_list = eval(data.loc[i, 'rid_list'])
            gene_gps_list = [road_gps[rid][::-1] for rid in gene_rid_list]
            gene_gps_list = np.array(gene_gps_list)
            hausdorff_list.append(hausdorff.hausdorff_distance(real_gps_list, gene_gps_list, distance='haversine'))
        value_list.append(np.mean(hausdorff_list))

    print(f'{name}: {np.mean(value_list):.3f}±{np.std(value_list):.3f}')
    print('=' * 40)

aaai: 0.671±0.006
aaai_one_softmax: 0.663±0.011
new_gene: 0.627±0.004
new_gene_one_softmax: 0.533±0.010
new_gene_exp_v1: 1.269±0.007
new_gene_exp_v2: 0.775±0.002
new_gene_exp_v3: 0.887±0.010
new_gene_exp_v4: 0.773±0.001
new_gene_exp_v5: 0.743±0.002
new_gene_exp_v6: 0.774±0.002
new_gene_exp_v7: 0.742±0.004
new_gene_exp_v1_one_softmax: 1.295±0.015
new_gene_exp_v2_one_softmax: 0.726±0.021
new_gene_exp_v3_one_softmax: 0.351±0.004
new_gene_exp_v4_one_softmax: 0.751±0.007
new_gene_exp_v5_one_softmax: 0.713±0.008
new_gene_exp_v6_one_softmax: 0.752±0.007
new_gene_exp_v7_one_softmax: 0.712±0.008
new_gene_exp_v1_one_softmax_without_log: 1.297±0.008
new_gene_exp_v2_one_softmax_without_log: 0.740±0.012
new_gene_exp_v3_one_softmax_without_log: 0.371±0.010
new_gene_exp_v4_one_softmax_without_log: 0.751±0.007
new_gene_exp_v5_one_softmax_without_log: 0.719±0.004
new_gene_exp_v6_one_softmax_without_log: 0.752±0.007
new_gene_exp_v7_one_softmax_without_log: 0.719±0.005
new_gene_one_softmax_without_log: 0

计算DTW

In [13]:
def haversine(array_x, array_y):
    R = 6378.0
    radians = np.pi / 180.0
    lat_x = radians * array_x[0]
    lon_x = radians * array_x[1]
    lat_y = radians * array_y[0]
    lon_y = radians * array_y[1]
    dlon = lon_y - lon_x
    dlat = lat_y - lat_x
    a = (pow(math.sin(dlat/2.0), 2.0) + math.cos(lat_x) * math.cos(lat_y) * pow(math.sin(dlon/2.0), 2.0))
    return R * 2 * math.asin(math.sqrt(a))

for name, path_list in gene_data.items():
    value_list = []

    for path in path_list:
        data = pd.read_csv(path)
        dtw_list = []
        for i in range(data.shape[0]):
            pred_rid_list = eval(data.loc[i, 'rid_list'])
            real_rid_list = eval(real.loc[i, 'rid_list'])

            gene_gps_list = [road_gps[rid][::-1] for rid in pred_rid_list]
            real_gps_list = [road_gps[rid][::-1] for rid in real_rid_list]
            gene_gps_list = np.array(gene_gps_list)
            real_gps_list = np.array(real_gps_list)

            dtw_list.append(fastdtw(real_gps_list, gene_gps_list, dist=haversine)[0])
        value_list.append(np.mean(dtw_list))

    print(f'{name}: {np.mean(value_list):.3f}±{np.std(value_list):.3f}')
    print('=' * 40)

aaai: 18.238±0.222
aaai_one_softmax: 19.851±0.712
new_gene: 16.433±0.093
new_gene_one_softmax: 13.723±0.272
new_gene_exp_v1: 70.941±0.622
new_gene_exp_v2: 22.190±0.139
new_gene_exp_v3: 27.984±0.399
new_gene_exp_v4: 25.629±0.138
new_gene_exp_v5: 24.524±0.137
new_gene_exp_v6: 25.654±0.141
new_gene_exp_v7: 24.387±0.163
new_gene_exp_v1_one_softmax: 73.830±1.084
new_gene_exp_v2_one_softmax: 20.475±0.518
new_gene_exp_v3_one_softmax: 8.735±0.073
new_gene_exp_v4_one_softmax: 25.190±0.427
new_gene_exp_v5_one_softmax: 24.054±0.389
new_gene_exp_v6_one_softmax: 25.059±0.435
new_gene_exp_v7_one_softmax: 23.913±0.429
new_gene_exp_v1_one_softmax_without_log: 74.091±0.696
new_gene_exp_v2_one_softmax_without_log: 21.051±0.310
new_gene_exp_v3_one_softmax_without_log: 9.506±0.342
new_gene_exp_v4_one_softmax_without_log: 25.066±0.259
new_gene_exp_v5_one_softmax_without_log: 23.984±0.213
new_gene_exp_v6_one_softmax_without_log: 24.996±0.231
new_gene_exp_v7_one_softmax_without_log: 23.844±0.218
new_gene_one

计算EDR

In [14]:
rad = math.pi / 180.0
R = 6378137.0

def great_circle_distance(lon1, lat1, lon2, lat2):
    dlat = rad * (lat2 - lat1)
    dlon = rad * (lon2 - lon1)
    a = (math.sin(dlat / 2.0) * math.sin(dlat / 2.0) +
         math.cos(rad * lat1) * math.cos(rad * lat2) *
         math.sin(dlon / 2.0) * math.sin(dlon / 2.0))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = R * c
    return d

def edr(t0, t1, eps):
    n0 = len(t0)
    n1 = len(t1)
    C = np.full((n0 + 1, n1 + 1), np.inf)
    C[:, 0] = np.arange(n0 + 1)
    C[0, :] = np.arange(n1 + 1)

    for i in range(1, n0 + 1):
        for j in range(1, n1 + 1):
            if great_circle_distance(t0[i - 1][0], t0[i - 1][1], t1[j - 1][0], t1[j - 1][1]) < eps:
                subcost = 0
            else:
                subcost = 1
            C[i][j] = min(C[i][j - 1] + 1, C[i - 1][j] + 1, C[i - 1][j - 1] + subcost)
    edr = float(C[n0][n1]) / max([n0, n1])
    return edr

for name, path_list in gene_data.items():
    value_list = []

    for path in path_list:
        data = pd.read_csv(path)
        edr_list = []
        for i in range(data.shape[0]):
            gene_rid_list = eval(data.loc[i, 'rid_list'])
            real_rid_list = eval(real.loc[i, 'rid_list'])

            gene_gps_list = [road_gps[rid][::-1] for rid in gene_rid_list]
            real_gps_list = [road_gps[rid][::-1] for rid in real_rid_list]
            gene_gps_list = np.array(gene_gps_list)
            real_gps_list = np.array(real_gps_list)

            edr_list.append(edr(real_gps_list, gene_gps_list, 100))
        value_list.append(np.mean(edr_list))

    print(f'{name}: {np.mean(value_list):.3f}±{np.std(value_list):.3f}')
    print('=' * 40)

aaai: 0.541±0.003
aaai_one_softmax: 0.493±0.003
new_gene: 0.499±0.002
new_gene_one_softmax: 0.419±0.005
new_gene_exp_v1: 0.738±0.001
new_gene_exp_v2: 0.592±0.002
new_gene_exp_v3: 0.439±0.003
new_gene_exp_v4: 0.547±0.002
new_gene_exp_v5: 0.541±0.001
new_gene_exp_v6: 0.551±0.002
new_gene_exp_v7: 0.543±0.002
new_gene_exp_v1_one_softmax: 0.736±0.001
new_gene_exp_v2_one_softmax: 0.501±0.005
new_gene_exp_v3_one_softmax: 0.270±0.004
new_gene_exp_v4_one_softmax: 0.528±0.005
new_gene_exp_v5_one_softmax: 0.524±0.001
new_gene_exp_v6_one_softmax: 0.531±0.004
new_gene_exp_v7_one_softmax: 0.526±0.001
new_gene_exp_v1_one_softmax_without_log: 0.738±0.001
new_gene_exp_v2_one_softmax_without_log: 0.551±0.002
new_gene_exp_v3_one_softmax_without_log: 0.271±0.005
new_gene_exp_v4_one_softmax_without_log: 0.530±0.005
new_gene_exp_v5_one_softmax_without_log: 0.525±0.002
new_gene_exp_v6_one_softmax_without_log: 0.533±0.005
new_gene_exp_v7_one_softmax_without_log: 0.527±0.002
new_gene_one_softmax_without_log: 0