In [1]:
import math
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
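# Disabled earlier grid configuration (kept for reference); the active
# values of these names are assigned in a later cell.
# minX,minY,maxX,maxY = 113.175050,22.800400,113.617000,23.676500
# # longitude/latitude bounds
# ROW_NUM = 11
# COL_NUM = 11
# # width and height of one region
# REGION_WIDTH = (maxX-minX)/COL_NUM
# REGION_HEIGHT = (maxY-minY)/ROW_NUM
# REGION_WIDTH,REGION_HEIGHT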

In [3]:
# Map a coordinate to the id of the grid region that contains it.
def getRegionNum(location):
    """Return the region id of a ``(longitude, latitude)`` pair.

    The grid is numbered row by row: ``id = row * COL_NUM + col``, where
    ``col`` and ``row`` are the floored offsets of the point from
    ``(minX, minY)`` measured in cells of ``REGION_WIDTH`` x ``REGION_HEIGHT``.
    All five of those names are read from module scope.

    No bounds check is performed; a point outside the grid yields an id
    outside ``0 .. ROW_NUM*COL_NUM - 1``.
    """
    lon, lat = location
    col = math.floor((lon - minX) / REGION_WIDTH)
    row = math.floor((lat - minY) / REGION_HEIGHT)
    return row * COL_NUM + col

# Add two columns, pickup_region and dropoff_region, holding the region id of
# each trip's start point and end point.
def add_region(df):
    """Annotate ``df`` in place with the region ids of both trip endpoints.

    For each of the ``pickup`` and ``dropoff`` endpoints, the
    ``<endpoint>_longitude`` / ``<endpoint>_latitude`` columns are passed
    row by row to ``getRegionNum`` and the result is stored in a new
    ``<endpoint>_region`` column. Returns ``None``.
    """
    # Compute both region series first, then attach them, so ``df`` is only
    # modified once every row has been mapped.
    regions = {
        endpoint: df[[endpoint + '_longitude', endpoint + '_latitude']].apply(getRegionNum, axis=1)
        for endpoint in ('pickup', 'dropoff')
    }
    for endpoint, region_ids in regions.items():
        df[endpoint + '_region'] = region_ids

# Sample the destination region of a request that starts in origin_rid.
def getDestRegionId(origin_rid):
    """Draw a destination region id for a trip starting in ``origin_rid``.

    The row ``probability_df.loc[origin_rid]`` (module-level transition
    probabilities) is treated as a discrete distribution over destination
    regions and sampled by inverse-CDF: one uniform draw in ``[0, 1)`` is
    compared against the cumulative sum of the positive probabilities.

    Parameters
    ----------
    origin_rid : row label of ``probability_df``
        Region the trip starts in. Use ``.loc`` (label lookup), not
        ``probability_df[origin_rid]``, which would select a column.

    Returns
    -------
    The index label of the sampled destination region.

    Raises
    ------
    ValueError
        If the origin row contains no positive probability (for example a
        region with no recorded pickups), so nothing can be sampled.
    """
    row = probability_df.loc[origin_rid]
    cumulative = row[row > 0].cumsum()
    if cumulative.empty:
        raise ValueError(
            'region {!r} has no positive transition probability'.format(origin_rid)
        )

    alpha = np.random.random()
    # First position whose cumulative probability is >= alpha.
    position = int(np.searchsorted(cumulative.values, alpha, side='left'))
    # Rounding can leave the final cumulative value slightly below 1, so a
    # draw above it would find no match; fall back to the last candidate.
    position = min(position, len(cumulative) - 1)
    return cumulative.index[position]

# Bounding box of a region.
def getRegionRange(region_id):
    """Return ``(min_long, min_lat, max_long, max_lat)`` for ``region_id``.

    The id is decoded as ``row * COL_NUM + col``; the box is then offset
    from ``(minX, minY)`` by whole cells of ``REGION_WIDTH`` x
    ``REGION_HEIGHT`` (all read from module scope).
    """
    row = math.floor(region_id / COL_NUM)
    col = region_id % COL_NUM
    west = minX + col*REGION_WIDTH
    east = minX + (col+1)*REGION_WIDTH
    south = minY + row*REGION_HEIGHT
    north = minY + (row+1)*REGION_HEIGHT
    return west, south, east, north

# Generate a coordinate inside a region.
def generateCoordinate(region_id):
    """Return a random ``(longitude, latitude)`` inside region ``region_id``.

    Each component is drawn independently with ``np.random.uniform`` from
    the region's bounding box as reported by ``getRegionRange``. Longitude
    is drawn first, then latitude.
    """
    west, south, east, north = getRegionRange(region_id)
    # Tuple elements are evaluated left to right: longitude, then latitude.
    return np.random.uniform(west, east), np.random.uniform(south, north)

# Generate synthetic requests.
def generateRequests(total_num):
    """Generate ``total_num`` synthetic trip requests as a DataFrame.

    Origin regions are picked uniformly at random among the regions in the
    module-level ``pickup_counts``; regions with a zero count are skipped.
    Each pick contributes a Poisson-distributed batch of requests with mean
    ``ceil(pickup_counts[region] / 30)``. For every request a pickup point is
    drawn inside its origin region, a destination region is sampled with
    ``getDestRegionId`` and a dropoff point is drawn inside it.

    NOTE(review): the divisor 30 is an unexplained constant in the original
    code - confirm what it is meant to scale by.

    Parameters
    ----------
    total_num : int
        Number of requests to generate.

    Returns
    -------
    pandas.DataFrame
        Columns ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude``; rows shuffled,
        index reset, values rounded to 5 decimals.

    Raises
    ------
    ValueError
        If requests are asked for but no region has a positive pickup count
        (the sampling loop could never make progress).
    """
    columns = ['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']

    # Derive the number of regions from the data instead of hard-coding 121.
    region_count = len(pickup_counts)
    if total_num > 0 and not (pickup_counts > 0).any():
        raise ValueError('pickup_counts has no region with a positive count')

    # Origin region of every request; integer dtype so the ids stay valid
    # labels for the lookups done downstream.
    origin_regions = np.zeros(total_num, dtype=int)
    filled = 0
    while filled < total_num:
        # Pick a candidate origin region.
        origin_rid = np.random.randint(0, region_count)
        demand = pickup_counts[origin_rid]
        if demand == 0:
            continue
        # Number of requests generated in this region (Poisson distributed).
        batch = np.random.poisson(np.ceil(demand / 30))
        if batch == 0:
            continue
        # Never overshoot the requested total.
        batch = min(batch, total_num - filled)
        origin_regions[filled:filled + batch] = origin_rid
        filled += batch

    # Draw the pickup and dropoff coordinates of every request.
    coords = np.zeros((total_num, 4))
    for i, origin_rid in enumerate(origin_regions):
        pickup_longitude, pickup_latitude = generateCoordinate(origin_rid)
        dest_rid = getDestRegionId(origin_rid)
        dropoff_longitude, dropoff_latitude = generateCoordinate(dest_rid)
        coords[i] = [pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude]

    df = pd.DataFrame(coords, columns=columns)
    # Shuffle so requests from the same batch are not adjacent.
    df = df.sample(frac=1).reset_index(drop=True)
    df = df.round(5)
    return df

# Transition matrix: transform_matrix[2, 0] = 1 means one taxi trip went from
# region 2 to region 0.
def computeProbability(df):
    """Build per-region pickup counts and region-to-region probabilities.

    ``add_region`` is called on ``df`` first, which adds the
    ``pickup_region`` / ``dropoff_region`` columns to it in place. Trips are
    then counted per (pickup region, dropoff region) pair into a square
    matrix with ``ROW_NUM * COL_NUM`` rows and columns.

    Returns
    -------
    (pickup_counts, probability_df)
        ``pickup_counts`` is the row sum of the count matrix (trips starting
        in each region). ``probability_df`` is the count matrix with every
        row divided by its own sum; rows whose sum is zero come out as NaN.
    """
    add_region(df)
    region_total = ROW_NUM*COL_NUM
    trip_counts = np.zeros((region_total, region_total))
    pair_counts = df.groupby(by='pickup_region')['dropoff_region'].value_counts()
    for (origin, destination), trips in pair_counts.items():
        trip_counts[origin, destination] = trips
    transform_df = pd.DataFrame(trip_counts)
    # Number of trips that start in each region.
    pickup_counts = transform_df.sum(axis=1)
    # Row-normalise the counts into transition probabilities.
    probability_df = transform_df.div(pickup_counts, axis=0)
    return pickup_counts, probability_df


In [4]:
# Load the raw trips (comma-separated, no header row) and summarise each column.
df = pd.read_csv(
    './Guangzhou.txt',
    header=None,
    names=['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],
)
df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,962.0,962.0,962.0,962.0
mean,113.301773,23.154476,113.301209,23.152286
std,0.053901,0.075335,0.054882,0.074524
min,113.17505,22.8005,113.17505,22.8004
25%,113.262425,23.11794,113.262043,23.1177
50%,113.29491,23.139965,113.29291,23.1386
75%,113.33075,23.195407,113.327978,23.193683
max,113.55674,23.6764,113.616,23.67171


In [5]:
# Lower-left corner of the study grid (minimum longitude, minimum latitude).
minX, minY = 113.222425, 23.097940
# Number of grid rows and columns.
ROW_NUM = 11
COL_NUM = 11
# Width and height of one region, in the same units as the coordinates.
# Alternative cell size kept from an earlier attempt:
# REGION_WIDTH = 0.010108667
# REGION_HEIGHT = 0.0093325
REGION_WIDTH = 0.01008667
REGION_HEIGHT = 0.01008667
# Upper-right corner follows from the origin plus the full grid extent.
maxX = minX + COL_NUM*REGION_WIDTH
maxY = minY + ROW_NUM*REGION_HEIGHT
print(maxX,maxY)

df = pd.read_csv(
    './Guangzhou.txt',
    header=None,
    names=['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],
)
print(df.shape)
# Keep only trips whose pickup and dropoff both lie strictly inside the grid.
df = df[
    df['pickup_longitude'].gt(minX) & df['pickup_longitude'].lt(maxX)
    & df['dropoff_longitude'].gt(minX) & df['dropoff_longitude'].lt(maxX)
    & df['pickup_latitude'].gt(minY) & df['pickup_latitude'].lt(maxY)
    & df['dropoff_latitude'].gt(minY) & df['dropoff_latitude'].lt(maxY)
]
df.shape

113.33337837 23.208893370000002
(962, 4)


(355, 4)

In [27]:
# Squared straight-line displacement between pickup and dropoff, in squared
# coordinate units (despite the name, this is not the distance itself).
dist = (
    np.power(df['pickup_longitude'] - df['dropoff_longitude'], 2)
    + np.power(df['pickup_latitude'] - df['dropoff_latitude'], 2)
)
# Keep trips whose squared displacement exceeds 0.00040 (i.e. 0.02 squared);
# copy so later column additions do not touch df.
df1 = df.loc[dist > 0.00040].copy()
df1.shape

(193, 4)

In [28]:
# Build the per-region pickup counts and the region-to-region transition
# probabilities from the filtered trips. computeProbability also adds the
# pickup_region / dropoff_region columns to df1 in place (via add_region).
pickup_counts,probability_df = computeProbability(df1)
# Show how many of the filtered trips start in each region.
df1['pickup_region'].value_counts()

38    9
49    8
30    6
43    6
41    6
     ..
32    1
34    1
55    1
63    1
87    1
Name: pickup_region, Length: 77, dtype: int64

In [29]:
# Write ten synthetic scenarios: each has 600 requests saved under
# passengers_requests/ and 300 requests saved under drivers_requests/.
# NOTE(review): the base directory is an absolute, machine-specific path and
# no random seed is set, so re-running produces different files.
OUTPUT_DIR = Path('F:/yuan/carpooling/Guangzhou')
PASSENGER_DIR = OUTPUT_DIR / 'passengers_requests'
DRIVER_DIR = OUTPUT_DIR / 'drivers_requests'
# to_csv does not create missing directories, so make sure they exist first.
for out_dir in (PASSENGER_DIR, DRIVER_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

for i in range(1,11):
    p_df = generateRequests(600)
    d_df = generateRequests(300)
    # Space-separated, no header row and no index column.
    p_df.to_csv(PASSENGER_DIR / 'requests_600_{}.txt'.format(i), sep=' ', header=False, index=False)
    d_df.to_csv(DRIVER_DIR / 'requests_300_{}.txt'.format(i), sep=' ', header=False, index=False)