# 基于轨迹数据的聚类
## 1.读取数据

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN


#DBSCAN聚类
gpsdata = pd.read_csv('trajectory_res.csv')

In [3]:
gpsdata.head(15)

Unnamed: 0,IMSI,o_longitude,o_latitude,d_longitude,d_latitude
0,460030084404532,118.800018,32.002331,118.800034,32.00206
1,460030084404532,118.800034,32.00206,118.799751,32.002109
2,460030084404532,118.799751,32.002109,118.800133,32.0023
3,460030084404532,118.800133,32.0023,118.800613,32.00235
4,460030084404532,118.800613,32.00235,118.799896,32.002239
5,460030084404532,118.799896,32.002239,118.800484,32.002312
6,460030084404532,118.800484,32.002312,118.799942,32.00219
7,460030084404532,118.799942,32.00219,118.800568,32.002331
8,460030084404532,118.800568,32.002331,118.800087,32.002232
9,460030084404532,118.800087,32.002232,118.80027,32.002201


## 2.计算轨迹的弧度制
用于之后计算haversine距离

In [5]:
#计算轨迹OD的haversine距离
# from sklearn.metrics.pairwise import haversine_distances
from math import radians

star = gpsdata[['o_longitude','o_latitude']]
star_rad = []
end = gpsdata[['d_longitude','d_latitude']]
end_rad = []
for i in range(len(star)):
    star_rad.append([radians(_) for _ in star.iloc[i]])
    end_rad.append([radians(_) for _ in end.iloc[i]])

In [7]:
star[1:5]

Unnamed: 0,o_longitude,o_latitude
1,118.800034,32.00206
2,118.799751,32.002109
3,118.800133,32.0023
4,118.800613,32.00235


In [8]:
star_rad[1:5]

[[2.0734517377998922, 0.5585413126754473],
 [2.0734468107354136, 0.5585421783587562],
 [2.0734534691665103, 0.5585455084469692],
 [2.0734618572188954, 0.5585463741302781]]

## 3.使用DBSCAN聚类

In [10]:
#确定密度半径

# earth's radius in km
kms_per_radian = 6371.0088
# define epsilon as 0.5 kilometers, converted to radians for use by haversine
epsilon = 0.1 / kms_per_radian

In [12]:
#DBSCAN聚类
# min_samples is the minimum cluster size (everything else is classified as noise)
db = DBSCAN(eps=epsilon, min_samples=2, algorithm='ball_tree', metric='haversine').fit(star_rad,end_rad)

In [13]:
db

DBSCAN(algorithm='ball_tree', eps=1.5696101377226163e-05, leaf_size=30,
       metric='haversine', metric_params=None, min_samples=2, n_jobs=None,
       p=None)

In [14]:
#类别标签
cluster_labels = db.labels_
# get the number of clusters (ignore noisy samples which are given the label -1)
#去除噪声，类别为-1
num_clusters = len(set(cluster_labels) - set([-1]))
pd_obj2 = pd.DataFrame(cluster_labels)
print ('Clustered ' + str(len(gpsdata)) + ' points to ' + str(num_clusters) + ' clusters')

Clustered 4434 points to 27 clusters


## 4.将聚类数据储存

In [16]:
cluster_frame = pd.concat([gpsdata,pd.DataFrame(cluster_labels)],axis=1)
# turn the clusters in to a pandas series
clusters = pd.Series([gpsdata.iloc[cluster_labels == n] for n in range(num_clusters)])
cluster_frame.to_csv('cluster.csv')

In [17]:
cluster_frame.head(15)

Unnamed: 0,IMSI,o_longitude,o_latitude,d_longitude,d_latitude,0
0,460030084404532,118.800018,32.002331,118.800034,32.00206,0
1,460030084404532,118.800034,32.00206,118.799751,32.002109,0
2,460030084404532,118.799751,32.002109,118.800133,32.0023,0
3,460030084404532,118.800133,32.0023,118.800613,32.00235,0
4,460030084404532,118.800613,32.00235,118.799896,32.002239,0
5,460030084404532,118.799896,32.002239,118.800484,32.002312,0
6,460030084404532,118.800484,32.002312,118.799942,32.00219,0
7,460030084404532,118.799942,32.00219,118.800568,32.002331,0
8,460030084404532,118.800568,32.002331,118.800087,32.002232,0
9,460030084404532,118.800087,32.002232,118.80027,32.002201,0
