-
Notifications
You must be signed in to change notification settings - Fork 9
/
loader.py
139 lines (112 loc) · 4.44 KB
/
loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from joblib import Parallel, delayed
from sklearn.utils import gen_even_slices
import pandas as pd
import numpy as np
from ..trajectory_data import TrajectoryData
class TrajectoryLoader(object):
"""Base class for trajectory loaders.
"""
def load(self):
"""Loads trajectories according to the specific approach.
Returns
-------
data : :class:`trajminer.TrajectoryData`
A :class:`trajminer.TrajectoryData` containing the loaded dataset.
"""
pass
class CSVTrajectoryLoader(TrajectoryLoader):
"""A trajectory data loader from a CSV file.
Parameters
----------
file : str
The CSV file from which to read the data.
sep : str (default=',')
The CSV separator.
tid_col : str (default='tid')
The column in the CSV file corresponding to the trajectory IDs.
label_col : str (default='label')
The column in the CSV file corresponding to the trajectory labels. If
`None`, labels are not loaded.
lat : str (default='lat')
The column in the CSV file corresponding to the latitude of the
trajectory points. If both the `lat` and `lon` columns are present in
the file, they are included as a single new attribute in the loaded
dataset.
lon : str (default='lon')
The column in the CSV file corresponding to the longitude of the
trajectory points. If both the `lat` and `lon` columns are present in
the file, they are included as a single new attribute in the loaded
dataset.
drop_col : array-like (default=None)
List of columns to drop when reading the data from the file.
n_jobs : int (default=1)
The number of parallel jobs.
Examples
--------
>>> from trajminer.utils import CSVTrajectoryLoader
>>> loader = CSVTrajectoryLoader('my_data.csv')
>>> dataset = loader.load()
>>> dataset.get_attributes()
['poi', 'day', 'time']
"""
def __init__(self, file, sep=',', tid_col='tid', label_col='label',
lat='lat', lon='lon', drop_col=None, n_jobs=1):
self.file = file
self.sep = sep
self.tid_col = tid_col
self.label_col = label_col
self.lat = lat
self.lon = lon
self.drop_col = drop_col if drop_col is not None else []
self.n_jobs = n_jobs
def load(self):
cols = []
with open(self.file, 'r') as f:
cols = f.readline().replace('\n', '').split(self.sep)
for col in self.drop_col:
if col in cols:
cols.remove(col)
df = pd.read_csv(self.file, sep=self.sep, usecols=cols)
attributes = list(df.keys())
attributes.remove(self.tid_col)
if self.label_col and self.label_col != self.tid_col:
attributes.remove(self.label_col)
lat_lon = self.lat in attributes and self.lon in attributes
if lat_lon:
attributes.remove(self.lat)
attributes.remove(self.lon)
tids = sorted(df[self.tid_col].unique())
def load_tids(s):
ret = []
for idx in range(s.start, s.stop):
tid = tids[idx]
traj = df.loc[df[self.tid_col] == tid, attributes].values
if lat_lon:
loc = df.loc[df[self.tid_col] == tid,
[self.lat, self.lon]].values
new_traj = []
for i, _ in enumerate(loc):
point = list(traj[i])
point.append(loc[i])
new_traj.append(point)
traj = new_traj
ret.append(traj)
return ret
labels = None
func = delayed(load_tids)
data = Parallel(n_jobs=self.n_jobs, verbose=0)(
func(s) for s in gen_even_slices(len(tids), self.n_jobs))
data = np.concatenate(data)
if self.label_col:
labels = df \
.drop_duplicates(subset=[self.tid_col, self.label_col],
inplace=False) \
.sort_values(self.tid_col,
ascending=True,
inplace=False)[self.label_col].values
if lat_lon:
attributes.append('lat_lon')
return TrajectoryData(attributes=attributes,
data=data,
tids=tids,
labels=labels)