datasets.py
import cv2
import numpy as np
from glob import glob
from os.path import join
from sklearn.datasets import make_moons
from google_drive_downloader import GoogleDriveDownloader


def gaussians_dataset(n_gaussian, n_points, mus, stds):
    """
    Provides a dataset made of several gaussians.

    Parameters
    ----------
    n_gaussian : int
        The number of desired gaussian components.
    n_points : list
        A list of cardinalities (number of points for each gaussian).
    mus : list
        A list of means (one for each gaussian, e.g. [[1, 1], [3, 1]]).
    stds : list
        A list of stds (one for each gaussian, e.g. [[1, 1], [2, 2]]).

    Returns
    -------
    tuple
        A tuple like (X_train, Y_train, X_test, Y_test), where the X arrays
        have shape (n_samples, dims) and the Y arrays have shape (n_samples,).
    """
    assert n_gaussian == len(mus) == len(stds) == len(n_points)

    X = []
    Y = []
    for i in range(0, n_gaussian):
        mu = mus[i]
        std = stds[i]
        n_pt = n_points[i]

        cov = np.diag(std)

        # draw 2 * n_pt samples so that train and test each end up with n_pt of them
        X.append(np.random.multivariate_normal(mu, cov, size=2 * n_pt))
        Y.append(np.ones(shape=2 * n_pt) * i)

    X = np.concatenate(X, axis=0)
    Y = np.concatenate(Y, axis=0)

    # shuffle data and labels together, with a fixed seed for reproducibility
    tot = np.concatenate((X, np.reshape(Y, newshape=(-1, 1))), axis=-1)
    np.random.seed(30101990)
    np.random.shuffle(tot)
    X = tot[:, :-1]
    Y = tot[:, -1]

    # 50/50 train / test split
    n_train_samples = X.shape[0] // 2

    X_train = X[:n_train_samples]
    Y_train = Y[:n_train_samples]
    X_test = X[n_train_samples:]
    Y_test = Y[n_train_samples:]

    return X_train, Y_train, X_test, Y_test
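
# --- Usage sketch (illustrative; not part of the original module) ---
# The means, stds and cardinalities below are arbitrary example values:
#
#   X_train, Y_train, X_test, Y_test = gaussians_dataset(
#       n_gaussian=3, n_points=[100, 100, 100],
#       mus=[[1, 1], [-4, 6], [8, 8]], stds=[[1, 1], [3, 3], [1, 1]])
#   print(X_train.shape, Y_train.shape)   # (300, 2) (300,)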


def two_moon_dataset(n_samples=100, shuffle=True, noise=None, random_state=None):
    """
    Make two interleaving half circles.

    A simple toy dataset to visualize clustering and classification algorithms.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The total number of points generated.
    shuffle : bool, optional (default=True)
        Whether to shuffle the samples.
    noise : double or None (default=None)
        Standard deviation of gaussian noise added to the data.
        See the scikit-learn user guide on sample generators for details.
    random_state : int, RandomState instance or None (default=None)
        Seed controlling the random generation.

    Returns
    -------
    tuple
        A tuple like (X_train, Y_train, X_test, Y_test), where the X arrays
        have shape [n_samples, 2] and the Y arrays hold the integer labels
        (0 or 1) for class membership of each sample, with shape [n_samples].
    """
    # train and test sets come from two separate calls to make_moons
    X_train, Y_train = make_moons(n_samples=n_samples, shuffle=shuffle,
                                  noise=noise, random_state=random_state)
    X_test, Y_test = make_moons(n_samples=n_samples, shuffle=shuffle,
                                noise=noise, random_state=random_state)

    return X_train, Y_train, X_test, Y_test
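
# --- Usage sketch (illustrative; not part of the original module) ---
# Train and test are two separate make_moons draws; note that passing a
# fixed random_state makes both calls return the same points. noise=0.1
# is just an example value.
#
#   X_train, Y_train, X_test, Y_test = two_moon_dataset(n_samples=300, noise=0.1)
#   print(X_train.shape, Y_test.shape)    # (300, 2) (300,)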


def people_dataset(data_path, train_split=60, overwrite=False):
    """
    Loads data for people vs non-people classification.

    Parameters
    ----------
    data_path : str
        The dataset root folder.
    train_split : int
        Percentage of points for the training set (default is 60%).
    overwrite : bool
        Whether to re-download the archive even if it is already present (default is False).

    Returns
    -------
    tuple
        A tuple like (X_img_train, X_feat_train, Y_train, X_img_test, X_feat_test, Y_test).
    """
    # download and extract the dataset archive from Google Drive
    GoogleDriveDownloader.download_file_from_google_drive(file_id='1hM_kk3ys2YnaZbIBwwdXAMhJm4j9KaKI',
                                                          dest_path='./data/svm.zip',
                                                          unzip=True, overwrite=overwrite)

    X_img = []
    Y = []
    X_feat = []
    for l, c in enumerate(['non_people', 'people']):
        # load grayscale images and pre-computed features for each class
        img_list = glob(join(data_path, c, '*.pgm'))
        X_img.append(np.array([cv2.imread(img, cv2.IMREAD_GRAYSCALE) for img in img_list]))
        Y.append(np.ones(shape=len(img_list)) * l)
        X_feat.append(np.load(join(data_path, c + '.npy')))

    X_img = np.concatenate(X_img, axis=0)
    X_feat = np.concatenate(X_feat, axis=0)
    Y = np.concatenate(Y, axis=0)

    # shuffle images, features and labels with the same permutation
    idx = np.arange(0, X_img.shape[0])
    np.random.shuffle(idx)
    X_img = X_img[idx]
    X_feat = X_feat[idx]
    Y = Y[idx]

    # split according to the train_split percentage
    n_train_samples = X_img.shape[0] * train_split // 100

    X_img_train = X_img[:n_train_samples]
    X_feat_train = X_feat[:n_train_samples]
    Y_train = Y[:n_train_samples]

    X_img_test = X_img[n_train_samples:]
    X_feat_test = X_feat[n_train_samples:]
    Y_test = Y[n_train_samples:]

    return X_img_train, X_feat_train, Y_train, X_img_test, X_feat_test, Y_test
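
# --- Usage sketch (illustrative; not part of the original module) ---
# Assumes the downloaded archive extracts so that data_path contains the
# 'people/' and 'non_people/' image folders plus the 'people.npy' and
# 'non_people.npy' feature files; './data/svm' below is an assumed path,
# not taken from the original file.
#
#   (X_img_train, X_feat_train, Y_train,
#    X_img_test, X_feat_test, Y_test) = people_dataset('./data/svm')
#   print(X_img_train.shape, X_feat_train.shape, Y_train.shape)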