# Dataset analysis



In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
# Load the autoreload extension and switch it to mode 2 so that edits made to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path
import sys
# Put the parent of the current working directory first on the import path
# (presumably so the project-local `common` package can be imported from this notebook).
sys.path.insert(0, Path.cwd().parent.as_posix())

In [21]:
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from common.dataset import TrainvalFilesDataset, TransformedDataset

In [68]:
# Wrap the raw train/val file dataset so that inputs pass through untouched and
# every target is shifted down by one (presumably turning 1-based labels into 0-based ones).
dataset = TransformedDataset(
    TrainvalFilesDataset("/home/fast_storage/imaterialist-challenge-furniture-2018/train_400x400/"),
    transforms=lambda sample: sample,
    target_transforms=lambda target: target - 1,
)

In [69]:
n_samples = len(dataset)
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24; it was simply
# the builtin `int`, so passing `int` yields exactly the same dtype.
# `indices` is kept 2-D, (n_samples, 1), because it is later handed to the sampler
# in the position of the feature matrix.
indices = np.zeros((n_samples, 1), dtype=int)
# One label slot per sample.
y = np.zeros((n_samples,), dtype=int)

In [70]:
# Walk the dataset once, recording for every sample its own position and its
# label (the element at position 1 of each datapoint).
for sample_idx, datapoint in enumerate(dataset):
    indices[sample_idx, 0] = sample_idx
    y[sample_idx] = datapoint[1]

In [71]:
# Count how many samples carry each label: position k holds the count for label k.
y_counts = np.bincount(y)

In [72]:
# Show the per-class sample counts before any resampling.
y_counts

array([1231, 1481, 2352, 1458, 1577, 1092, 1581, 1349,  465, 1962, 1726,
       2554, 1583, 1854, 1039, 1335, 1459, 1517,  842, 3922, 2416, 1365,
       1178, 1761,  524, 1535, 2164, 1666, 1409, 1149, 2067, 1684, 1219,
        715,  641, 1555, 2221, 2281, 1094, 2016,  619, 3889, 1711, 1429,
       2252, 1481,  729, 1270, 1841, 1261, 1000, 1153, 1274, 1768, 2142,
       1709,  691,  781, 1401, 1656, 1899,  675, 1732, 1928, 1143,  338,
       1057, 2210, 1224, 1330, 1224, 1387, 2243,  604, 1772, 2112,  542,
       1989, 1494, 1482, 1570, 1262,  322, 1254,  616,  664, 1649, 1221,
       2312, 1221, 2062, 2626, 2280, 1280, 1146, 1292, 1060, 1823, 1909,
       1371, 1852, 1330, 2179, 2006,  833,  863, 1720, 1432,  742,  810,
       1827, 1730, 1356,  727,  803, 1866, 2206, 1744, 1062, 2031,  430,
       2433, 1956,  411, 2566, 1922,  729, 1273])

In [73]:
# Upper bound on the number of samples retained for any single class.
MAX_SAMPLES_PER_CLASS = 1500

# Target sample count per label: classes above the cap are cut down to it,
# classes at or below the cap keep all of their samples.
desired_ratio = {
    label: min(count, MAX_SAMPLES_PER_CLASS)
    for label, count in enumerate(y_counts)
}

In [74]:
# The `ratio` keyword was deprecated in imbalanced-learn 0.4 in favour of
# `sampling_strategy` and dropped in later releases. Prefer the current keyword
# and fall back to the legacy one when the installed version does not know it
# (an unknown keyword argument raises TypeError at construction time).
# The fixed random_state keeps the under-sampling reproducible.
try:
    rs = RandomUnderSampler(sampling_strategy=desired_ratio, random_state=12345)
except TypeError:
    rs = RandomUnderSampler(ratio=desired_ratio, random_state=12345)

In [75]:
# `fit_sample` was superseded by `fit_resample` in newer imbalanced-learn releases
# and is no longer available in recent ones; use whichever the installed sampler provides.
# The sample positions are passed as the "features", so the first return value
# holds the positions of the samples that were kept.
_resample = rs.fit_resample if hasattr(rs, "fit_resample") else rs.fit_sample
resampled_indices, y_resampled = _resample(indices, y)

In [76]:
# Per-class sample counts after under-sampling.
y_resampled_counts = np.bincount(y_resampled)

In [77]:
# Show the resampled counts; with per-class targets of min(count, 1500),
# no class should exceed 1500 samples.
y_resampled_counts

array([1231, 1481, 1500, 1458, 1500, 1092, 1500, 1349,  465, 1500, 1500,
       1500, 1500, 1500, 1039, 1335, 1459, 1500,  842, 1500, 1500, 1365,
       1178, 1500,  524, 1500, 1500, 1500, 1409, 1149, 1500, 1500, 1219,
        715,  641, 1500, 1500, 1500, 1094, 1500,  619, 1500, 1500, 1429,
       1500, 1481,  729, 1270, 1500, 1261, 1000, 1153, 1274, 1500, 1500,
       1500,  691,  781, 1401, 1500, 1500,  675, 1500, 1500, 1143,  338,
       1057, 1500, 1224, 1330, 1224, 1387, 1500,  604, 1500, 1500,  542,
       1500, 1494, 1482, 1500, 1262,  322, 1254,  616,  664, 1500, 1221,
       1500, 1221, 1500, 1500, 1500, 1280, 1146, 1292, 1060, 1500, 1500,
       1371, 1500, 1330, 1500, 1500,  833,  863, 1500, 1432,  742,  810,
       1500, 1500, 1356,  727,  803, 1500, 1500, 1500, 1062, 1500,  430,
       1500, 1500,  411, 1500, 1500,  729, 1273])

In [79]:
# Flatten the single-column index array to 1-D so it can be used for indexing.
# Unlike `[:, 0]`, `reshape(-1)` is a no-op on an already flat array, so this
# cell can be re-run without raising IndexError.
resampled_indices = np.asarray(resampled_indices).reshape(-1)

In [80]:
# Look up the original labels at the selected sample positions; this serves as
# a cross-check against the labels returned by the sampler (`y_resampled`).
y_resampled2 = y[resampled_indices]

In [81]:
# Total number of samples kept after under-sampling.
y_resampled2.shape

(160644,)

In [83]:
# Class histogram of the labels re-derived from the kept indices; it should
# match `y_resampled_counts` if the indices map back to the right samples.
np.bincount(y_resampled2)

array([1231, 1481, 1500, 1458, 1500, 1092, 1500, 1349,  465, 1500, 1500,
       1500, 1500, 1500, 1039, 1335, 1459, 1500,  842, 1500, 1500, 1365,
       1178, 1500,  524, 1500, 1500, 1500, 1409, 1149, 1500, 1500, 1219,
        715,  641, 1500, 1500, 1500, 1094, 1500,  619, 1500, 1500, 1429,
       1500, 1481,  729, 1270, 1500, 1261, 1000, 1153, 1274, 1500, 1500,
       1500,  691,  781, 1401, 1500, 1500,  675, 1500, 1500, 1143,  338,
       1057, 1500, 1224, 1330, 1224, 1387, 1500,  604, 1500, 1500,  542,
       1500, 1494, 1482, 1500, 1262,  322, 1254,  616,  664, 1500, 1221,
       1500, 1221, 1500, 1500, 1500, 1280, 1146, 1292, 1060, 1500, 1500,
       1371, 1500, 1330, 1500, 1500,  833,  863, 1500, 1432,  742,  810,
       1500, 1500, 1356,  727,  803, 1500, 1500, 1500, 1062, 1500,  430,
       1500, 1500,  411, 1500, 1500,  729, 1273])