# Embrapa Wine Grape Instance Segmentation Dataset (WGISD)

## Why was the dataset created?

Embrapa WGISD (Wine Grape Instance Segmentation Dataset) was created to provide images and annotation to study
object detection and instance segmentation in image-based monitoring and field robotics for viticulture. It provides
instances from five different grape varieties taken on field. These instances shows variance in grape pose, illumination
and focus, including genetic and phenological variations as shape, color and compactness.

# Dataset Composition

In [1]:
varietals = ['CDY', 'CFR', 'CSV', 'SVB', 'SYH']

## How many instances of each type are there?

In [2]:
import os
import numpy as np

In [3]:
instances = {v: [] for v in varietals}

for dirname, dirnames, filenames in os.walk('.'):
    for filename in [f for f in filenames if f.endswith('.txt')]:
        for v in varietals:
            if filename.startswith(v):
                instances[v].append(filename[:-4])
        



In [4]:
n_vimages = {v: len(inst_v) for v, inst_v in instances.items()}
n_vimages

{'CDY': 65, 'CFR': 65, 'CSV': 57, 'SVB': 65, 'SYH': 48}

In [5]:
n_images = np.array([n for __, n in n_vimages.items()]).sum()
n_images

300

### Bounding boxes

In [6]:
n_iboxes = {v: {} for v in varietals}

for v in varietals:
    for ii in instances[v]:
        annot_file = ii + '.txt'
        bboxes = np.loadtxt(annot_file)
        n_iboxes[v][ii] = bboxes.shape[0]

In [7]:
n_vboxes = {v: np.array([n for ii, n in n_iboxes[v].items()]).sum() for v in varietals}
n_vboxes

{'CDY': 838, 'CFR': 1069, 'CSV': 640, 'SVB': 1313, 'SYH': 559}

### Masks

In [8]:
n_imasks = {v: {} for v in varietals}

for v in varietals:
    for ii in instances[v]:
        annot_file = ii + '.npz'
        if os.path.isfile(annot_file):
            masks = np.load(annot_file)['arr_0']
            n_imasks[v][ii] = masks.shape[2]

There is a mask for each bounding box in the masked images?

In [10]:
for v in varietals:
    for ii in n_imasks[v]:
        assert(n_imasks[v][ii] == n_iboxes[v][ii])

In [11]:
n_vmasks = {v: np.array([n for ii, n in n_imasks[v].items()]).sum() for v in varietals}
n_vmasks

{'CDY': 242, 'CFR': 460, 'CSV': 290, 'SVB': 586, 'SYH': 256}

In [12]:
import pandas as pd

In [14]:
n_vimages

{'CDY': 65, 'CFR': 65, 'CSV': 57, 'SVB': 65, 'SYH': 48}

In [24]:
df = pd.DataFrame(index=varietals, columns=['Images', 'BoxedBunches', 'MaskedBunches'])
for v, val in n_vimages.items():
    df.loc[v, 'Images'] = val
    df.loc[v, 'BoxedBunches'] = n_vboxes[v]
    df.loc[v, 'MaskedBunches'] = n_vmasks[v]
    
df

Unnamed: 0,Images,BoxedBunches,MaskedBunches
CDY,65,838,242
CFR,65,1069,460
CSV,57,640,290
SVB,65,1313,586
SYH,48,559,256


In [25]:
df.sum()

Images            300
BoxedBunches     4419
MaskedBunches    1834
dtype: int64

# Train/test split for masked

In [17]:
import random
random.seed(42)

In [18]:
random.shuffle(M)

In [19]:
M_train = M[0:n_train_m]
len(M_train)

100

In [20]:
M_test = M[n_train_m:]
len(M_test)

25

# Train/test split for bbox

In [21]:
random.shuffle(D)

In [22]:
n_diff = len(D)
n_diff

175

In [23]:
D_train = D[0: int(0.8 * n_diff)]
len(D_train)

140

In [24]:
D_test = D[int(0.8 * n_diff):]
len(D_test)

35

# General train/test

In [25]:
train = M_train + D_train
len(train)

240

In [26]:
test = M_test + D_test
len(test)

60

In [27]:
a = np.array([v for k, v in bboxes.items() if k in train])
n_train_bboxes = a.sum()
n_train_bboxes

3555

In [28]:
a = np.array([v for k, v in bboxes.items() if k in test])
n_test_bboxes = a.sum()
n_test_bboxes

843

In [29]:
n_train_bboxes + n_test_bboxes

4398

In [30]:
a = np.array([v for k, v in bboxes.items() if k in M_train])
n_train_bboxes_m = a.sum()
n_train_bboxes_m

1479

In [31]:
a = np.array([v for k, v in bboxes.items() if k in M_test])
n_test_bboxes_m = a.sum()
n_test_bboxes_m

355

In [32]:
n_train_bboxes_m + n_test_bboxes_m

1834

In [33]:
1898 - 1834

64

In [34]:
cd masks/train

/media/thiago/st_expansion_3tb/01.14.09.001.05.04-frutifuturo/grape-v3/masks/train


In [35]:
len(M)

125

In [36]:
n_masks = {}
for i in M:
    src = i + '.npz'
    print(src)
    mask = np.load(src)['arr_0']
    n_masks[i] = mask.shape[2]
    if  n_masks[i] != bboxes[i]:
        print('Error in %s: %d != %d' % (i, n_masks[i], bboxes[i]))

CDY_2031.npz
SVB_1974.npz
SVB_1978.npz
SVB_1977.npz
CSV_20180427_144528920.npz
CDY_2050.npz
CSV_1889.npz
CFR_20180427_141236326.npz
SVB_20180427_152106643_HDR.npz
SVB_1939.npz
CSV_1903.npz
CFR_1624.npz
CFR_1642.npz
SVB_1950.npz
CFR_1645.npz
CSV_1893.npz
CSV_20180427_144556277.npz
SYH_2017-04-27_1312.npz
SVB_1971.npz
CSV_1891.npz
CSV_1881.npz
CDY_20180427_153021423_BURST001.npz
CSV_1899.npz
SVB_1937.npz
CFR_1652.npz
SVB_1976.npz
CDY_2027.npz
CDY_20180427_152937457_BURST000_COVER_TOP.npz
SVB_1951.npz
CSV_1873.npz
CFR_1635.npz
SVB_1957.npz
CSV_20180427_144615644.npz
SVB_20180427_152328332_HDR.npz
SVB_1969.npz
SYH_2017-04-27_1280.npz
CFR_1661.npz
SVB_20180427_151954938_HDR.npz
CFR_1650.npz
CSV_1868.npz
CFR_1660.npz
SYH_2017-04-27_1251.npz
CFR_1664.npz
SVB_1946.npz
SVB_1931.npz
CSV_1886.npz
CFR_1670.npz
SYH_2017-04-27_1336.npz
CFR_1626.npz
CSV_1871.npz
SVB_1973.npz
CFR_1627.npz
CFR_1663.npz
CSV_1897.npz
SYH_2017-04-27_1310.npz
CSV_1901.npz
CDY_2019.npz
CSV_1878.npz
SYH_2017-04-27_1332.npz
S

In [37]:
a = np.array([v for k, v in n_masks.items()])
a.sum()

1834

In [38]:
varietals = ['CDY', 'CFR', 'CSV', 'SVB', 'SYH']

In [39]:
acc = 0
for vt in varietals:
    a = np.array([v for k, v in n_masks.items() if k.startswith(vt)])
    print(vt, a.sum())
    acc += a.sum()
acc

CDY 242
CFR 460
CSV 290
SVB 586
SYH 256


1834

# Saving splits

In [40]:
cd ../../

/media/thiago/st_expansion_3tb/01.14.09.001.05.04-frutifuturo/grape-v3


In [41]:
with open('train.txt', 'w') as fp:
    for i in train:
        fp.write(i + '\n')
        
with open('test.txt', 'w') as fp:
    for i in test:
        fp.write(i + '\n')

In [42]:
with open('train_masked.txt', 'w') as fp:
    for i in M_train:
        fp.write(i + '\n')
        
with open('test_masked.txt', 'w') as fp:
    for i in M_test:
        fp.write(i + '\n')