# Preprocess the data
* Filter the drawings by image entropy, to remove images that are too simple and images that contain too many strokes.
* Create the validation and test datasets.

In [1]:
import numpy as np
from dask import bag
from tqdm import tqdm
import json
import os
import datetime as dt
import pandas as pd
import cv2
import ast

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# The following code is adapted from beluga's Kaggle kernel:
# https://www.kaggle.com/gaborfodor/shuffle-csvs

# Category names to process; a category's position in this list is used as
# its integer label.
qd_names = [
    'cup',
    'garden hose',
    'marker',
    'truck',
    'oven',
    'cooler',
    'birthday cake',
    'camouflage',
    'pool',
    'dog',
    'bear',
    'bird',
    'The Great Wall of China',
    'van',
    'tiger',
    'bench',
    'pickup truck',
    'coffee cup',
    'telephone',
    'mug',
    'matches',
    'animal migration',
    'lantern',
    'skyscraper',
    'keyboard',
    'foot',
    'monkey',
    'sleeping bag',
    'brain',
    'peanut',
    'belt',
    'tent',
    'cookie',
    'cake',
    'hot dog',
    'violin',
    'cello',
    'donut',
    'hourglass',
    'bee',
]

# Output folders for the generated CSV files.
train_dir = './train'
val_dir = './val'
test_dir = './test'

def f2cat(filename: str) -> str:
    """Return the part of *filename* that precedes its first '.'.

    A name without any dot is returned unchanged.
    """
    stem, _, _ = filename.partition('.')
    return stem

class Simplified():
    """Reader for the per-category CSV files under ``<input_path>/train_simplified``."""

    def __init__(self, input_path='./input'):
        # Root folder that contains the 'train_simplified' directory.
        self.input_path = input_path

    def list_all_categories(self):
        """Return the module-level list of category names (``qd_names``)."""
        return qd_names

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        """Load ``<category>.csv`` from the 'train_simplified' folder.

        Args:
            category: Category name; '.csv' is appended to build the file name.
            nrows: Forwarded to ``pd.read_csv`` (row limit, None for all rows).
            usecols: Forwarded to ``pd.read_csv`` (column subset, None for all).
            drawing_transform: When true, the 'drawing' column is decoded
                from its JSON text with ``json.loads``.

        Returns:
            The loaded DataFrame; the 'timestamp' column is parsed as dates.
        """
        csv_path = os.path.join(self.input_path, 'train_simplified', category + '.csv')
        frame = pd.read_csv(csv_path, nrows=nrows, parse_dates=['timestamp'], usecols=usecols)
        if not drawing_transform:
            return frame
        frame['drawing'] = frame['drawing'].apply(json.loads)
        return frame



In [3]:
# Wall-clock time at which this preprocessing run started.
start = dt.datetime.now()
# CSV reader; uses the default input folder './input'.
s = Simplified()
# Number of training shard files (train_k0.csv, train_k1.csv, ...) that the
# rows are distributed over further below.
NCSVS = 20
categories = s.list_all_categories()
print(len(categories))

40


In [10]:
# Entropy percentile (per category) at or below which drawings are discarded.
low_threshold = 5
# Drawings at or above the (100 - upper_threshold) entropy percentile are
# discarded as well.
upper_threshold = 5

def entropy_it(x):
    """Return the Shannon entropy (in bits) of the values in *x*.

    *x* must be a 1-D sequence of non-negative integers (it is passed to
    ``np.bincount``); the entropy is computed from the relative frequency
    of each distinct value.
    """
    histogram = np.bincount(x)
    occupied = histogram[histogram > 0]
    # Relative frequency of every value that occurs at least once.
    probabilities = occupied / float(len(x))
    return -np.sum(probabilities * np.log2(probabilities))

def data_draw_cv2(raw_strokes, size=96, linewidth=6, time_color=True):
    """Render a drawing to a grayscale image and measure its entropy.

    Args:
        raw_strokes: String holding a Python literal (parsed with
            ``ast.literal_eval``): a sequence of strokes, where ``stroke[0]``
            and ``stroke[1]`` are the two coordinate lists of the stroke's
            points (used as the first and second component of each cv2 point).
        size: Side length of the returned image; the 256x256 canvas is
            resized unless ``size`` is 256.
        linewidth: Line thickness passed to ``cv2.line``.
        time_color: When true, the first stroke is drawn with 255 and each
            later stroke 13 darker, bottoming out at the 11th stroke; when
            false every stroke is drawn with 255.

    Returns:
        Tuple ``(entropy, image)`` where ``entropy`` is ``entropy_it`` applied
        to the flattened image.
    """
    canvas = np.zeros((256, 256), np.uint8)
    strokes = ast.literal_eval(raw_strokes)
    for order, stroke in enumerate(strokes):
        # The shade depends only on the stroke's position in the drawing.
        shade = 255 - min(order, 10) * 13 if time_color else 255
        xs = stroke[0]
        for j in range(1, len(xs)):
            start_pt = (xs[j - 1], stroke[1][j - 1])
            end_pt = (xs[j], stroke[1][j])
            _ = cv2.line(canvas, start_pt, end_pt, shade, linewidth)

    if size != 256:
        canvas = cv2.resize(canvas, (size, size))

    img = np.array(canvas)
    return entropy_it(img.flatten()), img

def create_dataset(recognized_only = False ):
    """Build the test, validation and sharded training CSVs for all categories.

    For every category in ``categories`` (its position is the label ``y``):

    1. Read the category CSV through ``s`` and optionally keep only rows
       whose 'recognized' flag is true.
    2. Shuffle the rows.
    3. Render each drawing with ``data_draw_cv2`` and drop the rows whose
       image entropy is not strictly between the ``low_threshold`` and
       ``100 - upper_threshold`` percentiles of that category.
    4. Write the first 512 remaining rows to ``test_dir/test_dataset.csv``
       and the next 512 to ``val_dir/val_dataset.csv`` (in these two files the
       'cv' column holds the entropy value).
    5. Write the rest to ``train_dir/<category>.csv`` and distribute it over
       ``NCSVS`` shard files ``train_dir/train_k<k>.csv``; here 'cv' holds the
       shard number ``(key_id // 10**7) % NCSVS`` and 'key_id' is dropped
       from the shard files.

    The shared files are (re)created with a header for the first category and
    appended to without a header for all later ones.

    Args:
        recognized_only: Keep only drawings with ``recognized == True``.
    """
    # The output folders are not created anywhere else; make sure they exist
    # so the first to_csv call does not fail on a fresh checkout.
    for out_dir in (test_dir, val_dir, train_dir):
        os.makedirs(out_dir, exist_ok=True)

    for y, cat in enumerate(categories):
        df = s.read_training_csv(cat)
        if recognized_only:
            df = df[df['recognized'] == True].copy()
        print("Create ",y," category:",cat," len",len(df))

        # Shuffle the rows of the category.
        df['rnd'] = np.random.rand(len(df))
        df = df.sort_values(by='rnd').drop('rnd', axis=1)

        # Render every drawing and compute its entropy with dask.
        entropybag = bag.from_sequence(df.drawing.values).map(data_draw_cv2)
        data = entropybag.compute()
        entropy, images = zip(*data)
        # Work on an ndarray so the comparisons below are plain element-wise
        # numpy operations instead of tuple-vs-scalar comparisons.
        entropy = np.asarray(entropy)
        lower = np.percentile(entropy, low_threshold)
        upper = np.percentile(entropy, 100 - upper_threshold)
        # One mask, shared by the rows and the images, keeps both aligned.
        keep = (entropy > lower) & (entropy < upper)

        df['y'] = y
        df['cv'] = entropy

        # .copy() so later column assignments do not write into a slice.
        df = df[keep].copy()
        images = np.array(images)[keep]

        print("After entropy",len(df),len(images),min(entropy),max(entropy))

        # Split into test, validation and training rows.
        test_csv = df[0:512]
        val_csv = df[512:1024]
        df = df[1024:].copy()

        # From here on 'cv' is the shard number of the training row.
        df['cv'] = (df.key_id // 10 ** 7) % NCSVS

        # The first category starts the shared files, later ones append.
        first = (y == 0)
        mode = 'w' if first else 'a'

        test_csv.to_csv(os.path.join(test_dir,'test_dataset.csv'),mode=mode,header=first,index=False)
        val_csv.to_csv(os.path.join(val_dir,'val_dataset.csv'),mode=mode,header=first,index=False)

        df.to_csv(os.path.join(train_dir,cat + ".csv"),index=False)

        for k in range(NCSVS):
            filename = 'train_k{}.csv'.format(k)
            chunk = df[df.cv == k].drop(['key_id'], axis=1)
            chunk.to_csv(os.path.join(train_dir,filename),mode=mode,header=first,index=False)

create_dataset()

# Second pass over the shard files: every shard was written category by
# category, so shuffle its rows and cut it down to a multiple of 1024 rows.
for k in tqdm(range(NCSVS)):
    filename = os.path.join(train_dir, 'train_k{}.csv'.format(k))
    if not os.path.exists(filename):
        continue

    df = (
        pd.read_csv(filename)
        .assign(rnd=lambda frame: np.random.rand(len(frame)))
        .sort_values(by='rnd')
        .drop('rnd', axis=1)
    )
    # Keep only whole blocks of 1024 rows.
    length = (len(df) // 1024) * 1024
    df = df[:length]
    df.to_csv(filename, index=False)
    print(df.shape)


Create  0  category: cup  len 130721
After entropy 117647 117647 0.029724729240144854 3.3831330513979117
Create  1  category: garden hose  len 121843
After entropy 109657 109657 0.010447537330226785 3.2984964422329606
Create  2  category: marker  len 319136
After entropy 287222 287222 0.009296104804529342 3.314858570868997
Create  3  category: truck  len 131354
After entropy 118218 118218 0.010881565108004563 3.2473362819853278
Create  4  category: oven  len 206910
After entropy 186218 186218 0.009079090915640454 3.3383356880539363
Create  5  category: cooler  len 271444
After entropy 244298 244298 0.009079090915640454 3.6545267006446074
Create  6  category: birthday cake  len 144982
After entropy 130482 130482 0.010447537330226785 2.81485625247105
Create  7  category: camouflage  len 172710
After entropy 155438 155438 0.009079090915640454 4.00447336980575
Create  8  category: pool  len 133439
After entropy 120095 120095 0.009079090915640454 3.28680916582468
Create  9  category: dog  l

  5%|▌         | 1/20 [00:04<01:25,  4.50s/it]

(281600, 7)


 10%|█         | 2/20 [00:08<01:19,  4.41s/it]

(280576, 7)


 15%|█▌        | 3/20 [00:12<01:12,  4.25s/it]

(280576, 7)


 20%|██        | 4/20 [00:16<01:07,  4.24s/it]

(280576, 7)


 25%|██▌       | 5/20 [00:20<01:02,  4.16s/it]

(281600, 7)


 30%|███       | 6/20 [00:24<00:57,  4.10s/it]

(280576, 7)


 35%|███▌      | 7/20 [00:36<01:22,  6.31s/it]

(281600, 7)


 40%|████      | 8/20 [00:54<01:59,  9.94s/it]

(281600, 7)


 45%|████▌     | 9/20 [00:58<01:29,  8.13s/it]

(282624, 7)


 50%|█████     | 10/20 [01:07<01:22,  8.30s/it]

(281600, 7)


 55%|█████▌    | 11/20 [01:25<01:41, 11.23s/it]

(281600, 7)


 60%|██████    | 12/20 [01:29<01:12,  9.07s/it]

(280576, 7)


 65%|██████▌   | 13/20 [01:36<01:00,  8.59s/it]

(280576, 7)


 70%|███████   | 14/20 [01:46<00:53,  8.88s/it]

(280576, 7)


 75%|███████▌  | 15/20 [01:51<00:38,  7.70s/it]

(281600, 7)


 80%|████████  | 16/20 [01:56<00:27,  6.90s/it]

(280576, 7)


 85%|████████▌ | 17/20 [02:00<00:18,  6.15s/it]

(281600, 7)


 90%|█████████ | 18/20 [02:04<00:11,  5.58s/it]

(281600, 7)


 95%|█████████▌| 19/20 [02:09<00:05,  5.17s/it]

(281600, 7)


100%|██████████| 20/20 [02:13<00:00,  4.89s/it]

(281600, 7)



