# Preprocess the data
* Filter the data by image entropy: drop drawings that are too simple (lowest entropy) and drawings with too many strokes (highest entropy).
* Create the validation and test datasets.

In [1]:
import numpy as np
from dask import bag
from tqdm import tqdm
import json
import os
import datetime as dt
import pandas as pd
import cv2
import ast

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: "all" asks IPython to echo the value of every
# top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# The category handling below is adapted from beluga's Kaggle kernel:
# https://www.kaggle.com/gaborfodor/shuffle-csvs
# The 40 drawing categories processed by this notebook. Order matters: the
# position of a name in this list is used as its integer label.
qd_names = [
    'cup', 'garden hose', 'marker', 'truck', 'oven',
    'cooler', 'birthday cake', 'camouflage', 'pool', 'dog',
    'bear', 'bird', 'The Great Wall of China', 'van', 'tiger',
    'bench', 'hot tub', 'coffee cup', 'telephone', 'mug',
    'matches', 'animal migration', 'lantern', 'skyscraper', 'keyboard',
    'foot', 'monkey', 'sleeping bag', 'brain', 'peanut',
    'belt', 'tent', 'cookie', 'sweater', 'hot dog',
    'microwave', 'mermaid', 'donut', 'hourglass', 'bee',
]

# Output directories for the train / validation / test CSV files.
train_dir = './train'
val_dir = './val'
test_dir = './test'

def f2cat(filename: str) -> str:
    """Return the category name: the part of *filename* before its first '.'."""
    category, _dot, _remainder = filename.partition('.')
    return category

class Simplified:
    """Reader for the per-category "simplified" training CSV files.

    Files are looked up as ``<input_path>/train_simplified/<category>.csv``.
    """

    def __init__(self, input_path='./input'):
        self.input_path = input_path

    def list_all_categories(self):
        """Return the module-level ``qd_names`` list of category names."""
        return qd_names

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        """Load one category's training CSV into a DataFrame.

        Args:
            category: Category name; ``category + '.csv'`` is the file read.
            nrows: Maximum number of rows to read (``None`` reads all).
            usecols: Column selection forwarded to ``pandas.read_csv``.
            drawing_transform: If true, decode the ``drawing`` column from
                JSON text into Python lists.

        Returns:
            The loaded DataFrame. The ``timestamp`` column is parsed as a
            datetime whenever it can be part of the selection.
        """
        csv_path = os.path.join(self.input_path, 'train_simplified', category + '.csv')

        # pandas raises if asked to parse a date column that ``usecols``
        # excludes, so only request it when 'timestamp' may be selected.
        # Callable and positional selections keep the original behaviour.
        parse_timestamp = True
        if usecols is not None and not callable(usecols):
            usecols = list(usecols)
            names_only = all(isinstance(col, str) for col in usecols)
            if names_only and 'timestamp' not in usecols:
                parse_timestamp = False

        read_kwargs = {'nrows': nrows, 'usecols': usecols}
        if parse_timestamp:
            read_kwargs['parse_dates'] = ['timestamp']

        df = pd.read_csv(csv_path, **read_kwargs)
        if drawing_transform:
            df['drawing'] = df['drawing'].apply(json.loads)
        return df



In [3]:
# Number of training shards (train_k0.csv ... train_k99.csv).
NCSVS = 100

# Wall-clock time at which preprocessing started.
start = dt.datetime.now()

# Reader using the default input path, and the categories it exposes.
s = Simplified()
categories = s.list_all_categories()
print(len(categories))

40


In [8]:
# Entropy filter cut-offs, expressed in percent. create_dataset keeps only
# drawings whose entropy lies strictly between the `low_threshold`-th
# percentile and the (100 - `upper_threshold`)-th percentile of a category.
upper_threshold = 0.5
low_threshold = 1

def entropy_it(x):
    """Return the Shannon entropy, in bits, of the values in *x*.

    *x* is handed to ``np.bincount``, so it must be a 1-D sequence of
    non-negative integers.
    """
    histogram = np.bincount(x)
    # Drop empty bins: they contribute nothing and log2(0) is undefined.
    occupied = histogram[histogram > 0]
    probabilities = occupied / float(len(x))
    return -np.sum(probabilities * np.log2(probabilities))

def data_draw_cv2(raw_strokes, size=96, linewidth=6, time_color=True):
    """Rasterise a drawing and measure the entropy of the resulting image.

    Args:
        raw_strokes: String containing a Python literal list of strokes. Each
            stroke is indexed as ``stroke[0]`` / ``stroke[1]`` for the first /
            second coordinate of its points.
        size: Side length of the returned square image; 256 skips the resize.
        linewidth: Line thickness passed to ``cv2.line``.
        time_color: If true, stroke ``t`` is drawn with intensity
            ``255 - min(t, 10) * 13``; otherwise every stroke uses 255.

    Returns:
        A ``(entropy, image)`` tuple: the ``entropy_it`` value of the
        flattened image, and the image itself as a uint8 array.
    """
    canvas = np.zeros((256, 256), np.uint8)
    strokes = ast.literal_eval(raw_strokes)
    for order, stroke in enumerate(strokes):
        # The shade depends only on the stroke's position in the drawing.
        if time_color:
            shade = 255 - min(order, 10) * 13
        else:
            shade = 255
        # Connect each point of the stroke to its successor.
        for i in range(len(stroke[0]) - 1):
            segment_start = (stroke[0][i], stroke[1][i])
            segment_end = (stroke[0][i + 1], stroke[1][i + 1])
            cv2.line(canvas, segment_start, segment_end, shade, linewidth)

    if size != 256:
        canvas = cv2.resize(canvas, (size, size))

    canvas = np.array(canvas)
    return entropy_it(canvas.flatten()), canvas

def create_dataset(recognized_only=True):
    """Build the test, validation and sharded training CSVs for every category.

    For each category in ``categories`` (its index is the label ``y``):

    1. Read up to 120000 rows and, optionally, keep only recognized drawings.
    2. Shuffle the rows.
    3. Render every drawing with ``data_draw_cv2`` and keep only rows whose
       image entropy lies strictly between the ``low_threshold``-th and the
       ``(100 - upper_threshold)``-th percentile of the category.
    4. Use the first 512 remaining rows as test data, the next 512 as
       validation data and everything after that as training data.
    5. Write the results:
       - ``test_dir/test_dataset.csv`` and ``val_dir/val_dataset.csv``
         (created by the first category, appended to by later ones; their
         ``cv`` column holds the entropy);
       - ``train_dir/<category>.csv`` with that category's training rows;
       - ``train_dir/train_k<k>.csv`` for ``k`` in ``range(NCSVS)``, where a
         row's shard is ``(key_id // 10 ** 7) % NCSVS`` (stored in ``cv``)
         and ``key_id`` is dropped.

    Args:
        recognized_only: If true, rows with ``recognized != True`` are
            discarded before anything else happens.
    """
    # to_csv does not create missing directories, so make sure they exist.
    for out_dir in (test_dir, val_dir, train_dir):
        os.makedirs(out_dir, exist_ok=True)

    test_path = os.path.join(test_dir, 'test_dataset.csv')
    val_path = os.path.join(val_dir, 'val_dataset.csv')

    for y, cat in enumerate(categories):
        df = s.read_training_csv(cat, nrows=120000)
        if recognized_only:
            df = df[df['recognized'] == True].copy()  # noqa: E712 (element-wise)
        print("Create ",y," category:",cat," len",len(df))

        # Shuffle the rows of this category.
        df['rnd'] = np.random.rand(len(df))
        df = df.sort_values(by='rnd').drop('rnd', axis=1)

        # Render every drawing; only the entropy is needed for filtering, so
        # the images are not stacked into a second large array.
        rendered = bag.from_sequence(df.drawing.values).map(data_draw_cv2).compute()
        entropy = np.asarray([item[0] for item in rendered], dtype=float)

        lower = np.percentile(entropy, low_threshold)
        upper = np.percentile(entropy, 100 - upper_threshold)
        # One explicit boolean mask: True for drawings that are neither too
        # simple (low entropy) nor too busy (high entropy).
        keep = (entropy > lower) & (entropy < upper)

        df['y'] = y
        df['cv'] = entropy
        df = df[keep].copy()

        print("After entropy",len(df),int(keep.sum()),min(entropy),max(entropy))

        # Split into test, validation and training rows.
        test_csv = df.iloc[:512]
        val_csv = df.iloc[512:1024]
        # Copy so the shard id below is written to an independent frame
        # instead of a slice of ``df``.
        train = df.iloc[1024:].copy()
        train['cv'] = (train.key_id // 10 ** 7) % NCSVS

        # The first category (re)creates the shared files with a header;
        # every later category appends without one.
        first = y == 0
        mode = 'w' if first else 'a'

        test_csv.to_csv(test_path, mode=mode, header=first, index=False)
        val_csv.to_csv(val_path, mode=mode, header=first, index=False)

        train.to_csv(os.path.join(train_dir, cat + ".csv"), index=False)

        for k in range(NCSVS):
            filename = 'train_k{}.csv'.format(k)
            chunk = train[train.cv == k].drop(['key_id'], axis=1)
            chunk.to_csv(os.path.join(train_dir, filename),
                         mode=mode, header=first, index=False)

#create_dataset()

# Shuffle the rows of every training shard, store the result gzip-compressed
# next to it and delete the uncompressed original.
df = None  # last shard processed; stays None when no shard file exists
for k in tqdm(range(NCSVS)):
    filename = os.path.join(train_dir, 'train_k{}.csv'.format(k))
    if not os.path.exists(filename):
        continue
    df = pd.read_csv(filename)
    df['rnd'] = np.random.rand(len(df))
    df = df.sort_values(by='rnd').drop('rnd', axis=1)
    df.to_csv(filename + '.gz', compression='gzip', index=False)
    os.remove(filename)

# Report the shape of the last shard written; skipped when nothing was
# processed so a run without shard files does not fail on an undefined name.
if df is not None:
    print(df.shape)



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:10<16:48, 10.19s/it][A
  2%|▏         | 2/100 [00:20<16:36, 10.17s/it][A
  3%|▎         | 3/100 [00:30<16:31, 10.22s/it][A
  4%|▍         | 4/100 [00:40<16:17, 10.19s/it][A
  5%|▌         | 5/100 [00:50<16:04, 10.16s/it][A
  6%|▌         | 6/100 [01:00<15:49, 10.10s/it][A
  7%|▋         | 7/100 [01:10<15:37, 10.08s/it][A
  8%|▊         | 8/100 [01:20<15:26, 10.07s/it][A
  9%|▉         | 9/100 [01:30<15:14, 10.05s/it][A
 10%|█         | 10/100 [01:41<15:06, 10.08s/it][A
 11%|█         | 11/100 [01:51<14:59, 10.10s/it][A
 12%|█▏        | 12/100 [02:01<14:46, 10.08s/it][A
 13%|█▎        | 13/100 [02:11<14:33, 10.04s/it][A
 14%|█▍        | 14/100 [02:21<14:23, 10.04s/it][A
 15%|█▌        | 15/100 [02:31<14:12, 10.03s/it][A
 16%|█▌        | 16/100 [02:41<14:00, 10.01s/it][A
 17%|█▋        | 17/100 [02:51<13:49, 10.00s/it][A
 18%|█▊        | 18/100 [03:01<13:40, 10.01s/it][A
 19%|█▉        | 19/100 [03:1

(42779, 7)
