# Quickdraw Doodle
## Data Preprocessing
### Importing the libraries

In [1]:
from os import listdir, makedirs

import pandas as pd

### Filter out unrecognized drawings and drop unused columns

In [None]:
# Keep only the rows whose 'recognized' flag is set and only the columns
# 'key_id', 'drawing' and 'word', writing one filtered CSV per input CSV.
input_directory = 'train_simplified/'
output_directory = 'train_simplified_filtered/'
# to_csv does not create missing directories, so make sure the target exists.
makedirs(output_directory, exist_ok=True)
filenames = [e for e in listdir(input_directory) if e.endswith('.csv')]
for filename in filenames:
    file = pd.read_csv(input_directory + filename)
    # Apply the boolean mask once and reuse the result for both the count
    # and the export instead of filtering the full frame twice.
    recognized = file[file['recognized']]
    number_of_images = len(file)
    number_of_recognized_images = len(recognized)
    # Progress line in CSV form: filename,total rows,recognized rows
    print(filename+','+str(number_of_images)+','+str(number_of_recognized_images))
    recognized[['key_id', 'drawing', 'word']].to_csv(output_directory + filename, index=False)
    # Release both frames before the next read so that two files are never
    # held in memory at the same time.
    del file, recognized

### Split into training set and test set

In [None]:
# For every filtered class file, hold out a random sample of rows as the test
# set and write all remaining rows as the training set.
input_directory = 'train_simplified_filtered/'
# NOTE(review): 'train_simplified/' is also the input directory of the
# filtering cell, so same-named files there are overwritten by this cell -
# confirm this is intended.
train_output_directory = 'train_simplified/'
test_output_directory = 'test_simplified/'
# Number of rows held out per class file (fewer if the file is smaller).
test_set_size = 5000
# to_csv does not create missing directories, so make sure the targets exist.
for directory in (train_output_directory, test_output_directory):
    makedirs(directory, exist_ok=True)
filenames = [e for e in listdir(input_directory) if e.endswith('.csv')]
for i, filename in enumerate(filenames, start=1):
    print(i, filename)
    file = pd.read_csv(input_directory + filename)
    # Draw the held-out rows directly instead of shuffling the whole frame
    # and slicing; min() keeps sample() from raising on small files.
    test_set = file.sample(n=min(test_set_size, len(file)))
    test_set.to_csv(test_output_directory + filename, index=False)
    # Drop the sampled rows by index label: read_csv assigns a unique default
    # index, so this is the exact complement even if key_id values repeat.
    train_set = file.drop(test_set.index)
    train_set.to_csv(train_output_directory + filename, index=False)
    # Release the full frame before the next file is read.
    del file

### Split training set into smaller files

In [3]:
# Randomly partition every class file found in the input directory into 50
# near-equal parts, writing part k of each file into set_51 ... set_100.
input_directory = 'train_simplified/set_t/'
output_directories = ['train_simplified/set_' + str(i) + '/' for i in range(51, 101)]
# to_csv does not create missing directories, so make sure the targets exist.
for directory in output_directories:
    makedirs(directory, exist_ok=True)
filenames = [e for e in listdir(input_directory) if e.endswith('.csv')]
number_of_parts = len(output_directories)
for file_number, filename in enumerate(filenames, start=1):
    print(file_number, filename)
    file = pd.read_csv(input_directory + filename)
    # Shuffle once; consecutive slices of a shuffled frame form a random
    # partition, so no per-part sampling or key_id lookups are needed.
    shuffled = file.sample(frac=1)
    del file
    number_of_rows = len(shuffled)
    for part, directory in enumerate(output_directories):
        # Integer boundaries cover [0, number_of_rows) exactly once and give
        # part sizes that differ by at most one row.
        start = part * number_of_rows // number_of_parts
        stop = (part + 1) * number_of_rows // number_of_parts
        shuffled.iloc[start:stop].to_csv(directory + filename, index=False)
    # Release the frame before the next file is read.
    del shuffled

1 airplane.csv
2 alarm clock.csv
3 ambulance.csv
4 angel.csv
5 animal migration.csv
6 ant.csv
7 anvil.csv
8 apple.csv
9 arm.csv
10 asparagus.csv
11 axe.csv
12 backpack.csv
13 banana.csv
14 bandage.csv
15 barn.csv
16 baseball bat.csv
17 baseball.csv
18 basket.csv
19 basketball.csv
20 bat.csv
21 bathtub.csv
22 beach.csv
23 bear.csv
24 beard.csv
25 bed.csv
26 bee.csv
27 belt.csv
28 bench.csv
29 bicycle.csv
30 binoculars.csv
31 bird.csv
32 birthday cake.csv
33 blackberry.csv
34 blueberry.csv
35 book.csv
36 boomerang.csv
37 bottlecap.csv
38 bowtie.csv
39 bracelet.csv
40 brain.csv
41 bread.csv
42 bridge.csv
43 broccoli.csv
44 broom.csv
45 bucket.csv
46 bulldozer.csv
47 bus.csv
48 bush.csv
49 butterfly.csv
50 cactus.csv
51 cake.csv
52 calculator.csv
53 calendar.csv
54 camel.csv
55 camera.csv
56 camouflage.csv
57 campfire.csv
58 candle.csv
59 cannon.csv
60 canoe.csv
61 car.csv
62 carrot.csv
63 castle.csv
64 cat.csv
65 ceiling fan.csv
66 cell phone.csv
67 cello.csv
68 chair.csv
69 chandelier.cs

### Split test set into smaller files

In [None]:
# Randomly partition every class file found in the input directory into 5
# near-equal parts, writing part k of each file into set_1 ... set_5.
input_directory = 'test_simplified/merged/'
output_directories = ['test_simplified/set_' + str(i) + '/' for i in range(1, 6)]
# to_csv does not create missing directories, so make sure the targets exist.
for directory in output_directories:
    makedirs(directory, exist_ok=True)
filenames = [e for e in listdir(input_directory) if e.endswith('.csv')]
number_of_parts = len(output_directories)
for file_number, filename in enumerate(filenames, start=1):
    print(file_number, filename)
    file = pd.read_csv(input_directory + filename)
    # Shuffle once; consecutive slices of a shuffled frame form a random
    # partition, so no per-part sampling or key_id lookups are needed.
    shuffled = file.sample(frac=1)
    del file
    number_of_rows = len(shuffled)
    for part, directory in enumerate(output_directories):
        # Integer boundaries cover [0, number_of_rows) exactly once and give
        # part sizes that differ by at most one row.
        start = part * number_of_rows // number_of_parts
        stop = (part + 1) * number_of_rows // number_of_parts
        shuffled.iloc[start:stop].to_csv(directory + filename, index=False)
    # Release the frame before the next file is read.
    del shuffled

### Merge files

In [4]:
# Concatenate the pieces of every class file from all input directories into a
# single CSV per class in the output directory.
input_directories = ['train_simplified/set_'+str(i)+'/' for i in range(1, 11)]
# Alternative input for merging the test split instead:
# input_directories = ['test_simplified/set_1/', 'test_simplified/set_2/', 'test_simplified/set_3/', 'test_simplified/set_4/', 'test_simplified/set_5/']
output_directory = 'train_simplified_merged/set_1/'
# The first input directory determines which file names get merged; every
# other directory is read with those same names.
filenames = [name for name in listdir(input_directories[0]) if name.endswith('.csv')]
file_number = 0  # stays 0 when no CSV file is found
for file_number, filename in enumerate(filenames, start=1):
    print(file_number, filename)
    frames = [pd.read_csv(directory + filename) for directory in input_directories]
    pd.concat(frames).to_csv(output_directory + filename, index=False)
    # Drop the per-directory frames before moving on to the next file.
    del frames

1 airplane.csv
2 alarm clock.csv
3 ambulance.csv
4 angel.csv
5 animal migration.csv
6 ant.csv
7 anvil.csv
8 apple.csv
9 arm.csv
10 asparagus.csv
11 axe.csv
12 backpack.csv
13 banana.csv
14 bandage.csv
15 barn.csv
16 baseball bat.csv
17 baseball.csv
18 basket.csv
19 basketball.csv
20 bat.csv
21 bathtub.csv
22 beach.csv
23 bear.csv
24 beard.csv
25 bed.csv
26 bee.csv
27 belt.csv
28 bench.csv
29 bicycle.csv
30 binoculars.csv
31 bird.csv
32 birthday cake.csv
33 blackberry.csv
34 blueberry.csv
35 book.csv
36 boomerang.csv
37 bottlecap.csv
38 bowtie.csv
39 bracelet.csv
40 brain.csv
41 bread.csv
42 bridge.csv
43 broccoli.csv
44 broom.csv
45 bucket.csv
46 bulldozer.csv
47 bus.csv
48 bush.csv
49 butterfly.csv
50 cactus.csv
51 cake.csv
52 calculator.csv
53 calendar.csv
54 camel.csv
55 camera.csv
56 camouflage.csv
57 campfire.csv
58 candle.csv
59 cannon.csv
60 canoe.csv
61 car.csv
62 carrot.csv
63 castle.csv
64 cat.csv
65 ceiling fan.csv
66 cell phone.csv
67 cello.csv
68 chair.csv
69 chandelier.cs

KeyboardInterrupt: 