In [0]:
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [0]:
from google.colab import files
from IPython.display import clear_output
# Upload file(s) from the local machine into the Colab working directory.
# Later cells expect a file named kaggle.json here (presumably the Kaggle API token).
files.upload()
# Wipe this cell's output once the upload has finished.
clear_output()

In [0]:
# Install the style checkers and load the pycodestyle_magic IPython extension
# (presumably to lint cells from inside the notebook; no visible cell uses it yet).
!pip install pycodestyle flake8 pycodestyle_magic
%load_ext pycodestyle_magic
# Discard the pip installation log.
clear_output()

In [0]:
# Restrict kaggle.json to owner read/write only (mode 600), since it is a credentials file.
!chmod 600 kaggle.json

In [0]:
# Copy the token into ~/.kaggle/ (created in the first cell);
# presumably this is where the kaggle CLI looks for credentials.
!cp kaggle.json ~/.kaggle/

In [0]:
# Download the kuzushiji-recognition competition files with the Kaggle CLI;
# the following cells expect the archives in the current working directory.
!kaggle competitions download -c kuzushiji-recognition
# Discard the download progress output.
clear_output()

In [0]:
!mkdir train_images
!mkdir test_images

In [0]:
!unzip train_images.zip -d train_images
!unzip test_images.zip -d test_images
!unzip train.csv.zip
!rm train_images.zip
!rm test_images.zip
!rm train.csv.zip
clear_output()

# Data
Source: https://www.kaggle.com/c/kuzushiji-recognition/ <br>
Kuzushiji is an ancient Japanese cursive script. The goal is to locate and classify each kuzushiji character on each image in the test set. <br>
Train set size: 3881 images. <br>
Test set size: 4150 images. <br>
**train.csv** contains labels for each kuzushiji character in the train set. Train.csv columns:
*   **image_id** – ID code for the image.
*   **labels** – a string with the labels for all kuzushiji characters on the image, separated by spaces. Each label is a space-separated series of values (Unicode char, X, Y, width, height) describing one character.

Predictions for the test set should have the following format:
*   **image_id**
*   **labels** – formatted as a space-separated series of (Unicode char, X, Y) values. Width and height don't need to be predicted.

Predictions will be evaluated on F1-score. In this version of the metric, a prediction counts as correct (i.e., a true positive) when the predicted character is correct and the predicted X and Y coordinates fall within the ground truth bounding box. The ground truth bounding boxes are defined in the format {Unicode char X Y Width Height}, so if the ground truth label is U+003F 1 1 10 10, then a prediction of U+003F 3 3 would pass. <br>
Mapping between Japanese characters and corresponding Unicode IDs is stored in the file **unicode_translation.csv**. <br>
Notes about the data:
*   Some images don't contain kuzushiji characters.
*   Kuzushiji text is written such that annotations are placed between the columns of the main text, usually in a slightly smaller font. Annotation characters should be ignored.
*   You can occasionally see through especially thin paper and read characters from the opposite side of the page. Those characters should also be ignored.


In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [0]:
# Read the training labels (train.csv, extracted from train.csv.zip above) into a DataFrame.
data = pd.read_csv("train.csv")

In [15]:
# Dimensions of the raw label table as (rows, columns).
data.shape

(3881, 2)

In [16]:
# Preview the first five rows of the training labels.
data.head(5)

Unnamed: 0,image_id,labels
0,100241706_00004_2,U+306F 1231 3465 133 53 U+304C 275 1652 84 69 ...
1,100241706_00005_1,U+306F 1087 2018 103 65 U+304B 1456 1832 40 73...
2,100241706_00005_2,U+306F 572 1376 125 57 U+306E 1551 2080 69 68 ...
3,100241706_00006_1,U+3082 1455 3009 65 44 U+516B 1654 1528 141 75...
4,100241706_00007_2,U+309D 1201 2949 27 33 U+309D 1196 1539 27 36 ...


In [0]:
# Load the table that maps Unicode IDs to the corresponding characters.
unicode_translation = pd.read_csv("unicode_translation.csv")

In [23]:
# Preview the first ten Unicode-ID-to-character pairs.
unicode_translation.head(10)

Unnamed: 0,Unicode,char
0,U+0031,1
1,U+0032,2
2,U+0034,4
3,U+0036,6
4,U+0039,9
5,U+003F,?
6,U+2000B,𠀋
7,U+20D45,𠵅
8,U+2123D,𡈽
9,U+22999,𢦙


In [0]:
# Lookup table from Unicode ID (e.g. "U+0031") to the character it denotes.
# Zipping the two columns builds the dict in a single pass, avoiding a
# label-based Series lookup per row, and leaves no stray loop variable behind.
mapping = dict(zip(unicode_translation["Unicode"], unicode_translation["char"]))

In [0]:
# Load the example submission file (presumably part of the competition download).
sample_submission = pd.read_csv("sample_submission.csv")

In [27]:
# Show the first five rows of the example submission to illustrate the expected format.
sample_submission.head(5)

Unnamed: 0,image_id,labels
0,test_00145af3,U+003F 1 1 U+FF2F 2 2
1,test_001c37e2,U+003F 1 1 U+FF2F 2 2
2,test_003aa33a,U+003F 1 1 U+FF2F 2 2
3,test_00665e33,U+003F 1 1 U+FF2F 2 2
4,test_006964dc,U+003F 1 1 U+FF2F 2 2


In [28]:
# Remove every row that has a missing value in any column
# (presumably the images that contain no labelled characters),
# then show the remaining (rows, columns).
data = data.dropna(axis=0, how="any")
data.shape

(3605, 2)

In [0]:
# To evaluate different models, hold out part of the labelled data:
# 75% for training and 25% for testing. The ratio is passed explicitly
# instead of relying on the library default, and random_state is fixed so
# that the shuffle (and every score computed on this split) is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data["image_id"],
    data["labels"],
    test_size=0.25,
    random_state=42,
)