# Development set split
In principle, we can use our entire augmented dataset of 10,000+ images for training. If we want to do any refinement of hyperparameters, however, we can split it by class into:
* training: 80% of each class
* dev: 20% of each class

In [5]:
# Requirements
import pandas as pd
import numpy as np

In [3]:
# load dataset
# Use a path relative to the notebook's working directory (the same
# ../data/train/augmented/ directory the split CSVs are written to) instead of
# a machine-specific absolute path, so the notebook runs on any checkout.
augmented_df = pd.read_csv('../data/train/augmented/train_augmented.csv')
augmented_df.head()

Unnamed: 0,Img_Name,Label
0,01-05 10.15.27_2_0000.jpg,Speed Limit 60
1,01-05 10.15.27_2_1621.jpg,Speed Limit 60
2,01-05 10.15.27_2_1729.jpg,Speed Limit 60
3,01-05 10.15.27_2_1809.jpg,Speed Limit 60
4,01-05 10.15.27_2_2166.jpg,Speed Limit 60


### Define 80/20 split

In [27]:
# define split function
def split_to_train_test(df, label_column, train_frac=0.8, random_state=None):
    """Split ``df`` into train and test frames, stratified by ``label_column``.

    For every distinct value in ``label_column`` a fraction ``train_frac`` of
    that label's rows is sampled (without replacement) into the training
    frame; the remaining rows of that label go to the test frame. The original
    index values are kept in both outputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Held-out rows are selected by dropping the sampled
        index labels, so the index is assumed to be unique.
    label_column : str
        Name of the column holding the class label.
    train_frac : float, default 0.8
        Fraction of each label's rows placed in the training frame.
    random_state : optional
        Passed unchanged to ``DataFrame.sample`` for every label. ``None``
        (the default) keeps the original, unseeded behaviour.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame

    Notes
    -----
    Rows whose label is NaN match no label in the equality filter and therefore
    appear in neither output.
    """
    train_parts, test_parts = [], []
    for lbl in df[label_column].unique():
        lbl_df = df[df[label_column] == lbl]
        lbl_train_df = lbl_df.sample(frac=train_frac, random_state=random_state)
        train_parts.append(lbl_train_df)
        # Whatever was not sampled for training is held out for this label.
        test_parts.append(lbl_df.drop(lbl_train_df.index))

    if not train_parts:
        # No labels at all (empty input): pd.concat([]) would raise, so return
        # empty frames that still carry the input's columns.
        return df.iloc[:0].copy(), df.iloc[:0].copy()

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every iteration.
    return pd.concat(train_parts), pd.concat(test_parts)

### Apply split

In [29]:
# perform split
# Seed NumPy's global RNG first: DataFrame.sample draws from it when no
# random_state is supplied, so the split (and the CSVs saved from it) is
# repeatable after a kernel restart.
np.random.seed(42)
train, test = split_to_train_test(augmented_df, 'Label', 0.8)

### Double-check split

In [26]:
# Training split: print the frame summary, then display per-class row counts.
train.info()
train.groupby(by='Label').count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9720 entries, 12090 to 4984
Data columns (total 2 columns):
Img_Name    9720 non-null object
Label       9720 non-null object
dtypes: object(2)
memory usage: 227.8+ KB


Unnamed: 0_level_0,Img_Name
Label,Unnamed: 1_level_1
Speed Limit 20,1349
Speed Limit 30,1661
Speed Limit 40,2307
Speed Limit 50,2470
Speed Limit 60,914
Speed Limit 80,1019


In [25]:
# Held-out (dev) split: print the frame summary, then display per-class row counts.
test.info()
test.groupby(by='Label').count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2431 entries, 0 to 10723
Data columns (total 2 columns):
Img_Name    2431 non-null object
Label       2431 non-null object
dtypes: object(2)
memory usage: 57.0+ KB


Unnamed: 0_level_0,Img_Name
Label,Unnamed: 1_level_1
Speed Limit 20,337
Speed Limit 30,415
Speed Limit 40,577
Speed Limit 50,618
Speed Limit 60,229
Speed Limit 80,255


### Save CSV files

In [31]:
# Write each split to its own CSV, dev first and then train, without the index column.
for split_df, csv_path in (
    (test, "../data/train/augmented/split_20_dev.csv"),
    (train, "../data/train/augmented/split_80_train.csv"),
):
    split_df.to_csv(csv_path, index=False)