# Create new Test and Validation Sets

In [4]:
import pandas as pd
import os
import s3fs
import boto3
import json
from datetime import datetime
import numpy as np

import random

In [5]:
# GET df_combine (from model0 results)
SAGEMAKER_REPO_PATH = r'/home/ec2-user/SageMaker/classify-streetview'
df_combine = pd.read_csv(os.path.join(SAGEMAKER_REPO_PATH, 'inference-eval-model0/model0predict_with_groundtruth.csv'))
df_combine.shape

(3448, 20)

In [6]:
df_all = df_combine[['image_name', 'img_id', 'ground_truth']]
df_all['ground_truth'].value_counts()

1_null            2268
3_present          718
4_surface_prob     250
2_obstacle         129
0_missing           83
Name: ground_truth, dtype: int64

In [7]:
df_all['ground_truth'].value_counts() / 2

1_null            1134.0
3_present          359.0
4_surface_prob     125.0
2_obstacle          64.5
0_missing           41.5
Name: ground_truth, dtype: float64

In [10]:
imgid_list = list(df_all['img_id'].unique())
print(len(imgid_list))
imgid_list[0:5]

124


[8475, 8503, 8540, 8650, 8780]

In [18]:
valid_imgid = random.sample(imgid_list, 62)
len(valid_imgid)
df_valid = df_all.loc[df_all['img_id'].isin(valid_imgid)]
print(df_valid.shape)
df_valid['ground_truth'].value_counts()

(1712, 4)


1_null            1127
3_present          358
4_surface_prob     120
2_obstacle          69
0_missing           38
Name: ground_truth, dtype: int64

In [19]:
df_valid.head()

Unnamed: 0,image_name,img_id,ground_truth,is_valid_set
0,8475_135_1.jpg,8475,1_null,True
1,8475_135_2.jpg,8475,2_obstacle,True
2,8475_135_3.jpg,8475,1_null,True
3,8475_135_4.jpg,8475,1_null,True
4,8475_135_5.jpg,8475,2_obstacle,True


In [20]:
df_valid.to_csv('validation_set_from_test.csv', index = False)

In [21]:
df_all['is_valid_set'] = np.where(df_all['img_id'].isin(valid_imgid), True, False)
df_all.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,image_name,img_id,ground_truth,is_valid_set
0,8475_135_1.jpg,8475,1_null,True
1,8475_135_2.jpg,8475,2_obstacle,True
2,8475_135_3.jpg,8475,1_null,True
3,8475_135_4.jpg,8475,1_null,True
4,8475_135_5.jpg,8475,2_obstacle,True


In [22]:
df_all['is_valid_set'].value_counts()

False    1736
True     1712
Name: is_valid_set, dtype: int64

In [23]:
df_all.to_csv('validation_test_includeallnull.csv', index = False)

In [24]:
df_new_test = df_all.loc[~df_all['is_valid_set']]
df_new_test['ground_truth'].value_counts()

1_null            1141
3_present          360
4_surface_prob     130
2_obstacle          60
0_missing           45
Name: ground_truth, dtype: int64

# Remove Nulls

In [27]:
df_null = df_all.loc[df_all['ground_truth'].str.contains('1_null')]
df_to_remove = df_null.sample(n = 1548) #Keep 720 total
remove_imgid_list = list(df_to_remove['image_name'])
df_new_all = df_all.loc[~df_all['image_name'].isin(remove_imgid_list)]
#df_new_all['ground_truth'].value_counts()
df_new_all.groupby(['ground_truth', 'is_valid_set'])['image_name', 'img_id'].nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,image_name,img_id
ground_truth,is_valid_set,Unnamed: 2_level_1,Unnamed: 3_level_1
0_missing,False,45,18
0_missing,True,38,11
1_null,False,363,62
1_null,True,357,61
2_obstacle,False,60,15
2_obstacle,True,69,14
3_present,False,360,56
3_present,True,358,54
4_surface_prob,False,130,24
4_surface_prob,True,120,26


In [28]:
df_new_all.to_csv('validation_test_final.csv', index = False)

In [29]:
df_all['ground_truth'].value_counts()

1_null            2268
3_present          718
4_surface_prob     250
2_obstacle         129
0_missing           83
Name: ground_truth, dtype: int64

In [46]:
df_all.groupby(['ground_truth', 'is_valid_set'])['image_name', 'img_id'].nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,image_name,img_id
ground_truth,is_valid_set,Unnamed: 2_level_1,Unnamed: 3_level_1
0_missing,False,46,16
0_missing,True,37,13
1_null,False,1129,62
1_null,True,1139,62
2_obstacle,False,64,17
2_obstacle,True,65,12
3_present,False,364,54
3_present,True,354,56
4_surface_prob,False,133,26
4_surface_prob,True,117,24
