In [1]:
import numpy as np
import pandas as pd
import os
import glob
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 20
np.random.seed(231)
from sklearn.model_selection import train_test_split
import cv2


In [2]:
train_dir = "/mnt/disks/large/data/original_train/"
data_dir = "/mnt/disks/large/data/"
output_dir = "/home/rugezhao/CS231N_CZT/data/"

In [3]:
train_df = pd.read_csv(data_dir+"train.csv", engine='python')
labels_df = pd.read_csv(data_dir+"labels.csv", engine='python')


The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2. To stratify sample the class needs at least 3 samples. So I decided to drop all samples that are tagged to a label that appeared less than 3 times.

# Data Split

In [4]:
def train_val_test_split(train_id_attribute_id, ratios = (0.8,0.1,0.1),min_count=3):
    train_id_attribute_id_ohe= np.zeros((len(train_df), len(labels_df)), dtype=int)
    for idx, attr_arr in enumerate(train_df.attribute_ids.str.split(" ").apply(lambda l: list(map(int, l))).values):
        train_id_attribute_id_ohe[idx, attr_arr] = 1
    train_id_attribute_id_ohe = pd.DataFrame(train_id_attribute_id_ohe,columns=labels_df.attribute_id)
    print("one hot encoding done")
    
    col_sum=pd.DataFrame(np.sum(train_id_attribute_id_ohe,axis=0))
    col_sum=col_sum.reset_index()
    infreq_labels = list(col_sum[col_sum.iloc[:,1] < min_count].iloc[:,0])
    print("{} infrequent labels less than {} times found".format(len(infreq_labels),min_count))
    
    # drop the examples with infreq labels, i.e. if 1 in any of these columns -->drop
    df_onehot = pd.concat([train_id_attribute_id,train_id_attribute_id_ohe], axis=1)
    df_onehot = df_onehot.drop(columns=["attribute_ids"])
    infreq_cols = df_onehot[infreq_labels]
    X_infreq = list(df_onehot.loc[(infreq_cols!=0).any(axis=1)].id)
    
    df_onehot = df_onehot[~df_onehot.id.isin(X_infreq)]
     
    X = df_onehot.id
    y = df_onehot.iloc[:,1:]
    # use stratify=y for stratified sampling -->> impossible!! since they need to treat combination of label as a class
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=ratios[2], random_state=231,stratify=None)
    print("test set done, size {}".format(X_test.shape))
    # use stratify =y_train_val for stratified sampling -->impossible!!
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=ratios[1]/(ratios[0]+ratios[1]), random_state=231,stratify=None)
    print("val set done, size {}".format(X_val.shape))
    print("train set done, size {}".format(X_train.shape))
    return X_train, X_val, X_test, X_infreq, df_onehot

In [5]:
np.random.seed(231)
X_train, X_val, X_test, X_infreq, df_onehot = train_val_test_split(train_df)

one hot encoding done
26 infrequent labels less than 3 times found
test set done, size (10920,)
val set done, size (10920,)
train set done, size (87360,)


## Check distribution

In [6]:
df_one_train = df_onehot[df_onehot.id.isin(X_train)]
df_one_val = df_onehot[df_onehot.id.isin(X_val)]
df_one_test = df_onehot[df_onehot.id.isin(X_test)]

In [10]:
df_one_train_export = train_df[train_df.id.isin(X_train)]
df_one_train_export.to_csv(output_dir+"/train_split_train.csv",index=False)
df_one_val_export = train_df[train_df.id.isin(X_val)]
df_one_val_export.to_csv(output_dir+"/train_split_val.csv",index=False)
df_one_test_export = train_df[train_df.id.isin(X_test)]
df_one_test_export.to_csv(output_dir+"/train_split_test.csv",index=False)

In [7]:
col_pct_train=pd.DataFrame(np.sum(df_one_train.iloc[:,1:],axis=0))/df_one_train.shape[0]
col_pct_val=pd.DataFrame(np.sum(df_one_val.iloc[:,1:],axis=0))/df_one_val.shape[0]
col_pct_test=pd.DataFrame(np.sum(df_one_test.iloc[:,1:],axis=0))/df_one_test.shape[0]

In [8]:
col_pct_train.sort_values(0,ascending=False)

Unnamed: 0,0
813,0.183745
1092,0.130804
147,0.124531
189,0.094677
13,0.083116
671,0.077724
51,0.070250
194,0.068098
1059,0.059879
121,0.059558


In [9]:
col_pct_val.sort_values(0,ascending=False)

Unnamed: 0,0
813,0.185623
1092,0.128480
147,0.119505
189,0.093132
13,0.088187
671,0.075000
51,0.068407
194,0.065018
1059,0.058883
121,0.058150


In [10]:
col_pct_test.sort_values(0,ascending=False)

Unnamed: 0,0
813,0.172711
1092,0.132418
147,0.122253
189,0.099451
13,0.084890
671,0.073626
194,0.067308
51,0.066575
121,0.064469
1059,0.062912


In [11]:
# export split to csv
train_df['train'] = 0
train_df['val']=0
train_df['test']=0


In [12]:
train_df.loc[train_df.id.isin(list(X_train)),'train'] = 1
train_df.loc[train_df.id.isin(list(X_val)),'val'] = 1
train_df.loc[train_df.id.isin(list(X_test)),'test'] = 1
train_df.to_csv(output_dir+"/train_split.csv",index=False)

# TODO: Change Folder Structure - move .png according to train_split.csv

In [13]:
# TODO
# import os
# train_files = os.listdir(train_dir)
src_dir = "/mnt/disks/large/data/original_train/"

train_new_dir = "/mnt/disks/large/data/train_split/"
train_split_dir = train_new_dir + "train/"
val_split_dir = train_new_dir + "val/"
test_split_dir = train_new_dir + "test/"

In [14]:
import os, shutil, pathlib, fnmatch

def move_dir(src: str, dst: str, pattern: str = '*'):
    if not os.path.isdir(dst):
        pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    for f in fnmatch.filter(os.listdir(src), pattern):
        shutil.move(os.path.join(src, f), os.path.join(dst, f))
        


In [None]:
train_files=list(train_df[train_df.train==1].id)
train_files = [x+".png" for x in train_files]
for i,f in enumerate(train_files):
    move_dir(src_dir, train_split_dir, f)
    print("{}/{} files moved".format(i, len(train_files)))

In [None]:
val_files=list(train_df[train_df.val==1].id)
val_files = [x+".png" for x in val_files]
for i,f in enumerate(val_files):
    move_dir(src_dir, val_split_dir, f)
    print("{}/{} files moved".format(i, len(val_files)))

In [None]:
test_files=list(train_df[train_df.test==1].id)
test_files = [x+".png" for x in test_files]
for i,f in enumerate(test_files):
    move_dir(src_dir, test_split_dir, f)
    print("{}/{} files moved".format(i, len(test_files)))

# Training preprocessing

In [None]:
# pause for now since it can be done at run time using pytorch
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

# Val, Test Preprocessing

In [None]:
# pause for now since it can be done at run time using pytorch
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

# Training Per Channel Mean and Std - after preprocessing

In [25]:
# import os
# train_files = os.listdir(train_dir)
# do not use folder structure, use the newly created training data

train_files=list(train_df[train_df.train==1].id)
train_files = [x+".png" for x in train_files]

In [None]:
sum_x_rgb = np.array([0.0]*3)
sum_x2_rgb = np.array([0.0]*3)
file_count =0
pixel_count=0
for i, file in enumerate(train_files):
    if i%10==0:
        print("{}/{} files read".format(i+1, len(train_files)))
    file_count+=1
    
    image = cv2.imread(train_dir+"/"+file)
    pixel_count += image.shape[0]*image.shape[1]
    # R, G, B
    for i in range(3):
        sum_x_rgb[i] += np.sum(image[:,:,i].astype(float))
        sum_x2_rgb[i] += np.sum(image[:,:,i].astype(float)**2)
        
        

In [30]:
channel_mean = sum_x_rgb/pixel_count
channel_mean

array([144.2913033 , 155.92557575, 165.05713939])

In [31]:
channel_sd = np.sqrt(sum_x2_rgb/pixel_count-channel_mean**2)
channel_sd

array([64.59221603, 63.77200451, 63.93791888])

# TODO:  Generate small training sample for debugging 

sample by id from large training sample

In [15]:
sample_idx = np.random.choice(train_df[train_df.train==1].shape[0], 8000)
sample_train = list(train_df[train_df.train==1].iloc[sample_idx,:].id)
sample_train_files = [x+".png" for x in sample_train]

sample_idx = np.random.choice(train_df[train_df.val==1].shape[0], 1000)
sample_val = list(train_df[train_df.val==1].iloc[sample_idx,:].id)
sample_val_files = [x+".png" for x in sample_val]

sample_idx = np.random.choice(train_df[train_df.test==1].shape[0], 1000)
sample_test = list(train_df[train_df.test==1].iloc[sample_idx,:].id)
sample_test_files = [x+".png" for x in sample_test]

In [22]:
# to folder structure and copy those files to a separate folder
def copy_dir(src: str, dst: str, pattern: str = '*'):
    if not os.path.isdir(dst):
        pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
    for f in fnmatch.filter(os.listdir(src), pattern):
        shutil.copyfile(os.path.join(src, f), os.path.join(dst, f))

In [25]:
train_split_dir_sample =  "/mnt/disks/large/debug_data/train"
val_split_dir_sample =  "/mnt/disks/large/debug_data/val"
test_split_dir_sample =  "/mnt/disks/large/debug_data/test"

In [None]:
for i,f in enumerate(sample_train_files):
    copy_dir(train_split_dir,train_split_dir_sample , f)
    print("{}/{} files copied".format(i, len(sample_train_files)))

In [None]:
for i,f in enumerate(sample_val_files):
    copy_dir(val_split_dir,val_split_dir_sample , f)
    print("{}/{} files copied".format(i, len(sample_val_files)))

In [None]:
for i,f in enumerate(sample_test_files):
    copy_dir(test_split_dir,test_split_dir_sample , f)
    print("{}/{} files copied".format(i, len(sample_test_files)))

In [None]:
# compute channel mean and std
sum_x_rgb = np.array([0.0]*3)
sum_x2_rgb = np.array([0.0]*3)
file_count =0
pixel_count=0
for i, file in enumerate(sample_train_files):
    if i%10==0:
        print("{}/{} files read".format(i+1, len(sample_train_files)))
    file_count+=1
    
    image = cv2.imread(train_split_dir_sample+"/"+file)
    pixel_count += image.shape[0]*image.shape[1]
    # R, G, B
    for i in range(3):
        sum_x_rgb[i] += np.sum(image[:,:,i].astype(float))
        sum_x2_rgb[i] += np.sum(image[:,:,i].astype(float)**2)
        
        

In [32]:
channel_mean = sum_x_rgb/pixel_count
channel_mean

array([144.46578057, 156.0374637 , 165.11049366])

In [33]:
channel_sd = np.sqrt(sum_x2_rgb/pixel_count-channel_mean**2)
channel_sd

array([64.50104194, 63.66100116, 63.80019134])