In [1]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

I have a directory of images on which to train a classifier, but it is not yet divided into training and testing sets. The training directory is not divided into subdirectories according to its target classes. I have a csv of target information in which the ID ('GalaxyID') links the name of each image to its target data.

The purpose of this code is to take the target data, split it into three dataframes for training, testing, and validation, then use the IDs to move the corresponding images from the original images directory into training, testing, and validation directories which I have already created. The training images will then be divided according to image classes.

In [2]:
# Folder names used by the cells below. Every path is relative to the
# notebook's working directory and ends with '/'.
# data_dir is the folder the images are moved out of.
data_dir = '../data/images_training_rev1/'
# train_dir / test_dir / valid_dir receive the images of the matching split.
train_dir = '../data/train_images/'
test_dir = '../data/test_images/'
valid_dir = '../data/validation_images/'
# Per-class subfolder names, left disabled; not referenced by the cells visible here.
#class_dir = ['e_class/', 's_class/', 'i_class/', 'o_class/']

In [3]:
# read the CSV that holds the image IDs together with their target columns
targets_csv = '../data/target_data/targets.csv'
df = pd.read_csv(targets_csv)

In [4]:
# preview the first five rows to check the column layout
df.head(n=5)

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,100008,0.383147,0.616853,0.0,0.0,0.616853,0.038452,0.578401,0.418398,0.198455,...,0.0,0.279952,0.138445,0.0,0.0,0.092886,0.0,0.0,0.0,0.325512
1,100023,0.327001,0.663777,0.009222,0.031178,0.632599,0.46737,0.165229,0.591328,0.041271,...,0.018764,0.0,0.131378,0.45995,0.0,0.591328,0.0,0.0,0.0,0.0
2,100053,0.765717,0.177352,0.056931,0.0,0.177352,0.0,0.177352,0.0,0.177352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100078,0.693377,0.238564,0.068059,0.0,0.238564,0.109493,0.129071,0.189098,0.049466,...,0.0,0.094549,0.0,0.094549,0.189098,0.0,0.0,0.0,0.0,0.0
4,100090,0.933839,0.0,0.066161,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Column(s) that identify an image rather than describe it.
identifier_list = ['GalaxyID']
# Every remaining column is a target, except 'Unnamed: N' columns: that is the
# name pandas gives a header-less column (typically a row index written by an
# earlier to_csv call), and it must not be treated as a target value.
targets_list = [
    col for col in df.columns
    if col not in identifier_list and not str(col).startswith('Unnamed:')
]

In [6]:
# X keeps only the identifier column(s) (used to build image file names);
# y keeps every target column
X, y = df[identifier_list], df[targets_list]

In [7]:
# Two-stage split with a fixed seed: first set aside 33% of the rows as a
# hold-out, then cut that hold-out in half to obtain the test and validation sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.33, random_state=42
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [8]:
# glue each split's identifier column(s) back onto its targets, side by side,
# giving one dataframe per split
df_train, df_test, df_valid = (
    pd.concat([features, targets], axis=1, join='inner')
    for features, targets in (
        (X_train, y_train),
        (X_test, y_test),
        (X_valid, y_valid),
    )
)

In [9]:
def move_file(fn, old_dir, new_dir):
    """Move the file named ``fn`` from ``old_dir`` into ``new_dir``.

    Parameters
    ----------
    fn : str
        File name (no directory part), e.g. ``'100008.jpg'``.
    old_dir : str
        Directory that currently contains the file.
    new_dir : str
        Directory the file is moved into; the file keeps its name.

    Returns
    -------
    None

    Notes
    -----
    Paths are assembled with ``os.path.join`` so the directories work with or
    without a trailing separator. Any exception raised by ``shutil.move``
    (for example when the source file is missing) propagates to the caller.
    """
    source_path = os.path.join(old_dir, fn)
    target_path = os.path.join(new_dir, fn)
    shutil.move(source_path, target_path)

In [10]:
# move every image listed in the training dataframe into the training folder
for galaxy_id in df_train['GalaxyID']:
    move_file(f'{galaxy_id}.jpg', data_dir, train_dir)

In [11]:
# move every image listed in the testing dataframe out of the source folder
# and into the test folder named above
for galaxy_id in df_test['GalaxyID']:
    move_file(f'{galaxy_id}.jpg', data_dir, test_dir)

In [12]:
# move every image listed in the validation dataframe into the validation folder
for galaxy_id in df_valid['GalaxyID']:
    move_file(f'{galaxy_id}.jpg', data_dir, valid_dir)

In [13]:
# write each split's dataframe (identifiers plus targets) to its own CSV
for split_frame, csv_path in (
    (df_train, '../data/target_data/train_targets.csv'),
    (df_test, '../data/target_data/test_targets.csv'),
    (df_valid, '../data/target_data/valid_targets.csv'),
):
    split_frame.to_csv(csv_path)