In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import sys
import shutil

from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Hook tqdm into pandas; progress output is sent to stdout and drawn with ASCII characters.
tqdm.pandas(file=sys.stdout, ascii=True)

## 0. Config

In [2]:
# Seed passed as `random_state` to the train/test splits.
RANDOM_SEED = 42
# Tab-separated metadata table (it is read with sep='\t').
METADATA_FILE_PATH = '../data/metadata/metadata.tsv'
# Root image directory; it holds one sub-directory per entry of RES_SUFFICES.
# NOTE(review): "IMAGS" looks like a typo for "IMAGES"; the name is referenced by other cells, so it is left as is.
IMAGS_DIR_PATH = '../data/images'
# Resolution sub-directory names; index 1 ('128pix') is the directory listed to discover image files.
RES_SUFFICES = ['69pix', '128pix']

# Output directory for the exported metadata files and image folders.
PACKAGE_DIR = '../data/export'

## 1. Load Data

In [3]:
# Read the tab-separated metadata table, report its (rows, columns) shape
# and preview the first three rows.
metadata = pd.read_csv(filepath_or_buffer=METADATA_FILE_PATH, delimiter='\t')
print(metadata.shape)
metadata.head(n=3)

(73627, 5)


Unnamed: 0,SDSS_ID,M/L,L_g,distance_Mpc,galsize_kpc
0,1237662301367173202,2.398229,8880467000.0,326.142854,48.655727
1,1237662301905813735,0.974585,7007162000.0,293.571417,38.145197
2,1237662534360301834,0.678322,10474460000.0,167.142861,30.677076


In [4]:
# images
# The 128px directory is used to discover which image files exist.
IMAGES_128PX_DIR_PATH = IMAGS_DIR_PATH + '/' + RES_SUFFICES[1]

# Keep only entries named "<integer id>.<ext>". os.listdir also returns hidden
# files and folders (e.g. .DS_Store, .ipynb_checkpoints, Thumbs.db) whose stem
# is not an integer and would make int() below raise ValueError.
# sorted() makes the listing deterministic (os.listdir order is arbitrary).
image_file_names = sorted(
    name for name in os.listdir(IMAGES_128PX_DIR_PATH)
    if name.split('.')[0].isdecimal()
)
image_file_paths = [IMAGES_128PX_DIR_PATH + '/' + path for path in image_file_names]

# The file stem is the SDSS object id; it is the join key against the metadata table.
image_ids = [int(name.split('.')[0]) for name in image_file_names]
image_path_df = pd.DataFrame({'SDSS_ID': image_ids, 'image_name': image_file_names})
image_path_df.head(3)

Unnamed: 0,SDSS_ID,image_name
0,1237645879578460255,1237645879578460255.png
1,1237645941824356443,1237645941824356443.png
2,1237645943974396134,1237645943974396134.png


In [5]:
# Merge data
# Left join: every metadata row is kept; a row whose SDSS_ID has no image file
# gets NaN in `image_name`.
# Dropping an already-present `image_name` column first makes this cell safe
# to re-run; otherwise a second run would produce image_name_x / image_name_y.
metadata = pd.merge(
    left=metadata.drop(columns=['image_name'], errors='ignore'),
    right=image_path_df,
    how='left',
    on='SDSS_ID',
)
metadata.head(3)

Unnamed: 0,SDSS_ID,M/L,L_g,distance_Mpc,galsize_kpc,image_name
0,1237662301367173202,2.398229,8880467000.0,326.142854,48.655727,1237662301367173202.png
1,1237662301905813735,0.974585,7007162000.0,293.571417,38.145197,1237662301905813735.png
2,1237662534360301834,0.678322,10474460000.0,167.142861,30.677076,1237662534360301834.png


In [6]:
# Shuffle data
# The shuffle must be seeded: the train/test split further down selects rows
# of this shuffled table by position, so an unseeded shuffle would put
# different galaxies into each split on every run.
metadata = metadata.sample(frac=1, replace=False, random_state=RANDOM_SEED)

In [7]:
# Sanity check: no cell of the table may be missing. Because the image names
# were attached with a left join, this also fails when a metadata row has no
# matching image file.
assert not metadata.isna().values.any()

## 2. Split train, test_public and test_private

Split proportions: train (70%), test_public (10%), test_private (20%).

In [8]:
# Sample counts per split: 70% train, 10% public test; the private test set
# takes whatever remains so the three sizes always add up to the row count.
n_size = len(metadata)
train_size = int(0.7 * n_size)
test_public_size = int(0.1 * n_size)
test_private_size = n_size - (train_size + test_public_size)

for label, size in (
    ('train_size', train_size),
    ('test_public_size', test_public_size),
    ('test_private_size', test_private_size),
):
    print(f'{label}: {size}')

train_size: 51538
test_public_size: 7362
test_private_size: 14727


In [9]:
# Split id
# Seed NumPy's global RNG from the shared config constant (instead of a second
# hard-coded 42) so that any later call drawing from it is repeatable too.
np.random.seed(RANDOM_SEED)
indexer = np.arange(metadata.shape[0])

# Step 1: carve the whole test pool (public + private) off the row positions.
train_indexer, test_indexer = train_test_split(indexer,
                                               test_size=(test_public_size + test_private_size),
                                               random_state=RANDOM_SEED,
                                               shuffle=True)
# Step 2: divide the test pool into its public and private parts.
test_public_indexer, test_private_indexer = train_test_split(test_indexer,
                                                             test_size=test_private_size,
                                                             random_state=RANDOM_SEED,
                                                             shuffle=True)

# Assert shape match
assert len(train_indexer) == train_size
assert len(test_public_indexer) == test_public_size
assert len(test_private_indexer) == test_private_size

# Assert the three splits are disjoint and together cover every row position
assert len(set(train_indexer) | set(test_public_indexer) | set(test_private_indexer)) == len(indexer)

In [10]:
# Fetch the corresponding rows: select each split by row position and give
# every resulting frame a fresh 0..n-1 index.
train_meta, test_public_meta, test_private_meta = (
    metadata.iloc[split_indexer].reset_index(drop=True)
    for split_indexer in (train_indexer, test_public_indexer, test_private_indexer)
)

## 3. Move & Export files

Data released to public:
- `train_metadata.tsv` containing true M/L values
- `train_images_69pix.zip`
- `train_images_128pix.zip`
- `test_metadata.tsv` without M/L values
- `test_images_69pix.zip`
- `test_images_128pix.zip`
- `sample_submission.tsv` sample submission file

Data used for evaluation:
- `evaluation_test_public_metadata.tsv`
- `evaluation_test_private_metadata.tsv`

In [11]:
# Ensure export directory is created.
# exist_ok=True replaces the separate exists() check: it is a no-op when the
# directory is already there and fails loudly if the path is a regular file.
os.makedirs(PACKAGE_DIR, exist_ok=True)

### 3.1. Compile metadata files

In [12]:
# Training metadata: an independent copy of the train split, written as a
# tab-separated file without the index column.
train_metadata = train_meta.copy(deep=True)
train_metadata.to_csv(f'{PACKAGE_DIR}/train_metadata.tsv', sep='\t', index=False)

In [13]:
# Create public metadata for test datasets: public rows followed by private rows.
test_metadata = pd.concat([test_public_meta, test_private_meta], axis=0, sort=False)

# Shuffle data to prevent guessing of public/private split from row order.
# The shuffle is seeded explicitly so the exported file does not depend on the
# global RNG state or on which cells were executed before this one.
# The true M/L values are removed because this file is released to participants.
test_metadata = (
    test_metadata
    .sample(frac=1, replace=False, random_state=RANDOM_SEED)
    .reset_index(drop=True)
    .drop(columns=['M/L'])
)

test_metadata.to_csv(PACKAGE_DIR + '/test_metadata.tsv', sep='\t', index=False)

In [14]:
# Internal versions for evaluation: one tab-separated file per test split,
# written from the split frames as they are, without the index column.
for file_stem, split_meta in (
    ('evaluation_test_public_metadata', test_public_meta),
    ('evaluation_test_private_metadata', test_private_meta),
):
    split_meta.to_csv(f'{PACKAGE_DIR}/{file_stem}.tsv', sep='\t', index=False)

### 3.2. Copy image files

In [15]:
# Create one output folder per (split, resolution) pair.
# exist_ok=True makes the call a no-op for folders that already exist, so no
# separate exists() check is needed.
for folder_prefix in ('train_images_', 'test_images_'):
    for res_suffix in RES_SUFFICES:
        dir_path = PACKAGE_DIR + '/' + folder_prefix + res_suffix
        os.makedirs(dir_path, exist_ok=True)

In [16]:
# Copy train images (the source files stay in place) at every resolution.
# The source root comes from IMAGS_DIR_PATH rather than a second hard-coded
# path, so changing the config cell is enough to relocate the input data.
for img_name in tqdm(train_metadata['image_name'].values, file=sys.stdout, ascii=True):
    for res_suffix in RES_SUFFICES:
        _from = IMAGS_DIR_PATH + '/' + res_suffix + '/' + img_name
        _to = PACKAGE_DIR + '/train_images_' + res_suffix + '/' + img_name
        shutil.copyfile(_from, _to)

100%|##########################################################################| 51538/51538 [00:39<00:00, 1294.79it/s]


In [17]:
# Copy test images (public and private together; the source files stay in
# place) at every resolution.
# The source root comes from IMAGS_DIR_PATH rather than a second hard-coded
# path, so changing the config cell is enough to relocate the input data.
for img_name in tqdm(test_metadata['image_name'].values, file=sys.stdout, ascii=True):
    for res_suffix in RES_SUFFICES:
        _from = IMAGS_DIR_PATH + '/' + res_suffix + '/' + img_name
        _to = PACKAGE_DIR + '/test_images_' + res_suffix + '/' + img_name
        shutil.copyfile(_from, _to)

100%|##########################################################################| 22089/22089 [00:18<00:00, 1193.07it/s]


### 3.3. Create sample submission file

In [18]:
# Sample submission: keep only the id column of the released test metadata
# and add a constant placeholder prediction of 0.0 for every row.
sample_submission = test_metadata[['SDSS_ID']].assign(pred_ml=0.0)

sample_submission.to_csv(f'{PACKAGE_DIR}/sample_submission.tsv', sep='\t', index=False)