# 03 :: Stratified Sampling

**Objectives:**
* split the data into training and test sets, stratified by the major categories: AGE, GT, ...


In [None]:
# %load common.py
import pandas as pd
import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots drawn in this notebook.
sns.set(style="whitegrid")

# Commonly used constants.

# Identifiers kept in their original order (presumably one per slide --
# TODO confirm what the B/N prefixes stand for).
slides = [
    'B02_D1', 'B02_E1', 'B03_C2', 'B03_D2',
    'B04_D1', 'B04_E1', 'B05_D2', 'B05_E2',
    'B06_E1', 'B07_C2',
] + [
    'N02_C1', 'N02_D1', 'N03_C2', 'N03_D2',
    'N04_D1', 'N04_E1', 'N05_C2', 'N05_D2',
    'N06_D2', 'N07_C1',
]

# String keys shared across the notebook. AGE, GT, MONTHS, AGE_GT and SPOT_UID
# are used as DataFrame column names below; the others are presumably column
# names as well (verify against the dataset schema).
GT = 'GT'
YEN = 'AB1_StdDev_Yen'
REGION = 'Region_predict'

ASTROCYTE = 'astrocyte'
NEURON = 'neuron'
INTERNEURON = 'interneuron'

AGE = 'age'
DAYS = 'age_days'
MONTHS = 'age_months'
AGE_GT = 'age_GT'

SAMPLE_ID = 'sampleID'
SPOT_UID = 'spot_UID'

# The merged data file in Parquet format.

# Directory holding the Parquet data, and the full dataset inside it.
parquet = '/media/tmo/data/work/datasets/02_ST/parquet/'
st_full = os.path.join(parquet, 'st_full')

def read_full(path=st_full):
    """Read a Parquet dataset into a pandas DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the Parquet data to read. Defaults to ``st_full``.

    Returns
    -------
    pandas.DataFrame
        The data read from ``path``.
    """
    # Read from the caller-supplied location, not unconditionally from the
    # module-level default, so that a custom ``path`` is actually honoured.
    return pd.read_parquet(path)

def enrich(full):
    """Add the derived ``AGE`` and ``AGE_GT`` columns to ``full``.

    The frame is modified in place and the same object is returned.

    * ``AGE`` is 'young' where ``MONTHS`` is below 10, otherwise 'old'.
    * ``AGE_GT`` is the ``AGE`` and ``GT`` values of each row joined with '_'.
    """
    # Age class (young, old), derived from the age in months.
    is_young = full[MONTHS] < 10
    full[AGE] = np.where(is_young, 'young', 'old')

    # Combined age/GT label, built row by row.
    age_and_gt = full[[AGE, GT]]
    full[AGE_GT] = age_and_gt.apply(lambda row: '_'.join(row), axis=1)

    return full

def read_enriched(path=st_full):
    """Load the dataset via ``read_full(path)`` and pass it through ``enrich``.

    Returns the DataFrame returned by ``enrich``.
    """
    loaded = read_full(path)
    return enrich(loaded)
    
# TODO
# * add a 'coarse_region' column that merges all regions with fewer than 500 entries into a single region OTHER

In [None]:
# Load the dataset from the default path, including the derived columns added by enrich().
full = read_enriched()

In [5]:
from sklearn.model_selection import train_test_split

In [10]:
# Single-column frames: the spot identifier as X, the age class as y.
X = full[[SPOT_UID]]
y = full[[AGE]]

# Hold out 10% of the rows as the test set, stratified on the age class;
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y[AGE],
    random_state=3,
)

In [11]:
# Summarise the training split: number of entries, columns, dtypes and memory usage.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9294 entries, 199 to 134
Data columns (total 1 columns):
spot_UID    9294 non-null object
dtypes: object(1)
memory usage: 145.2+ KB


In [12]:
# Count the rows per age class in the training split.
y_train[AGE].value_counts()

young    4776
old      4518
Name: age, dtype: int64

In [13]:
train_test_split?

Signature: train_test_split(*arrays, **options)
Docstring:
Split arrays or matrices into random train and test subsets

Quick utility that wraps input validation and
``next(ShuffleSplit().split(X, y))`` and application to input data
into a single call for splitting (and optionally subsampling) data in a
oneliner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float, int, or None (default is None)
    If float, should be between 0.0 and 1.0 and represent the
    proportion of the dataset to include in the test split. If
    int, represents the absolute number of test samples. If None,
    the value is automatically set to the complement of the train size.
    If train size is al