In [12]:
import numpy as np
from sklearn import preprocessing
import pandas

import podomics

In [2]:
# Read the example CSV into a podomics dataset, naming the columns to use
# for the `sample` and `time` arguments.
omics = podomics.read_csv(
    "examples/exampledata1.csv",
    sample="Sample",
    time="Timepoint",
)

In [3]:
# Preview the first rows of the underlying table instead of rendering all of it.
omics.data.head(10)

Unnamed: 0_level_0,Timepoint,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T0R0,0.0,1.091971,0.892355,0.926898,0.854174,0.886741,0.826151,0.657382,0.655107,0.587586,...,0.00063,0.014822,0.020872,0.017413,0.085254,0.008887,0.058203,0.03918,0.038306,0.007283
T0R1,0.0,1.014665,1.01362,0.911735,0.863435,0.800986,0.767051,0.685029,0.584403,0.537806,...,0.017506,0.084409,0.004945,0.007825,0.030681,0.022009,0.005373,0.046301,0.033069,0.005454
T0R2,0.0,1.052723,0.936461,0.951647,0.860563,0.840253,0.700175,0.691526,0.601228,0.511231,...,0.076452,0.018898,0.043057,0.046036,0.091006,0.040499,0.051338,0.158052,0.010829,0.081645
T1R0,0.25,0.840889,0.763162,0.725462,0.711929,0.619594,0.621426,0.624011,0.508073,0.442128,...,0.102683,0.113124,0.175935,0.129763,0.141581,0.180714,0.198129,0.192937,0.23744,0.201601
T1R1,0.25,0.836676,0.718542,0.78854,0.710917,0.590258,0.623417,0.561478,0.469305,0.446682,...,0.112608,0.096585,0.195339,0.114256,0.157391,0.190442,0.172018,0.244459,0.211275,0.208334
T1R2,0.25,0.762993,0.76376,0.851142,0.7601,0.682543,0.631249,0.629989,0.513312,0.429939,...,0.11155,0.129665,0.143112,0.138439,0.166005,0.176165,0.267653,0.181388,0.211051,0.229709
T2R0,0.5,0.657435,0.621553,0.632048,0.589488,0.501403,0.47038,0.477851,0.371925,0.335059,...,0.14704,0.15919,0.276875,0.273507,0.312756,0.360023,0.379286,0.373678,0.440921,0.432045
T2R1,0.5,0.677215,0.597299,0.573971,0.523475,0.504239,0.485768,0.442679,0.380225,0.342189,...,0.184677,0.193307,0.226401,0.239087,0.294074,0.325034,0.343095,0.339842,0.424291,0.396736
T2R2,0.5,0.701883,0.63422,0.594034,0.537168,0.518636,0.473924,0.481943,0.330808,0.34808,...,0.159753,0.167062,0.215952,0.290881,0.304037,0.332979,0.340364,0.394729,0.38874,0.402211
T3R0,0.75,0.4093,0.445131,0.445437,0.442212,0.403627,0.311322,0.311874,0.271982,0.234673,...,0.293809,0.273318,0.36941,0.40863,0.420669,0.466449,0.505094,0.621099,0.614235,0.583076


In [4]:
# Display the dataset's `timepoints` attribute.
omics.timepoints

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

## Data rescaling

The recommended way to rescale the data is to divide each feature by a typical value, for example its maximum or median value. Log-scaling can be a good alternative. The implementation of these methods is illustrated below.

Z-score normalization or similar is not recommended, since it removes differences in variance among features that may contain relevant time-series information.

### Scaling by maximum value

Scaling with maximum value is the default in podomics and can be achieved by calling the `rescale()` method without any parameters.

In [5]:
# Rescale using the defaults (no arguments passed to `rescale()`).
omics_maxscaled = omics.rescale()

# Check: collect the maximum of every feature column in the rescaled data.
np.array(
    [
        omics_maxscaled.data[feature].max()
        for feature in omics_maxscaled.features
    ]
)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

The output shows that all features in the rescaled dataset have a maximum value of 1.

### Scaling by median

Unfortunately, there is currently no median scaler in `sklearn.preprocessing`. We provide a custom `MedianScaler` class in the `dataset` module that closely follows the implementation of the `MaxAbsScaler` class.

An instance of the `MedianScaler` class has to be passed to the `rescale()` method to scale with the median.

In [6]:
# Rescale by handing an explicit scaler instance to `rescale()` via `method`.
median_scaler = podomics.dataset.MedianScaler()
omics_medscaled = omics.rescale(method=median_scaler)

# Check: collect the median of every feature column in the rescaled data.
np.array(
    [
        omics_medscaled.data[feature].median()
        for feature in omics_medscaled.features
    ]
)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

The output shows that all features in the rescaled dataset have a median value of 1.

In [8]:
# Display the feature names of the median-scaled dataset.
omics_medscaled.features

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49']

### Log scaling

**TODO:** illustrate log scaling using `sklearn.preprocessing.FunctionTransformer`.

## Data plotting

In [9]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax); it does
# not contribute to the analysis — consider removing this cell.
np.allclose?

In [14]:
# Import only the name used below instead of a wildcard import, so it is clear
# where `Dataset` comes from.
from podomics.dataset import Dataset

In [17]:
    >>> df = pandas.read_csv("examples/exampledata1.csv", index_col="Sample")
    >>> omics = Dataset(df, time="Timepoint", features=['0', '1', '3', '4', '6'])
    >>> scaled = omics.rescale()
    >>> [scaled.data[f].max() for f in scaled.features]

[1.0,
 1.0,
 0.9516474640178868,
 1.0,
 1.0,
 0.8261512435688412,
 1.0,
 0.6551073154530921,
 0.5875860297898728,
 0.4797354925482742,
 0.4187241960429027,
 0.311319084420503,
 0.2930108918282412,
 0.2662218161682289,
 0.217655109405586,
 0.1287785385782666,
 0.1101888653744003,
 0.0973284060988426,
 0.1135775302049094,
 0.0798681635227758,
 0.0622985434343887,
 0.0726770804908452,
 0.057361483731124,
 0.0952893804364631,
 0.0636502577710839,
 0.077873623191062,
 0.0915543044449836,
 0.0884158512573791,
 0.0718151510684229,
 0.0698498274809565,
 0.0934331125718313,
 0.0851340246694963,
 0.0908971647422708,
 0.0887848181383799,
 0.1063883139794093,
 0.1562205191823662,
 0.1712334864151145,
 0.2031652117547283,
 0.219805758237921,
 0.2722647679462719,
 0.3230355966090932,
 0.4550062533168799,
 0.4529228049988377,
 0.5284319123357951,
 0.6284838796486079,
 0.6358005530866389,
 0.7188638642790988,
 0.7465293474876191,
 0.7983101679706505,
 0.8993032794573086]

In [18]:
# Display the feature list of the dataset built with an explicit `features` argument.
omics.features

['0', '1', '3', '4', '6']

In [19]:
# Preview the first rows of the underlying table instead of rendering all of it.
omics.data.head(10)

Unnamed: 0_level_0,Timepoint,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T0R0,0.0,1.091971,0.892355,0.926898,0.854174,0.886741,0.826151,0.657382,0.655107,0.587586,...,0.00063,0.014822,0.020872,0.017413,0.085254,0.008887,0.058203,0.03918,0.038306,0.007283
T0R1,0.0,1.014665,1.01362,0.911735,0.863435,0.800986,0.767051,0.685029,0.584403,0.537806,...,0.017506,0.084409,0.004945,0.007825,0.030681,0.022009,0.005373,0.046301,0.033069,0.005454
T0R2,0.0,1.052723,0.936461,0.951647,0.860563,0.840253,0.700175,0.691526,0.601228,0.511231,...,0.076452,0.018898,0.043057,0.046036,0.091006,0.040499,0.051338,0.158052,0.010829,0.081645
T1R0,0.25,0.840889,0.763162,0.725462,0.711929,0.619594,0.621426,0.624011,0.508073,0.442128,...,0.102683,0.113124,0.175935,0.129763,0.141581,0.180714,0.198129,0.192937,0.23744,0.201601
T1R1,0.25,0.836676,0.718542,0.78854,0.710917,0.590258,0.623417,0.561478,0.469305,0.446682,...,0.112608,0.096585,0.195339,0.114256,0.157391,0.190442,0.172018,0.244459,0.211275,0.208334
T1R2,0.25,0.762993,0.76376,0.851142,0.7601,0.682543,0.631249,0.629989,0.513312,0.429939,...,0.11155,0.129665,0.143112,0.138439,0.166005,0.176165,0.267653,0.181388,0.211051,0.229709
T2R0,0.5,0.657435,0.621553,0.632048,0.589488,0.501403,0.47038,0.477851,0.371925,0.335059,...,0.14704,0.15919,0.276875,0.273507,0.312756,0.360023,0.379286,0.373678,0.440921,0.432045
T2R1,0.5,0.677215,0.597299,0.573971,0.523475,0.504239,0.485768,0.442679,0.380225,0.342189,...,0.184677,0.193307,0.226401,0.239087,0.294074,0.325034,0.343095,0.339842,0.424291,0.396736
T2R2,0.5,0.701883,0.63422,0.594034,0.537168,0.518636,0.473924,0.481943,0.330808,0.34808,...,0.159753,0.167062,0.215952,0.290881,0.304037,0.332979,0.340364,0.394729,0.38874,0.402211
T3R0,0.75,0.4093,0.445131,0.445437,0.442212,0.403627,0.311322,0.311874,0.271982,0.234673,...,0.293809,0.273318,0.36941,0.40863,0.420669,0.466449,0.505094,0.621099,0.614235,0.583076
