# `etl_pivot`
The input is a set of parquet files (in long format) located in the same directory.

## Extract
`pd.read_parquet()` reads the input files. This requires a parquet engine (`pyarrow` or `fastparquet`) to be installed.

## Transformation
* transform from long to wide (pivot) format; missing values are filled with 0
* normalise (`ln`: log(x+1), `rank`: rank-based z-scores, or `none`: leave the values unchanged)

## Load
* write the transformed data to a CSV file

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from set_path import *

In [2]:
def p_to_z(data):
    """Convert cumulative probabilities to standard-normal z-scores.

    Applies the inverse CDF of the standard normal distribution
    (``stats.norm.ppf``) element-wise to ``data``.

    Parameters
    ----------
    data : scalar or array-like (e.g. list, ndarray, DataFrame)
        Probabilities. Values strictly between 0 and 1 give finite z-scores;
        0 and 1 map to -inf and +inf.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data`` (0-d for scalar input).
        Index/column labels of a DataFrame input are not preserved.
    """
    # ``ppf`` already operates on whole arrays, so no per-element Python loop
    # (np.vectorize) is needed; this also makes empty input valid.
    probabilities = np.asarray(data, dtype=float)
    return np.asarray(stats.norm.ppf(probabilities))

In [3]:
class TRANSFORM:
    """Read a long-format table, pivot it to wide format, normalise, export.

    Intended call order: ``read_pep`` -> ``long_to_pivot`` ->
    ``pep_normalise`` -> ``export_csv``.

    Parameters
    ----------
    pep : str
        Path handed to ``pd.read_parquet``.
    norm_method : str
        One of ``'none'``, ``'ln'`` or ``'rank'`` (see ``pep_normalise``).
    output1 : str
        Destination path handed to ``DataFrame.to_csv``.
    """

    # Normalisation modes understood by ``pep_normalise``.
    NORM_METHODS = ('none', 'ln', 'rank')

    def __init__(self, pep, norm_method, output1):
        # Keep the input path separately so that ``read_pep`` can be re-run.
        self.pep_path = pep
        # ``dfpep`` still starts out as the path (as before) and is replaced
        # by the DataFrame once ``read_pep`` has been called.
        self.dfpep = pep
        self.norm_method = norm_method
        self.output1 = output1

    def read_pep(self):
        """Load the parquet input into ``self.dfpep``."""
        self.dfpep = pd.read_parquet(self.pep_path)

    def long_to_pivot(self):
        """Pivot ``self.dfpep`` from long to wide format.

        Rows are indexed by ``ProbenID``, columns by ``Muster`` and the cells
        hold ``Amplitude``. Combinations missing from the long table are
        filled with 0.
        """
        wide = self.dfpep.pivot(index='ProbenID', columns='Muster', values='Amplitude')
        self.dfpep = wide.fillna(0)

    def pep_normalise(self):
        """Normalise ``self.dfpep`` according to ``self.norm_method``.

        * ``'none'`` - leave the values unchanged.
        * ``'ln'``   - natural log of (x + 1).
        * ``'rank'`` - rank within each column, scale to (0, 1) and convert
          to z-scores with ``p_to_z``.

        Prints the head and the shape of the result.

        Raises
        ------
        ValueError
            If ``self.norm_method`` is not one of ``NORM_METHODS``; previously
            such a value silently left the data un-normalised.
        """
        method = self.norm_method
        if method not in self.NORM_METHODS:
            raise ValueError(
                "unknown norm_method %r; expected one of %r"
                % (method, self.NORM_METHODS)
            )

        if method == 'ln':  # log(x+1) transformation
            self.dfpep = np.log(self.dfpep + 1)
        elif method == 'rank':  # to z-scores
            # Divide the ranks by (number of rows + 1) rather than using
            # rank(pct=True): a percentile of exactly 1 would map to
            # z = +infinity.
            n_rows = len(self.dfpep)
            percentiles = self.dfpep.rank() / (n_rows + 1)
            self.dfpep = pd.DataFrame(
                data=p_to_z(percentiles),
                index=percentiles.index,
                columns=percentiles.columns,
            )

        print(self.dfpep.head())
        print(self.dfpep.shape)

    def check_na(self):
        """Placeholder; currently performs no check."""
        pass

    def export_csv(self):
        """Write ``self.dfpep`` to ``self.output1`` as CSV."""
        self.dfpep.to_csv(self.output1)


In [4]:
# ALL: long -> pivot, rank-normalised, written to CSV.
# `f2` is not defined in this notebook; presumably it is a path prefix
# provided by `from set_path import *` -- verify.
x1 = TRANSFORM(
    pep=f2 + 'reduced_all/',
    norm_method='rank',
    output1=f2 + 'reduced_all_pivot.csv',
)
x1.read_pep()
x1.long_to_pivot()
x1.pep_normalise()
x1.export_csv()

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [7]:
# TEST: long -> pivot, rank-normalised.
# `f2` is not defined in this notebook; presumably it is a path prefix
# provided by `from set_path import *` -- verify.
x1 = TRANSFORM(
    pep=f2 + 'reduced_test/',
    norm_method='rank',
    output1=f2 + 'reduced_test_pivot.csv',
)
x1.read_pep()
x1.long_to_pivot()
x1.pep_normalise()
# Export is disabled in this cell; uncomment to write the CSV.
# x1.export_csv()

Muster                          99900421  99900966  99901132  99901155  \
ProbenID                                                                 
50188-Calcification-Urin-87521 -0.693804  0.773842  0.278683  0.951278   
50189-Calcification-Urin-87522  1.970505 -0.475899 -0.215640  1.052254   
50190-Calcification-Urin-87523 -0.693804  0.091835 -0.342855 -0.475899   
50191-Calcification-Urin-87524 -0.693804  0.408472 -0.030573 -0.693804   
50192-Calcification-Urin-87525  0.773842  1.052254  0.091835 -0.030573   

Muster                          99901274  99901532  99901555  99901661  \
ProbenID                                                                 
50188-Calcification-Urin-87521  1.165288  1.052254  0.951278  0.859175   
50189-Calcification-Urin-87522 -0.693804  1.656795  1.052254  1.165288   
50190-Calcification-Urin-87523 -0.545564  1.165288 -0.733236 -1.227826   
50191-Calcification-Urin-87524  0.342855  0.773842 -0.091835  0.693804   
50192-Calcification-Urin-87525  0.859

In [9]:
# TRAIN: long -> pivot, no normalisation.
# NOTE(review): this cell uses norm_method='none' while the ALL and test
# cells use 'rank' -- confirm that the difference is intended.
# `f2` is not defined in this notebook; presumably it is a path prefix
# provided by `from set_path import *` -- verify.
x1 = TRANSFORM(
    pep=f2 + 'reduced_train/',
    norm_method='none',
    output1=f2 + 'reduced_train_pivot.csv',
)
x1.read_pep()
x1.long_to_pivot()
x1.pep_normalise()
# Export is disabled in this cell; uncomment to write the CSV.
# x1.export_csv()

Muster                   99900259  99900340  99900421  99900467  99900579  \
ProbenID                                                                    
100174-Diamos-Urin-923     179.13      0.00    183.28    204.20      0.00   
100231-Diamos-Urin-980     475.77    436.30    376.23     75.78     90.33   
100236-Diamos-Urin-985     169.20      0.00    296.20      0.00      0.00   
100268-Diamos-Urin-1017      0.00    361.19      0.00      0.00      0.00   
100280-Diamos-Urin-1029    157.44    382.66    705.98      0.00     69.22   

Muster                   99900966  99901132  99901155  99901274  99901338  \
ProbenID                                                                    
100174-Diamos-Urin-923       0.00    780.57   1863.88    455.70      0.00   
100231-Diamos-Urin-980       0.00   4862.04   1018.31   1787.36    237.70   
100236-Diamos-Urin-985     421.72   1105.46    492.35   1089.96    318.64   
100268-Diamos-Urin-1017      0.00   2329.26   1440.38      0.00      0.00  