This short notebook creates a file of input data from CNN-encoded QPESUMS.

In [1]:
import numpy as np
import pandas as pd
import os, logging

# Directory searched (recursively) for the encoded QPESUMS '.npy' files
xpath = '../data/qpesums_encoded'
# CSV file holding the output table; it is joined to the inputs on its 'date' column
ypath = '../examples/data/t1hr.csv'

# Show INFO-level messages so the progress logs emitted by loadIOTab are displayed
logging.basicConfig(level=logging.INFO)

# Load input/output data for model
def loadIOTab(srcx, srcy, dropna=False):
    """Pair encoded-input files with the rows of the output table by date.

    Walks ``srcx`` recursively for files ending in ``.npy``, derives a ``date``
    key from each file name by removing ``.enc.npy``, reads the CSV at ``srcy``
    and inner-joins both tables on ``date``.

    Parameters
    ----------
    srcx : str
        Directory that is searched recursively for ``.npy`` files.
    srcy : str
        Path of a UTF-8 encoded CSV file that contains a ``date`` column.
    dropna : bool, default False
        When True, rows containing any NA value are removed after the merge.

    Returns
    -------
    pandas.DataFrame
        The CSV columns plus an ``xuri`` column with the path of the matching
        input file, ordered by ``date``. Only dates present on both sides are
        kept. The frame is empty (but has all columns) when no input file is
        found.
    """
    # Read raw input and output
    logging.info("Reading input X from: "+ srcx)
    xfiles = [
        {'date': fn.replace('.enc.npy', ''), 'xuri': os.path.join(root, fn)}
        for root, _dirs, files in os.walk(srcx)
        for fn in files
        if fn.endswith('.npy')
    ]
    # Explicit columns keep the merge below working when no file was found;
    # an empty DataFrame built from [] would have no 'date' column at all.
    xfiles = pd.DataFrame(xfiles, columns=['date', 'xuri'])
    logging.info("... read input size: "+str(xfiles.shape))
    logging.info("Reading output Y from: "+ srcy)
    yraw = pd.read_csv(srcy, encoding='utf-8')
    # File-name keys are strings, so the CSV dates must be strings to match
    yraw['date'] = yraw['date'].apply(str)
    logging.info("... read output size: "+str(yraw.shape))
    # Create complete IO-data
    # NOTE: the message mentions splitting, but this function only merges.
    logging.info("Pairing X-Y and splitting training/testing data.")
    iotab = pd.merge(yraw, xfiles, on='date', sort=True)
    logging.info("... data size after merging: "+str(iotab.shape))
    # Drop NA if specified
    if dropna:
        logging.info('Dropping records with NA')
        iotab = iotab.dropna()
        logging.info("... data size after dropping-NAs: "+str(iotab.shape))
    # Done
    return iotab

# Pair each encoded input file with its row of the output table, then preview
iotab = loadIOTab(srcx=xpath, srcy=ypath)
print(iotab.shape, iotab.head(), sep='\n')


INFO:root:Reading input X from: ../data/qpesums_encoded
INFO:root:... read input size: (34402, 2)
INFO:root:Reading output Y from: ../examples/data/t1hr.csv
INFO:root:... read output size: (35064, 46)
INFO:root:Pairing X-Y and splitting training/testing data.
INFO:root:... data size after merging: (32953, 47)


(32953, 47)
         date  C0A580  C0A970  466940  C0A540  C0A550  C0A9A0  C0AC60  C0A870  \
0  2013010109     0.0     0.5     0.5     0.0     0.0     0.0     0.0     0.5   
1  2013010110     0.5     1.5     1.0     0.5     0.5     0.0     0.0     0.5   
2  2013010111     0.5     1.0     0.5     1.0     1.5     0.0     0.5     0.5   
3  2013010112     0.5     4.0     1.0     1.5     1.5     1.0     0.5     0.5   
4  2013010113     0.5     6.0     1.0     0.0     1.5     0.5     0.5     1.0   

   466920  ...  C0AD50  C0A9B0  C0A560  C0A950  C0A940  C0A570  C0A980  \
0     NaN  ...     0.0     0.0     0.5     0.0     0.5     0.0     0.0   
1     0.5  ...     0.0     0.5     1.5     0.5     0.5     1.0     0.5   
2     NaN  ...     0.0     0.0     0.5     0.0     1.5     1.0     0.0   
3     1.5  ...     0.5     0.0     0.0     1.0     2.5     0.5     0.5   
4     1.0  ...     0.5     1.0     0.0     1.5     3.0     0.5     1.0   

   C0A9C0  C0AD40                                       

In [22]:
# Read every file listed in iotab['xuri']; only element [0] of each stored array is kept
xs = [np.load(enc_path)[0] for enc_path in iotab['xuri']]

print(np.array(xs).shape)

(32953, 64)


In [28]:
# Build the table of encoded features: one 'date' column followed by enc0..encN-1.
enc_values = pd.DataFrame(xs)
# Take the number of encoded dimensions from the data instead of hard-coding it
enc_values.columns = ['enc'+str(i) for i in range(enc_values.shape[1])]
# concat(axis=1) aligns rows on the index, and xs is ordered by position in iotab,
# so reset the date index to 0..n-1 to pair rows positionally even if iotab's
# index has gaps (e.g. when rows were dropped).
encqpesums = pd.concat([iotab['date'].reset_index(drop=True), enc_values], axis=1)
print(encqpesums.shape)
print(encqpesums.head())

(32953, 65)
         date      enc0      enc1      enc2     enc3      enc4      enc5  \
0  2013010109 -0.022890 -0.711302 -0.012069 -0.11087 -0.986854 -0.277102   
1  2013010110 -0.026171 -0.711302 -0.012069 -0.11087 -0.986854 -0.277102   
2  2013010111 -0.015834 -0.711302 -0.012069 -0.11087 -0.986854 -0.277102   
3  2013010112 -0.004246 -0.711302 -0.012069 -0.11087 -0.795049 -0.277102   
4  2013010113  0.000866 -0.711302 -0.012069 -0.11087 -0.633735 -0.277102   

       enc6      enc7      enc8  ...     enc54    enc55    enc56     enc57  \
0  0.093142 -0.006676  0.011295  ...  3.827515 -0.04449 -0.06204 -0.070955   
1  0.093142 -0.006676  0.011295  ...  3.966871 -0.04449 -0.06204 -0.070955   
2  0.093142 -0.006676  0.011295  ...  2.614695 -0.04449 -0.06204 -0.070955   
3  0.093142 -0.006676  0.011295  ...  2.272223 -0.04449 -0.06204 -0.070955   
4  0.093142 -0.006676  0.011295  ...  2.191800 -0.04449 -0.06204 -0.070955   

      enc58     enc59     enc60     enc61     enc62     enc63 

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for each dimension of the encoded vectors
pd.DataFrame(xs).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
count,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,...,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0,32953.0
mean,-0.001037,0.189947,-0.01166,-0.021695,0.662522,-0.190714,0.061166,-0.004377,-0.003982,0.130515,...,1.308679,-0.042524,-0.070224,-0.084058,-0.004572,-0.004176,0.003899,-0.009679,-0.01335,-0.036237
std,0.007436,1.243145,0.009794,0.668624,1.606837,0.662818,0.291983,0.044794,0.104568,0.687556,...,1.137879,0.033215,0.078416,0.109167,0.049657,0.240684,0.020897,0.028106,0.094853,0.026042
min,-0.161084,-0.711302,-0.012069,-0.11087,-0.986854,-0.277102,-6.924861,-0.006676,-2.70609,0.045655,...,-1.311774,-0.04449,-2.16547,-2.641572,-4.390481,-8.438606,-1.281361,-0.011801,-0.018822,-0.039717
25%,0.000866,-0.711302,-0.012069,-0.11087,-0.498064,-0.277102,0.093142,-0.006676,0.011295,0.045655,...,0.650155,-0.04449,-0.06204,-0.070955,-0.002925,0.015202,0.004673,-0.011801,-0.018822,-0.039717
50%,0.000866,-0.254669,-0.012069,-0.11087,0.417512,-0.277102,0.093142,-0.006676,0.011295,0.045655,...,1.349309,-0.04449,-0.06204,-0.070955,-0.002925,0.015202,0.004673,-0.011801,-0.018822,-0.039717
75%,0.000866,0.743521,-0.012069,-0.11087,1.365176,-0.277102,0.093142,-0.006676,0.011295,0.045655,...,2.072802,-0.04449,-0.06204,-0.070955,-0.002925,0.015202,0.004673,-0.011801,-0.018822,-0.039717
max,0.000866,11.579381,0.684289,14.971416,12.90382,14.865412,0.093142,2.442583,0.011295,13.846336,...,5.698771,1.599839,-0.06204,-0.070955,-0.002925,0.015202,0.004673,0.937994,4.335531,0.442013


In [39]:
# Persist the results: the table (date + enc columns) as CSV and pickle,
# and the encoded vectors alone (no date column) as a NumPy array.
# NOTE(review): the CSV/pickle are written under 'data/' while the .npy goes to
# '../data/' — confirm the two different target directories are intended.
encqpesums.to_csv('data/encqpesums.csv', index=False)
encqpesums.to_pickle('data/encqpesums.pkl')
np.save('../data/encqpesums.npy', np.array(xs))