In [35]:
from helpers import *
import pandas as pd
import datetime
import os

In [36]:
from __init__ import const_datasetLibDir

In [37]:
def process_columns(x):
    '''Coerce a single cell value into the format used by the dataset library.

    Rules, applied in order:
      * float -> returned unchanged
      * int   -> converted to float
      * text  -> parsed as '%Y-%m-%d %H:%M:%S', then as '%Y-%m-%d'; when
                 neither format matches the text itself is returned
      * anything else (None, bool, containers, ...) -> None

    Exact type checks are kept, so subclasses such as bool still map to
    None exactly as before.

    Parameters:
        x: the raw cell value.

    Returns:
        float, datetime.datetime, the original text, or None.
    '''
    kind = type(x)
    if kind is float:
        return x
    if kind is int:
        return float(x)

    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so the text
    # handling below works on both without naming the Python-2-only builtin.
    text_type = type(u'')
    if kind is text_type and text_type is not str:
        try:
            # Python 2 only: prefer a byte string when the text is pure ASCII.
            x = str(x)
        except UnicodeError:
            # Non-ASCII text stays unicode. Previously it fell through to the
            # final branch and was silently replaced by None.
            pass

    if type(x) not in (str, text_type):
        return None

    # Try the most specific format first; strptime signals a mismatch with
    # ValueError, which is the only error worth swallowing here.
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return datetime.datetime.strptime(x, fmt)
        except ValueError:
            continue
    return x

In [51]:
def browse():
    '''Browse the library of datasets that are available. Returns a dataframe.

    The 'dev' and 'prod' sub-directories of const_datasetLibDir are scanned.
    Every folder inside them is treated as one dataset whose name is split on
    '_' and read by position:

        <market>_<dataset ...>_<interval>_<start>_<end>_<updated>

    where start/end use '%Y-%m-%d' and updated uses '%Y-%m-%d %H%M%S'. All
    parts between the market and the interval form the dataset name and are
    joined with ', '.

    Returns:
        pandas.DataFrame with one row per dataset folder and the columns
        env, market, dataset, start, end, days, interval, updated, dir, files.
        'days' is end - start, 'dir' is the full folder path and 'files' is
        the ', '-joined, sorted list of regular files in the folder.

    Raises:
        OSError: if the 'dev' or 'prod' directory is missing.
        IndexError: if a folder name has fewer than four '_'-separated parts.
        ValueError: if a date part does not match its expected format.
    '''
    # Search the directory (dev/prod are required): one record per file.
    records = []
    for env in ['dev', 'prod']:
        libDir = os.path.join(const_datasetLibDir, env)
        for folder in os.listdir(libDir):
            libDirFolder = os.path.join(libDir, folder)
            if not os.path.isdir(libDirFolder):
                continue
            for fname in sorted(os.listdir(libDirFolder)):
                if os.path.isfile(os.path.join(libDirFolder, fname)):
                    records.append((env, libDirFolder, folder, fname))

    dataset_lib = pd.DataFrame(records, columns=['env', 'dir', 'folder', 'files'])

    # Split each folder name once, then pick the fields by position.
    parts = dataset_lib['folder'].apply(lambda name: name.split('_'))
    dataset_lib['market'] = parts.apply(lambda p: p[0])
    # Join the middle parts directly. Going through str(list) and stripping
    # quotes/brackets mangled names that legitimately contain those characters.
    dataset_lib['dataset'] = parts.apply(lambda p: ', '.join(p[1:-4]))
    dataset_lib['start'] = parts.apply(lambda p: p[-3])
    dataset_lib['end'] = parts.apply(lambda p: p[-2])
    dataset_lib['interval'] = parts.apply(lambda p: p[-4])
    dataset_lib['updated'] = parts.apply(lambda p: p[-1])
    dataset_lib = dataset_lib.reindex(
        ['env', 'market', 'dataset', 'start', 'end', 'interval', 'updated', 'dir', 'files'],
        axis=1)

    # Process the date fields.
    dataset_lib['start'] = pd.to_datetime(dataset_lib['start'], format='%Y-%m-%d')
    dataset_lib['end'] = pd.to_datetime(dataset_lib['end'], format='%Y-%m-%d')
    dataset_lib['days'] = dataset_lib['end'] - dataset_lib['start']
    dataset_lib['updated'] = pd.to_datetime(dataset_lib['updated'], format='%Y-%m-%d %H%M%S')

    # Group and concat the file list into one field per dataset folder.
    group_keys = ['env', 'market', 'dataset', 'start', 'end', 'days', 'interval', 'updated', 'dir']
    dataset_lib = dataset_lib.groupby(group_keys)['files'].apply(lambda names: ', '.join(names))
    dataset_lib = dataset_lib.reset_index()

    return dataset_lib

In [52]:
# This is slow because every cell is visited element-wise. Converting only the
# columns that need it would vastly increase the speed.
def get(libIndex):
    '''Load every file of one dataset from the library.

    Parameters:
        libIndex: positional row index into the dataframe returned by browse().

    Returns:
        dict mapping each file name (the part before its first '.') to a
        pandas.DataFrame. Each file is read as tab-separated text with the
        first column as the index, every cell is passed through
        process_columns, and duplicate rows are dropped.
    '''
    row = browse().iloc[libIndex]
    folder_path = row['dir']
    files = row['files'].split(', ')

    # DataFrame.applymap is deprecated in newer pandas in favour of
    # DataFrame.map; look the method up on the class so a column that happens
    # to be called 'map' cannot shadow it, and fall back for older versions.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap

    data = {}
    for i, fname in enumerate(files):
        progress('Processing: ' + fname + ' | ' + str(i+1) + ' of ' + str(len(files)))
        # 'dir' from browse() already starts with const_datasetLibDir. Joining
        # the base directory again only worked for absolute bases and produced
        # a doubled prefix for relative ones.
        dir_file = os.path.join(folder_path, fname)
        read = pd.read_csv(dir_file, sep='\t', index_col=0, parse_dates=True)
        read = elementwise(read, process_columns)
        # drop_duplicates already returns a new frame, so no copy is needed.
        data[fname.split('.')[0]] = read.drop_duplicates()
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print('\nFiles processed')

    return data

In [40]:
#lib = browse()
#dataset = get(1)
#dataset['symbols']  # keys are the file names without their extension