In [None]:
from __future__ import division

import numpy as np

import time
from itertools import product

In [None]:
# Labels of the two user groups (cf. NUM_OF_AT_USER / NUM_OF_LT_USER).
status = ['AT', 'LT']

# ---------------------------------------------------------------------------
# Active configuration: "16.02" (load data of Feb, 16)
# ---------------------------------------------------------------------------
# Month labels exactly as they appear in the csv file names. load_data uses
# the LAST `used_month` entries of this list.
month = ['10', '11', '12']
# Number of days (= data columns per csv row) for every month label.
day = dict(zip(month, [31, 30, 31]))
# Common path prefix; load_data appends '<month>_<status>_<property>.csv'.
data_prefix = 'ori-data/sorted-data-02/sorted-ZDJM_3GD_02_2015'
# The order of properties MATTERS: the position of a property is its index on
# the last axis of the loaded array, and 'STATE' fills every channel from its
# own position onwards, so it has to stay last.
properties = [
    'BILL',
    'CALL',
    'CREDIT',
    'DEPOSIT',
    'PAYDEPOSIT',
    'INCOME',
    'STREAM',
    'STATE',
]
# ------------------------------ end of 16.02 -------------------------------

# ---------------------------------------------------------------------------
# Alternative configuration: "16.03" (load data of March, 16).
# Comment out the 16.02 section above and uncomment the lines below to use it.
# ---------------------------------------------------------------------------
# month = ['1511', '1512', '1601']
# day = {'1511':30, '1512':31, '1601':31}
# data_prefix = 'ori-data/sorted-data-03/sorted-ZDJM_3GD_03_20'
# # the order of properties MATTERS!!!
# properties = ['BILL', 'CALL', 'CREDIT', 'DEPOSIT',
#               'PDEPOSIT', 'INCOME', 'STREAM', 'STATE']
# ------------------------------ end of 16.03 -------------------------------

In [None]:
# State codes accepted in the STATE csv files. The empty string is a valid
# code, so an empty csv field is accepted as a state too.
used_st = ['0','1','2','4','5','6',
           '7','9','A','B','F','K',
           'L','O','T','U','']
# Maps every state code to its position in used_st; state_to_np uses that
# position as the index of the code's channel on the last axis.
st_to_id = {code: idx for idx, code in enumerate(used_st)}

In [None]:
# used_day maps "number of used months" -> total number of days in that many
# most recent months (the last entries of `month`).
# Example: if month = ['1511', '1512', '1601'] and
# day = {'1511':30, '1512':31, '1601':31}, then used_day = {1:31, 2:62, 3:92}.
#
# The count starts at 1 on purpose: month[-0:] is the WHOLE list, so a key 0
# would silently map to the total of all months instead of to zero days.
used_day = {}
for used_month in range(1, len(month) + 1):
    used_day[used_month] = sum(day[m] for m in month[-used_month:])

In [None]:
# ---------------------------------------------------------------------------
# Active configuration: "16.02" (load data of Feb, 16)
# Number of rows load_data allocates for (and csv_to_np / state_to_np expect
# in) the 'AT' and 'LT' csv files respectively.
# ---------------------------------------------------------------------------
NUM_OF_AT_USER, NUM_OF_LT_USER = 2110176, 93827
# ------------------------------ end of 16.02 -------------------------------

# ---------------------------------------------------------------------------
# Alternative configuration: "16.03" (load data of March, 16)
# ---------------------------------------------------------------------------
# NUM_OF_AT_USER = 2048501
# NUM_OF_LT_USER = 88842
# ------------------------------ end of 16.03 -------------------------------

# Number of distinct state codes; must stay in sync with used_st.
NUM_OF_STATE = 17
assert NUM_OF_STATE == len(used_st)

# Fractions of the rows that go to the training and validation sets; the
# remaining rows form the test set (see shuf_split_data).
TRAIN_FRAC = 0.6
VALI_FRAC = 0.2

In [None]:
def csv_to_np(filename, deli, data):
    '''
    load csv to data (in place)
    remove first column of csv

    Every line of the file is one row of `data`. The first field of a line
    is skipped, every remaining field is parsed with float() and stored in
    the corresponding column.

    filename: path of the text file to read
    deli:     field delimiter, e.g. ','
    data:     2-D numpy array (or view) of shape (num_of_user, num_of_day);
              it is overwritten in place, nothing is returned

    raises AssertionError if a line does not hold num_of_day + 1 fields or
           if the file does not hold exactly num_of_user lines
           (an empty file is reported this way as well)
    raises ValueError if a field cannot be parsed as float
    '''
    print('%s %s' % (filename, data.shape))
    num_of_user, num_of_day = data.shape
    num_of_row = 0
    with open(filename, 'r') as f:
        # iterate lazily: the files hold millions of lines
        for line in f:
            sp_line = line.split(deli)
            # explicit raise instead of `assert`, so the validation is not
            # stripped when python runs with -O
            if len(sp_line) != num_of_day + 1:
                raise AssertionError(
                    '%s, line %d: expected %d fields, got %d'
                    % (filename, num_of_row + 1, num_of_day + 1, len(sp_line)))
            # sp_line[0] is the dropped first column
            data[num_of_row, :] = [float(num) for num in sp_line[1:]]
            num_of_row += 1
    if num_of_row != num_of_user:
        raise AssertionError('%s: expected %d lines, got %d'
                             % (filename, num_of_user, num_of_row))

In [None]:
def state_to_np(filename, deli, data):
    '''
    load state.csv to data (in place)
    if a state is presented, set to 1
    otherwise, set to -1
    remove first column of csv

    Every line of the file is one row of `data`; after the dropped first
    field, field number d holds the state code of day d. The code is
    translated with st_to_id into an index on the last axis of `data`.

    filename: path of the text file to read
    deli:     field delimiter, e.g. ','
    data:     3-D numpy array (or view) of shape
              (num_of_user, num_of_day, num_of_state), overwritten in place

    returns the same `data` object

    raises AssertionError if a line does not hold num_of_day + 1 fields or
           if the file does not hold exactly num_of_user lines
           (an empty file is reported this way as well)
    raises KeyError if a state code is not a key of st_to_id
    '''
    print('%s %s' % (filename, data.shape))
    # the 3-way unpacking also enforces that data is 3-D
    num_of_user, num_of_day, num_of_state = data.shape
    data.fill(-1.0)
    num_of_row = 0
    with open(filename, 'r') as f:
        # iterate lazily: the files hold millions of lines
        for line in f:
            # Strip the line terminator ('\n' or '\r\n') before splitting so
            # that the last state code is clean. Only the terminator is
            # removed: an empty last field is a valid state code.
            sp_line = line.rstrip('\r\n').split(deli)
            # explicit raise instead of `assert`, so the validation is not
            # stripped when python runs with -O
            if len(sp_line) != num_of_day + 1:
                raise AssertionError(
                    '%s, line %d: expected %d fields, got %d'
                    % (filename, num_of_row + 1, num_of_day + 1, len(sp_line)))
            # sp_line[0] is the dropped first column
            for day_idx, st in enumerate(sp_line[1:]):
                data[num_of_row, day_idx, st_to_id[st]] = 1.0
            num_of_row += 1
    if num_of_row != num_of_user:
        raise AssertionError('%s: expected %d lines, got %d'
                             % (filename, num_of_user, num_of_row))
    return data

In [None]:
def load_data(used_month, used_status, total_number_of_prpty):
    '''
    load all csv matching used_status and used_month
    example:
        supposed wanting to load data of march, then
        use previous comment section labeling 16.03

        if used_month = 2 and used_status = 'AT', then
        this function will load ZDJM_3GD_03_201512_AT_*.csv
        and ZDJM_3GD_03_201601_AT_*.csv

    used_month: number of most recent months to load, i.e. the last
                `used_month` entries of `month`; 1 <= used_month <= len(month)
    used_status: 'LT' selects NUM_OF_LT_USER rows, any other value selects
                 NUM_OF_AT_USER rows; it is also part of the file names
    total_number_of_prpty: size of the last axis of the result

    returns a float array of shape
        (number of users, total days of the used months, total_number_of_prpty)
    where channel i (for every property but 'STATE') holds properties[i],
    whitened per user over all loaded days, and the channels from the
    position of 'STATE' onwards hold the +1/-1 state encoding written by
    state_to_np.

    raises ValueError if used_month is out of range, if 'STATE' is not the
           last property, or if total_number_of_prpty is too small
    '''
    if not 1 <= used_month <= len(month):
        # month[-0:] would silently select ALL months
        raise ValueError('used_month must be between 1 and %d, got %r'
                         % (len(month), used_month))
    if 'STATE' in properties and properties[-1] != 'STATE':
        # STATE fills every channel from its own index onwards, so a property
        # listed after it would overwrite one of the state channels
        raise ValueError("'STATE' must be the last entry of properties")
    if total_number_of_prpty < len(properties):
        raise ValueError('total_number_of_prpty must be at least %d, got %r'
                         % (len(properties), total_number_of_prpty))

    months = month[-used_month:]

    # calculate the index range in data of all used month
    # if used_month = 2 and loading data of 16.03,
    # then day_range = {'1512':(0,31), '1601':(31,62)}
    day_range = {}
    total_days = 0
    for mo in months:
        day_range[mo] = (total_days, total_days + day[mo])
        total_days += day[mo]

    num_of_row = NUM_OF_LT_USER if used_status == 'LT' else NUM_OF_AT_USER
    # initial data
    data = np.zeros((num_of_row, total_days, total_number_of_prpty))
    print(data.shape)

    for pr_idx, pr in enumerate(properties):
        print('loading %s' % pr)
        is_state = (pr == 'STATE')
        for mo in months:
            st_day, end_day = day_range[mo]
            print('loading %s %s %s' % (mo, used_status, pr))
            filename = data_prefix + '%s_%s_%s.csv' % (mo, used_status, pr)
            if is_state:
                # load csv to data[:, st_day:end_day, pr_idx:]
                state_to_np(filename, ',', data[:, st_day:end_day, pr_idx:])
            else:
                print('start from %s to %s' % (st_day, end_day))
                # load csv to data[:, st_day:end_day, pr_idx]
                csv_to_np(filename, ',', data[:, st_day:end_day, pr_idx])

        if not is_state:
            # whiten: per user, over all loaded days of this property.
            # `channel` is a view, so the in-place operations update `data`.
            channel = data[:, :, pr_idx]
            d_aver = channel.mean(axis=1)[:, np.newaxis]
            d_std = channel.std(axis=1)[:, np.newaxis]
            channel -= d_aver
            # the small constant avoids a division by zero for constant rows
            channel /= (d_std + 1e-5)

    return data

In [None]:
def load_hdf5(name):
    '''
    read the dataset called `name` from the file '<name>.h5'
    and return a numpy copy of it
    '''
    import h5py
    h5_path = name + '.h5'
    with h5py.File(h5_path, 'r') as h5_file:
        dataset = h5_file[name]
        # copy while the file is still open
        return np.copy(dataset)

def dump_hdf5(name, data):
    '''
    write data to the file '<name>.h5' as a gzip-compressed dataset
    that is called `name` as well; the file is opened in 'w' mode
    '''
    import h5py
    h5_path = name + '.h5'
    with h5py.File(h5_path, 'w') as h5_file:
        h5_file.create_dataset(name, data=data, compression="gzip")

In [None]:
def csv_to_h5(used_months=(1, 2, 3), used_statuses=('LT', 'AT'),
              total_number_of_prpty=24):
    '''
    load the csv files of every (used_month, used_status) combination with
    load_data and dump each result with dump_hdf5 under the name
    '<used_month>m_<used_status>_<total_number_of_prpty>pr'

    used_months: numbers of most recent months to load, one dump per value
    used_statuses: status labels to load, one dump per value
    total_number_of_prpty: size of the last axis, passed on to load_data

    with the defaults, 6 files '1m_LT_24pr.h5' ... '3m_AT_24pr.h5'
    are written
    '''
    for used_month, used_status in product(used_months, used_statuses):
        p = (used_month, used_status, total_number_of_prpty)
        print('dumping %s' % (p,))
        # plain argument unpacking instead of the deprecated apply()
        d = load_data(*p)
        dump_hdf5('%sm_%s_%spr' % p, d)

In [None]:
# Build the "16.02" array for every listed (used_month, status, channels)
# combination - currently only the last 2 months of the 'AT' users with 24
# channels - and dump it to '1602_<used_month>m_<status>_24pr.h5'.
for p in product([2], ['AT'], [24]):
    print('dumping %s' % (p,))
    # plain argument unpacking instead of the deprecated apply()
    d = load_data(*p)
    dump_hdf5('1602_%sm_%s_24pr' % (str(p[0]),p[1]), d)

In [None]:
def shuf_split_data(data, tr_frac, val_frac, seed=None):
    '''
    split data into train, validate and test set

    NOTE: `data` is shuffled IN PLACE along its first axis, and the three
    returned parts are slices of `data`.

    data:     sequence supporting in-place shuffling and slicing
              (e.g. a numpy array)
    tr_frac:  fraction of the rows for the train set, 0 < tr_frac < 1
    val_frac: fraction of the rows for the validate set, 0 < val_frac < 1;
              the rest (which must not be empty) is the test set
    seed:     optional seed for a reproducible shuffle; with the default
              None the global numpy random state is used

    returns the tuple (train, validate, test)

    raises AssertionError if the fractions are out of range
    '''
    assert 0 < tr_frac < 1, 'tr_frac must be in (0, 1)'
    assert 0 < val_frac < 1, 'val_frac must be in (0, 1)'
    assert 0 < 1 - tr_frac - val_frac < 1, \
        'tr_frac + val_frac must leave a non-empty test fraction'
    if seed is None:
        np.random.shuffle(data)
    else:
        # private generator: does not disturb the global random state
        np.random.RandomState(seed).shuffle(data)
    num_of_row = len(data)
    split_tr_id = int(num_of_row * tr_frac)
    split_te_id = split_tr_id + int(num_of_row * val_frac)
    return (data[:split_tr_id], # train
            data[split_tr_id:split_te_id], # val
            data[split_te_id:]) # test

In [None]:
def load_h5_and_split(name):
    '''
    load data produced by load_data function (as hdf5 file)
    and split into train, validate and test set

    name: name of the hdf5 data to load (see load_hdf5)

    The three parts are dumped with dump_hdf5 under the names
    'tr_' + name, 'val_' + name and 'te_' + name; nothing is returned.
    '''
    data = load_hdf5(name)
    print('data %s' % (data.shape,))
    subsets = shuf_split_data(data, TRAIN_FRAC, VALI_FRAC)
    # (label used in the log output, prefix of the dumped name)
    targets = (('train', 'tr_'), ('validate', 'val_'), ('test', 'te_'))
    for (label, prefix), subset in zip(targets, subsets):
        print('%s %s %s' % (label, name, subset.shape))
        dump_hdf5(prefix + name, subset)

In [None]:
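# Example usage: the argument must be the name of an existing '<name>.h5'
# file (see load_hdf5).
# NOTE(review): the dumping code in this notebook writes '*_24pr' names, not
# '*_23pr' - confirm which files these calls are meant for.
# load_h5_and_split('1m_LT_23pr')
# load_h5_and_split('1m_AT_23pr')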