In [None]:
import os
import time
import math
import uuid

import numpy as np
import pandas as pd
import dask.dataframe as dd
from turbodbc import connect, make_options, Megabytes, Rows

from omop_etl.io import to_csv
from omop_etl.load import Loader
from omop_etl.utils import timeitc, find

# Network-share locations: root folder for released data and the ETL config file.
rpath = '//share.ahc.ufl.edu/share$/DSS/IDR_Projects/Cancer_Center/data_release'
config_path = '//share.ahc.ufl.edu/share$/DSS/IDR_Projects/Cancer_Center/omop_cc_etl/config.yml'

# Sub-folder names; only referenced by the disabled directory-creation snippet below.
folders = ['clinical_data', 'health_system', 'vocabulary']

# Disabled: one-off creation of the sub-folders under rpath.
# for folder in folders:
#     dirpath = os.path.join(rpath, folder)
#
#     try:
#         os.mkdir(dirpath)
#     except (FileExistsError) as e:
#         print(f'Directory {folder} already exists. Nothing done.')
#         pass

# Build the loader from the config file, then read the connection settings
# (server name and project database) that the export cells pass to connect().
loader = Loader(config_path)
server, database = loader.config.server, loader.config.project_database

In [None]:
# Export parameters for this cell.
table = 'drug_exposure'
schema = 'hipaa'
MB = 10000        # read-buffer size handed to turbodbc's Megabytes()
partitions = 2    # number of dask partitions per fetched batch

# for table in ['drug_exposure','observation','condition_occurrence']:

# Destination folder for this table's files (rebinds the notebook-level rpath).
rpath = f'//share.ahc.ufl.edu/share$/DSS/IDR_Projects/Cancer_Center/data_release/{table}'

# The read buffer is sized in megabytes (not in rows), so the number of rows
# per fetched batch is whatever fits in that buffer.
options = make_options(
    read_buffer_size=Megabytes(MB),
    prefer_unicode=True,
    use_async_io=True,
    limit_varchar_results_to_max=True,
)

# Windows-authenticated connection to the project database.
con = connect(
    driver='{SQL Server}',
    server=server,
    database=database,
    trusted_connection='yes',
    turbodbc_options=options,
)

# Run the full-table query; batches is consumed by the export loop that follows.
cursor = con.cursor()
cursor.execute(f"select * from {schema}.{table}")
batches = cursor.fetchnumpybatches()

# Number of batches written so far; also embedded in each output file name.
count = 0

try:
    with timeitc(f'Exporting {table}'):
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(rpath, exist_ok=True)

        for batch in batches:
            # A fresh UUID per batch keeps file names unique across runs.
            batch_id = uuid.uuid4()
            # The '*' is left in the name for dask's to_csv to fill in per partition.
            csvfile = os.path.join(rpath, table + f'-{count}-{batch_id}-*.csv')
            # Force all columns to str/object to avoid conversion errors with
            # out-of-range date values like '2266-03-23'.
            # See https://stackoverflow.com/questions/32888124/pandas-out-of-bounds-nanosecond-timestamp-after-offset-rollforward-plus-adding-a#37226672
            df = pd.DataFrame(batch, dtype=str)

            if count == 0:
                # First batch only: write a header-only file named after the table
                # and work out the per-column casts reused for every later batch.
                header = os.path.join(rpath, table)
                df.head(0).to_csv(header, index=False, sep='\t')
                dtypes = {t: batch[t].dtype.type for t in batch.keys()}
                # To avoid ids being exported as floats they must end up as integers.
                # A pandas bug prevents converting directly from object to Int64,
                # so int64 columns are cast to float first (dtypes_2) and then to
                # nullable Int64 (dtypes_1); every other column stays str.
                # NOTE(review): the float step cannot represent integers above
                # 2**53 exactly - confirm all ids stay below that.
                dtypes_2 = {col: 'float' if kind == np.int64 else 'str'
                            for col, kind in dtypes.items()}
                dtypes_1 = {col: 'Int64' if kind == np.int64 else 'str'
                            for col, kind in dtypes.items()}

            df = dd.from_pandas(df.astype(dtypes_2).astype(dtypes_1), npartitions=partitions)
            df.to_csv(csvfile, header=False, index=False, sep='\t')
            count += 1
finally:
    # Release the database connection even when the export fails part-way.
    con.close()

In [None]:
# Same export as the turbodbc cell, but streaming through pandas.read_sql
# on the loader's engine.
table = 'drug_exposure'
schema = 'hipaa'
count = 0          # batches written so far; also embedded in each file name
partitions = 3     # number of dask partitions per batch

# Derive the destination folder from this cell's own `table` instead of relying
# on whatever value `rpath` was left with by a previously executed cell.
rpath = '//share.ahc.ufl.edu/share$/DSS/IDR_Projects/Cancer_Center/data_release/' + table

with timeitc(f'Exporting {table}'):

    with loader.engine.connect() as con:
        # chunksize makes read_sql return an iterator of DataFrames.
        batches = pd.read_sql(f"select * from {schema}.{table}", con, chunksize=1000000)

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(rpath, exist_ok=True)

        for batch in batches:
            # A fresh UUID per batch keeps file names unique across runs.
            batch_id = uuid.uuid4()
            # The '*' is left in the name for dask's to_csv to fill in per partition.
            csvfile = os.path.join(rpath, table + f'-{count}-{batch_id}-*.csv')

            # NOTE: unlike the turbodbc export, no str / Int64 coercion is applied
            # here; each batch is written with the dtypes read_sql produced.
            # NOTE(review): confirm that out-of-range dates such as '2266-03-23'
            # and nullable id columns come out as intended.

            if count == 0:
                # First batch only: write a header-only file named after the table.
                header = os.path.join(rpath, table)
                batch.head(0).to_csv(header, index=False, sep='\t')

            df = dd.from_pandas(batch, npartitions=partitions)
            df.to_csv(csvfile, header=False, index=False, sep='\t')
            count += 1

In [None]:
# Show which dtype the end-date column has in `batch`, i.e. the batch left in
# the kernel by the most recently executed export cell.
batch['drug_exposure_end_date'].dtype.name

In [None]:
# Parameters for the first-batch probe in this cell.
table = 'drug_exposure'
schema = 'hipaa'
MB = 10000        # read-buffer size handed to turbodbc's Megabytes()
partitions = 2    # number of dask partitions per fetched batch

# for table in ['drug_exposure','observation','condition_occurrence']:

# Destination folder for this table's files (rebinds the notebook-level rpath).
rpath = f'//share.ahc.ufl.edu/share$/DSS/IDR_Projects/Cancer_Center/data_release/{table}'

# The read buffer is sized in megabytes (not in rows), so the number of rows
# per fetched batch is whatever fits in that buffer.
options = make_options(
    read_buffer_size=Megabytes(MB),
    prefer_unicode=True,
    use_async_io=True,
    limit_varchar_results_to_max=True,
)

# Windows-authenticated connection to the project database.
con = connect(
    driver='{SQL Server}',
    server=server,
    database=database,
    trusted_connection='yes',
    turbodbc_options=options,
)

# Run the full-table query; batches is consumed by the loop that follows.
cursor = con.cursor()
cursor.execute(f"select * from {schema}.{table}")
batches = cursor.fetchnumpybatches()

# Probe run: fetch only the first batch, write the header file, and report the
# batch's shape and memory footprint. The per-batch CSV export is disabled here.
count = 0

try:
    with timeitc(f'Exporting {table}'):
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(rpath, exist_ok=True)

        for batch in batches:
            # Force all columns to str/object to avoid conversion errors with
            # out-of-range date values like '2266-03-23'.
            # See https://stackoverflow.com/questions/32888124/pandas-out-of-bounds-nanosecond-timestamp-after-offset-rollforward-plus-adding-a#37226672
            df = pd.DataFrame(batch, dtype=str)

            # Write a header-only file named after the table.
            header = os.path.join(rpath, table)
            df.head(0).to_csv(header, index=False, sep='\t')
            # Source numpy type of every column, kept for interactive inspection.
            dtypes = {t: batch[t].dtype.type for t in batch.keys()}

            print(df.shape)
            print(df.memory_usage(deep=True))
            # Only the first batch is examined.
            break
finally:
    # Close the connection on every path, including an empty result set or an
    # error; previously it was closed only from inside the loop body.
    con.close()
        
    

In [None]:
# table = 'care_site'
# file = os.path.join(rpath, table + '.csv')
# count = 1
# batch_size = 1000000

# with loader.engine.connect() as con:
#     result = con.execute("EXEC sp_spaceused N'hipaa.{}';".format(table))
#     rows = int(result.fetchall()[0][1].strip())

# n_batches = math.ceil(rows/batch_size)

# # print('Total rows: ', rows, '\nBatch size: ', batch_size, '\nBatches:', n_batches)

# with timeitc(f'Exporting {table}'):
#     if os.path.exists(file):
#         os.remove(file)
    
#     with loader.engine.connect() as con:
#         header = pd.read_sql('select top 0 * from {}'.format(table), con)
#         header.to_csv(file, index=False, sep='\t')
#         for chunk in pd.read_sql('select * from {}'.format(table), con, coerce_float=False, chunksize=batch_size):
#             chunk.to_csv(file, header=False, index=False, sep='\t', mode='a')
#             print(f'Batch {count}/{n_batches} complete.', end='\r')
#             count += 1