In [1]:
import os
import time
import math
import uuid

import numpy as np
import pandas as pd
import dask.dataframe as dd
from turbodbc import connect, make_options, Megabytes, Rows

from omop_etl.io import to_csv
from omop_etl.load import Loader
from omop_etl.utils import timeitc, find

# Root of the shared data-release directory that exports are written under.
rpath = '//share.***REMOVED***/Cancer_Center/data_release'
# Configuration file (YAML) handed to the ETL Loader below.
config_path = '//share.***REMOVED***/Cancer_Center/omop_cc_etl/config.yml'
# Sub-folder names of the release tree. In the visible cells these are only
# referenced by the disabled directory-creation snippet that follows.
folders = ['clinical_data', 'health_system', 'vocabulary']

# Disabled one-off setup: creates each release sub-folder under rpath and
# reports (rather than fails) when a folder already exists.
# for folder in folders:
#     dirpath = os.path.join(rpath, folder)
    
#     try:
#         os.mkdir(dirpath)
#     except (FileExistsError) as e:
#         print(f'Directory {folder} already exists. Nothing done.')
#         pass

# The Loader is built from config_path; its config provides the server and
# project database names that the export cell passes to turbodbc's connect().
loader = Loader(config_path)
server = loader.config.server
database = loader.config.project_database

Unable to import optional dependencies:
selenium: No module named 'selenium'


In [2]:
# Export one table from SQL Server to tab-separated part files.
#
# Output layout under rpath:
#   <table>                          header-only file (column names)
#   <table>-<batch>-<uuid>-<n>.csv   data part files, written without a header
#
# NOTE: every run generates fresh UUIDs and the directory is not cleared, so
# re-running this cell adds a second full set of part files next to the first.
table = 'measurement'
schema = 'hipaa'
rpath = '//share.***REMOVED***/Cancer_Center/data_release/' + table
MB = 1000          # turbodbc read buffer size, in megabytes
partitions = 10    # dask partitions (one CSV part file each) per fetched batch

# Size each fetched batch by memory (Megabytes), not by a number of rows.
options = make_options(read_buffer_size=Megabytes(MB),
                       prefer_unicode=True,
                       use_async_io=True,
                       limit_varchar_results_to_max=True)

con = connect(driver='{SQL Server}', server=server, database=database,
              trusted_connection='yes', turbodbc_options=options)

# try/finally: the export runs for hours, so make sure the connection is
# released even when a fetch or a file write fails part-way through.
try:
    cursor = con.cursor()
    cursor.execute(f"select * from {schema}.{table}")
    batches = cursor.fetchnumpybatches()

    count = 0

    with timeitc(f'Exporting {table}'):
        # Creates missing parent folders too and is a no-op if the folder exists.
        os.makedirs(rpath, exist_ok=True)

        for batch in batches:
            batch_id = uuid.uuid4()
            # dask replaces '*' with the partition number when writing.
            file = os.path.join(rpath, table + f'-{count}-{batch_id}-*.csv')
            df = dd.from_pandas(pd.DataFrame(batch), npartitions=partitions)

            if count == 0:
                # First batch only: write the column names to their own file
                # and fix the output dtypes that every later batch is cast to.
                header = os.path.join(rpath, table)
                df.head(0).to_csv(header, index=False, sep='\t')
                # int64 columns use pandas' nullable 'Int64' dtype; every
                # other column is written as text.
                dtypes = {
                    column: 'Int64' if values.dtype.type == np.int64 else 'str'
                    for column, values in batch.items()
                }

            df = df.astype(dtypes)
            df.to_csv(file, header=False, index=False, sep='\t')
            count += 1
finally:
    con.close()

Exporting measurement finished in 03:08:30


In [3]:
# table = 'care_site'
# file = os.path.join(rpath, table + '.csv')
# count = 1
# batch_size = 1000000

# with loader.engine.connect() as con:
#     result = con.execute("EXEC sp_spaceused N'hipaa.{}';".format(table))
#     rows = int(result.fetchall()[0][1].strip())

# n_batches = math.ceil(rows/batch_size)

# # print('Total rows: ', rows, '\nBatch size: ', batch_size, '\nBatches:', n_batches)

# with timeitc(f'Exporting {table}'):
#     if os.path.exists(file):
#         os.remove(file)
    
#     with loader.engine.connect() as con:
#         header = pd.read_sql('select top 0 * from {}'.format(table), con)
#         header.to_csv(file, index=False, sep='\t')
#         for chunk in pd.read_sql('select * from {}'.format(table), con, coerce_float=False, chunksize=batch_size):
#             chunk.to_csv(file, header=False, index=False, sep='\t', mode='a')
#             print(f'Batch {count}/{n_batches} complete.', end='\r')
#             count += 1

Exporting care_site finished in 00:00:00
