In [71]:
# simulate subphases for clone and copy operations
# clone durations are drawn from a lognormal distribution with mu=1.5 and sigma (v)=0.5,
#  which has a median near 4.5 and a mean near 5.1
# copy durations are drawn from a lognormal distribution whose mu and sigma (v) depend on
#  a randomly selected org size (see copyOrgParams)
import pandas as pd
import numpy as np
import uuid as uuid

# ---- daily operation volumes and clone duration parameters ----
numClonePerDay = 1000
numCopyPerDay = 50
cloneMu = 1.5
cloneV = 0.5

# ---- copy parameters: the lists are parallel, one position per org size ----
copyOrgParams = dict(
    orgSizes=[5, 10, 20, 50, 100, 250, 750, 1250, 2500],
    orgSizeFreqWeights=[60, 25, 20, 15, 12, 10, 5, 2, 1],
    muForSize=[4.0, 4.5, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0],
    vForSize=[0.5, 0.5, 0.75, 0.75, 1.0, 1.0, 1.0, 1.5, 1.5],
)

# Convert the relative weights into a running (cumulative) share of the total,
# so a uniform draw can later be mapped onto an org size.
totalWeight = float(sum(copyOrgParams['orgSizeFreqWeights']))
cumWt = 0
orgSizeFreq = []
for weight in copyOrgParams['orgSizeFreqWeights']:
    cumWt = cumWt + weight / totalWeight
    orgSizeFreq.append(cumWt)
copyOrgParams['orgSizeFreq'] = orgSizeFreq

# ---- simulated calendar: numDays consecutive days starting at initDate ----
numDays = 30
initDate = np.datetime64('2018-11-01')

# ---- output columns; each operation type accumulates one list per column ----
fieldNames = [
    'subphase_pod',
    'sourceOrgId',
    'org_size',
    'targetorgid',
    'clone_subphase_name',
    'subphase_min_time',
    'subphase_min_time_str',
    'subphase_max_time',
    'subphase_max_time_str',
    'subphase_duration_secs',
    'subphase_logline_count',
]
copyData = {name: [] for name in fieldNames}
cloneData = {name: [] for name in fieldNames}

# ---- per operation type: subphase names and the fraction of the total
#      duration assigned to each subphase (parallel lists) ----
timeMap = {
    'copy': {
        'phases': [
            'COPY_IMPORT_META',
            'ACTIVATION_CONFIRMED',
        ],
        'duration_percs': [0.25, 0.75],
    },
    'clone': {
        'phases': [
            'COPY_IMPORT_META',
            'TABLE_GEN_SAYONARA_SANDBOX_SWIZZLER_CHUNK',
            'ACTIVATION_CONFIRMED',
        ],
        'duration_percs': [0.20, 0.05, 0.75],
    },
}
########################################################
def simulateOperationLog(minTime, duration, orgId, orgSize, opData, opsTimeMap):
    """Split one operation into consecutive subphase rows and append them to opData.

    The total ``duration`` is partitioned according to
    ``opsTimeMap['duration_percs']``; each subphase starts where the previous
    one ended, the first one starting at ``minTime``.

    Parameters:
        minTime:    np.datetime64 start time of the operation.
        duration:   total operation duration in seconds (may be fractional).
        orgId:      value stored in the 'targetorgid' column.
        orgSize:    value stored in the 'org_size' column.
        opData:     dict mapping each output column name to a list; one entry
                    is appended to every column per subphase (mutated in place).
        opsTimeMap: dict with parallel lists 'phases' (subphase names) and
                    'duration_percs' (fraction of ``duration`` per subphase).

    Returns:
        The same ``opData`` dict that was passed in.
    """
    phaseFractions = opsTimeMap['duration_percs']
    startTime = minTime
    for i, spn in enumerate(opsTimeMap['phases']):
        dur = phaseFractions[i] * duration

        # Convert the fractional-second duration to whole nanoseconds in one
        # step.  Splitting it into s / ms / remainder by hand previously mixed
        # up microseconds and nanoseconds and relied on integer division.
        endTime = startTime + np.timedelta64(int(round(dur * 1e9)), 'ns')

        opData['subphase_pod'].append('cs999')
        opData['sourceOrgId'].append('NA')
        opData['org_size'].append(orgSize)
        opData['targetorgid'].append(orgId)
        opData['clone_subphase_name'].append(spn)
        opData['subphase_min_time'].append(startTime)
        opData['subphase_min_time_str'].append(pd.to_datetime(startTime).strftime('%Y-%m-%d %H:%M:%S'))
        opData['subphase_max_time'].append(endTime)
        opData['subphase_max_time_str'].append(pd.to_datetime(endTime).strftime('%Y-%m-%d %H:%M:%S'))
        opData['subphase_duration_secs'].append(dur)
        opData['subphase_logline_count'].append(1)

        # the next subphase begins exactly where this one ended
        startTime = endTime

    return opData


####  MAIN ####
# Simulate numDays consecutive days of copy and clone operations, then write
# each operation type to a TSV file and to a SQL file holding a single
# multi-row INSERT statement.
for d in range(numDays):
    myDate = initDate + np.timedelta64(d, 'D')

    # simulate copy operations for this date
    #  first randomly select an org size, then sample from that org's duration parameters
    # NOTE(review): the org size is drawn once per day, so every copy on a given
    #  day shares the same org size -- confirm this is intended rather than one
    #  draw per copy operation.
    orgSizeRand = np.random.uniform()
    # First cumulative share strictly greater than the draw; clamped so that
    # float rounding in the last cumulative share can never yield an
    # out-of-range index.
    orgSizeIndex = min(
        int(np.searchsorted(copyOrgParams['orgSizeFreq'], orgSizeRand, side='right')),
        len(copyOrgParams['orgSizes']) - 1)
    orgSize = copyOrgParams['orgSizes'][orgSizeIndex]
    copyDurations = np.random.lognormal(copyOrgParams['muForSize'][orgSizeIndex],
                                        copyOrgParams['vForSize'][orgSizeIndex], numCopyPerDay)
    for duration in copyDurations:
        # start at a uniformly random second within the day
        minTimeSecs = np.random.randint(low=0, high=24*60*60)
        minTime = myDate + np.timedelta64(minTimeSecs, 's')
        orgId = uuid.uuid4()
        copyData = simulateOperationLog(minTime, duration, orgId, orgSize, copyData, timeMap['copy'])

    # simulate clone operations for this date (org size is fixed at 1)
    cloneDurations = np.random.lognormal(cloneMu, cloneV, numClonePerDay)
    for duration in cloneDurations:
        minTimeSecs = np.random.randint(low=0, high=24*60*60)
        minTime = myDate + np.timedelta64(minTimeSecs, 's')
        orgId = uuid.uuid4()
        cloneData = simulateOperationLog(minTime, duration, orgId, 1, cloneData, timeMap['clone'])

# create outputs
opsDfs = {'copy': pd.DataFrame(copyData),
          'clone': pd.DataFrame(cloneData)}
# columns written unquoted, as-is
numericFields = ('subphase_duration_secs', 'subphase_logline_count', 'org_size')
# columns written as whole seconds since the Unix epoch
epochFields = ('subphase_min_time', 'subphase_max_time')
epoch = pd.Timestamp('1970-01-01T00:00:00')
for ot in opsDfs:
    odfn = '{0}_testData.tsv'.format(ot)
    opsDfs[ot].to_csv(odfn, sep='\t')

    osfn = '{0}_simulatedDataInserts.sql'.format(ot)

    numI = len(opsDfs[ot].index)
    print('{0} {1}'.format(ot, numI))

    # one "(v1, v2, ...)" tuple per DataFrame row, values in fieldNames order
    rowTuples = []
    for _, r in opsDfs[ot].iterrows():
        vals = []
        for fn in fieldNames:
            if fn in numericFields:
                vals.append('{0}'.format(r[fn]))
            elif fn in epochFields:
                v = int((pd.Timestamp(r[fn]) - epoch) / np.timedelta64(1, 's'))
                vals.append('{0}'.format(v))
            else:
                vals.append('"{0}"'.format(r[fn]))
        rowTuples.append('(' + ', '.join(vals) + ')')

    # every tuple except the last is followed by a comma; ';' ends the statement
    lines = ['INSERT INTO TABLE sandom_copy_clone_subphases VALUES']
    lines.extend(rt + ',' for rt in rowTuples[:-1])
    lines.extend(rowTuples[-1:])
    lines.append(';')

    # text mode so the str payload is accepted on both Python 2 and Python 3;
    # the context manager closes the file even if the write fails
    with open(osfn, 'w') as fp:
        fp.write('\n'.join(lines))

clone 90000




copy 3000
