# Example processing: Intra-train heating
This uses `process_intra_train`, a simplified version of the `process_dssc_module` function:

* frames are not grouped (e.g., into 'pumped', 'unpumped' or 'dark' frames)
* no selection of trains and/or pulses is done

The purpose of this analysis is to check how robust the diffraction signal is with respect to the repeated heat and radiation load from high repetition-rate pump-probe runs. Thus, we average over all trains within the run, but keep all individual pulses.

In [1]:
import multiprocessing
import os
from time import strftime

import h5py
import karabo_data as kd
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
from dask import delayed, compute
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from matplotlib.colors import LogNorm, BoundaryNorm
from tqdm.auto import tqdm

import dssc_process as dp

# show the installed karabo_data version, for provenance
kd.__version__

'0.7.0'

In [2]:
# interactive matplotlib figures inside the notebook (ipympl backend)
%matplotlib widget

In [3]:
# make sure the output subfolders exist; exist_ok=True replaces the separate
# isdir() check (no check-then-create race) and keeps the cell safe to re-run
for folder in ('tmp', 'images', 'processed_runs'):
    os.makedirs(folder, exist_ok=True)

## set up run information and index non-DSSC data

In [4]:
%%time

# run selection: proposal number, run number, and whether this is a dark run
proposal, run_nr = 2212, 89
is_dark = False

# raw-data directories of other runs used during testing:
#   /gpfs/exfel/exp/XMPL/201750/p700000/raw/r0026  (GPFS)
#   /gpfs/exfel/exp/SCS/201901/p002212/raw/r0125   (dCache)

# open the run restricted to the '*DA*' files (non-DSSC data) and print
# a summary of its trains and sources
run = kd.open_run(proposal, run_nr, include='*DA*')
run.info()

# of trains:    1691
Duration:       0:02:49
First train ID: 517552829
Last train ID:  517554519

0 detector modules ()

3 instrument sources (excluding detectors):
  - SA3_XTD10_XGM/XGM/DOOCS:output
  - SCS_BLU_XGM/XGM/DOOCS:output
  - SCS_UTC1_ADQ/ADC/1:network

20 control sources:
  - P_GATT
  - SA3_XTD10_MONO/ENC/GRATING_AX
  - SA3_XTD10_MONO/MDL/PHOTON_ENERGY
  - SA3_XTD10_MONO/MOTOR/GRATINGS_X
  - SA3_XTD10_MONO/MOTOR/GRATING_AX
  - SA3_XTD10_MONO/MOTOR/HE_PM_X
  - SA3_XTD10_MONO/MOTOR/LE_PM_X
  - SA3_XTD10_VAC/DCTRL/AR_MODE_OK
  - SA3_XTD10_VAC/DCTRL/D12_APERT_IN_OK
  - SA3_XTD10_VAC/DCTRL/D6_APERT_IN_OK
  - SA3_XTD10_VAC/DCTRL/N2_MODE_OK
  - SA3_XTD10_VAC/GAUGE/G30470D_IN
  - SA3_XTD10_VAC/GAUGE/G30480D_IN
  - SA3_XTD10_VAC/GAUGE/G30490D_IN
  - SA3_XTD10_VAC/GAUGE/G30510C
  - SA3_XTD10_XGM/XGM/DOOCS
  - SCS_BLU_XGM/XGM/DOOCS
  - SCS_RR_UTC/MDL/BUNCH_DECODER
  - SCS_RR_UTC/TSYS/TIMESERVER
  - SCS_UTC1_ADQ/ADC/1

CPU times: user 26.5 ms, sys: 29 ms, total: 55.5 ms
Wall time: 267 

## load XGM (without filtering/thresholding)

In [5]:
# Load the XGM data of the run. For dark runs no XGM is loaded, so bind
# `xgm` to None in that case: otherwise the bare `xgm` below raises a
# NameError whenever is_dark is True.
xgm = dp.load_xgm(run, print_info=True) if not is_dark else None

xgm

SASE3 bunches per train: 75


<xarray.DataArray (trainId: 1691, pulse: 75)>
array([[ 2180.5166,  6488.1846,  5275.4434, ..., 10196.676 , 32754.41  ,
         6789.0996],
       [ 1624.2289,  4335.608 ,  3925.5168, ..., 13686.537 , 17298.703 ,
        13098.537 ],
       [ 3515.1462,  3991.5251,  3081.2363, ..., 20455.729 ,  5150.4116,
        10864.515 ],
       ...,
       [ 5501.5635,  3364.2139,  3501.4333, ..., 31223.012 , 20917.727 ,
         5331.0703],
       [ 3316.4705,  2695.3838,  7522.035 , ..., 12040.346 ,  9120.154 ,
        10832.564 ],
       [ 3241.3948,  6771.3486,  6503.9697, ..., 14620.738 , 19954.115 ,
         9423.8125]], dtype=float32)
Coordinates:
  * trainId  (trainId) uint64 517552829 517552830 ... 517554518 517554519
Dimensions without coordinates: pulse

## plot XGM

In [6]:
if not is_dark:
    # single panel: the XGM value of every pulse against the train ID,
    # drawn as small dots
    fig, ax1 = plt.subplots(nrows=1, sharex=True)
    ax1.plot(xgm.trainId, xgm, 'o', c='C0', ms=1)
    ax1.set(xlabel='trainId', ylabel='xgm', title=f'run: {run_nr}')

    # timestamped file name, so plots from earlier executions are kept
    tstamp = strftime('%y%m%d_%H%M')
    fig.savefig(f'images/run{run_nr}_xgm_{tstamp}.png', dpi=200)

FigureCanvasNbAgg()

In [7]:
# SLURM-backed dask cluster on the 'exfel' queue: each submitted job is
# configured with 8 cores and 256 GB of memory, split into 8 worker processes
cluster = SLURMCluster(
    queue='exfel',
    processes=8,
    cores=8, memory='256GB'
)

In [8]:
# display the cluster status widget
cluster

VBox(children=(HTML(value='<h2>SLURMCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

In [9]:
# request 128 dask workers in total (the client output below reports processes=128)
cluster.scale(128)

In [10]:
# connect a dask client to the cluster and report its address and resources
client = Client(cluster)
print("Created dask client:", client)

Created dask client: <Client: 'tcp://131.169.182.59:38465' processes=128 threads=128, memory=4.10 TB>


## calculate chunksize
This is a conservative estimate for the maximum number of trains to process simultaneously without using more than "max_GB" gigabytes of memory.

In [11]:
max_GB = 300
fpt = dp.load_dssc_info(proposal, run_nr)['frames_per_train']

# Memory needed for one train across the whole detector:
#   8 byte * 16 modules * 128 px * 512 px * fpt frames = 8 MiB * fpt
# so max_GB (counted as 1024 MiB each) holds max_GB * 128 / fpt trains.
trains_in_memory = int(max_GB * 128 // fpt)
# more than 512 trains doesn't seem to give any performance benefit; never go
# below 1, because the floor division yields 0 once fpt exceeds max_GB * 128
chunksize = max(1, min(512, trains_in_memory))
print('processing', chunksize, 'trains per chunk')

processing 512 trains per chunk


## create job list (one job per DSSC module)

In [12]:
# one argument dictionary per module index (0-15); all jobs share the run
# settings and differ only in `module`
jobs = [
    dict(
        proposal=proposal,
        run_nr=run_nr,
        module=module,
        chunksize=chunksize,
        fpt=fpt,
    )
    for module in range(16)
]

## build dask delayed tasks and execute

In [13]:
# Wrap one lazy process_intra_train call per job. Iterating over `jobs`
# itself (instead of indexing it with a second hard-coded range(16)) keeps
# the task list in step with the job list. Nothing is executed here; the
# work starts when compute() is called on the delayed objects.
mod_data = [delayed(dp.process_intra_train)(job) for job in jobs]

mod_data

[Delayed('process_intra_train-fa57e54b-6884-4096-bcf6-2d50b65e4510'),
 Delayed('process_intra_train-36ed8823-4dd6-48d7-9ee6-fe8a2aba434a'),
 Delayed('process_intra_train-0066f37c-0099-4472-ac00-7e55b71fda8b'),
 Delayed('process_intra_train-8bf09751-bb20-428d-b5fe-1f27c437eb2b'),
 Delayed('process_intra_train-136a3f49-7576-4236-9ce9-3209dbcc9dfc'),
 Delayed('process_intra_train-57283ad3-9f42-4597-8eb5-667c93da90ac'),
 Delayed('process_intra_train-3ff4f35e-9915-4157-b8d9-e37277fe857d'),
 Delayed('process_intra_train-ea994062-fce8-490d-814f-7380ba958877'),
 Delayed('process_intra_train-33e58888-897d-4109-b45a-bdef14427d59'),
 Delayed('process_intra_train-f488db8d-ddee-46eb-bdac-48069cecc6b7'),
 Delayed('process_intra_train-2c6937b3-7347-44a1-82f3-f1bc55f442db'),
 Delayed('process_intra_train-a62b2a7e-6d44-42c6-b89f-3249d479fd7e'),
 Delayed('process_intra_train-1970a1be-58f1-426e-b372-f8779e572903'),
 Delayed('process_intra_train-59ca02d4-b873-4a30-b730-95ec03e68390'),
 Delayed('process_in

In [15]:
%%time
# execute all delayed module tasks; compute() returns a tuple with one entry
# per argument, so [0] is the list of per-module results
result = compute(mod_data)[0]

CPU times: user 20.8 s, sys: 4.52 s, total: 25.4 s
Wall time: 3min 32s


In [16]:
# join the per-module results along the 'module' dimension, tag the dataset
# with the run number and put the dimensions into a fixed order
module_data = (
    xr.concat(result, dim='module')
    .assign(run=run_nr)
    .transpose('pulse', 'module', 'x', 'y')
)

## merge processed data with normalization data (XGM)

In [17]:
if not is_dark:
    # `xgm` is rebound to its train average here (the save cell writes that
    # averaged array), so only average while the 'trainId' dimension is still
    # present. Without this guard a second execution of the cell fails,
    # because the already-averaged array has no 'trainId' left to reduce.
    if 'trainId' in xgm.dims:
        # label the pulses 0..fpt-1 so the XGM can be merged along 'pulse',
        # then average over all trains
        xgm = xgm.assign_coords(pulse=np.arange(fpt, dtype=int)).mean('trainId')
    xgm.name = 'xgm'
    module_data = xr.merge([module_data, xgm])

In [18]:
# inspect the merged dataset
module_data

<xarray.Dataset>
Dimensions:    (module: 16, pulse: 75, x: 128, y: 512)
Coordinates:
  * pulse      (pulse) int64 0 1 2 3 4 5 6 7 8 9 ... 66 67 68 69 70 71 72 73 74
Dimensions without coordinates: module, x, y
Data variables:
    image      (pulse, module, x, y) float64 48.82 50.98 44.96 ... 48.3 45.63
    sum_count  (pulse, module) float64 1.691e+03 1.691e+03 ... 1.691e+03
    run        int64 89
    xgm        (pulse) float32 3959.062 3894.9146 ... 11278.968 11380.062

## save to hdf5

In [21]:
overwrite = True

save_folder = './processed_runs/'
prefix = ''

if is_dark:
    fname = f'{prefix}run{run_nr}.h5'  # dark run
else:
    fname = f'{prefix}run{run_nr}_by-pulse.h5'  # per-pulse data, averaged over all trains


save_path = os.path.join(save_folder, fname)
file_exists = os.path.isfile(save_path)

if overwrite or not file_exists:
    if file_exists:
        os.remove(save_path)
    # the context manager closes the file even if writing a dataset fails
    with h5py.File(save_path, 'w') as h5f:
        # NOTE(review): to_array() broadcasts all data variables against each
        # other and stacks them into a single array, so the variable names are
        # not stored in the file and small variables are expanded to the full
        # image shape. Kept as is so existing readers of these files still work.
        h5f.create_dataset('module_data', data=module_data.to_array())
        # XGM data is only loaded for non-dark runs; writing it unconditionally
        # would fail for dark runs after the previous file was already removed
        if not is_dark:
            h5f.create_dataset('xgm', data=xgm)
    print('saving: ', save_path)
else:
    print('file', save_path, 'exists and overwrite is False')

saving:  ./processed_runs/run89_by-pulse.h5


In [22]:
# disconnect the dask client first, then close the cluster
client.close()
cluster.close()