In [356]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, MetaData, insert, delete, select, text
from sqlalchemy.orm import sessionmaker

def connect(URI):
    """Create an engine for ``URI`` and reflect the database schema through it.

    Parameters
    ----------
    URI
        Database URL, passed unchanged to ``create_engine``.

    Returns
    -------
    tuple
        ``(metadata, engine)``: a ``MetaData`` object on which ``reflect`` has
        been called with the new engine, followed by the engine itself.
    """
    db_engine = create_engine(URI)
    schema = MetaData()
    # Populate the MetaData from the live database rather than from declared models.
    schema.reflect(db_engine)
    return schema, db_engine

# Database location. Set the MAST_DRUPAL_URI environment variable to point at a
# different server or to supply real credentials without editing the notebook;
# the fallback is the local development instance with its default login.
URI = os.environ.get('MAST_DRUPAL_URI', 'mysql+pymysql://root:root@localhost:3306/mast_drupal')
metadata, engine = connect(URI)
Session = sessionmaker(bind=engine)
session = Session()

# Show every reflected table name. To list only the tables whose name contains
# 'field_data_field', use the filtered variant instead:
# pprint([t for t in list(metadata.tables.keys()) if 'field_data_field' in t])
pprint(list(metadata.tables.keys()))

['actions',
 'authmap',
 'backup_migrate_destinations',
 'backup_migrate_profiles',
 'backup_migrate_schedules',
 'backup_migrate_sources',
 'batch',
 'block',
 'block_custom',
 'block_node_type',
 'block_role',
 'blocked_ips',
 'cache',
 'cache_admin_menu',
 'cache_block',
 'cache_bootstrap',
 'cache_features',
 'cache_feeds_http',
 'cache_field',
 'cache_filter',
 'cache_form',
 'cache_image',
 'cache_media_oembed',
 'cache_menu',
 'cache_page',
 'cache_panels',
 'cache_path',
 'cache_token',
 'cache_update',
 'cache_views',
 'cache_views_data',
 'ckeditor_input_format',
 'ckeditor_settings',
 'comment',
 'conditional_fields',
 'ctools_css_cache',
 'ctools_object_cache',
 'date_format_locale',
 'date_format_type',
 'date_formats',
 'facetapi',
 'features_signature',
 'feeds_importer',
 'feeds_item',
 'feeds_log',
 'feeds_push_subscriptions',
 'feeds_source',
 'field_config',
 'field_config_instance',
 'field_data_body',
 'field_data_comment_body',
 'field_data_field_comment_diagnosti

In [328]:
# Field-table suffixes to load; each entry is read from the table named
# `field_data_field_<suffix>`. The order here is the column order of the
# combined frame.
table_names = [
    # physics settings
    'shot_physics_div_config',
    'shot_physics_heating',
    'shot_physics_ip_range',
    'shot_physics_shape',
    'shot_phys_pellets',
    'shot_phys_rmp_coils',
    # free-text descriptions
    'shot_preshot',
    'shot_postshot',
    # bookkeeping
    'shot_owner',
    'shot_reference',
    'shot_scenario',
    'shot_datetime',
    'shot_sessionlog',
]

# Value column for the fields that do not use the default `field_<suffix>_value`
# name (their columns end in `_tid` or `_target_id`).
name_mapping = {
    'shot_reference': 'field_shot_reference_target_id',
    'shot_scenario': 'field_shot_scenario_tid',
    'shot_phys_pellets': 'field_shot_phys_pellets_tid',
    'shot_phys_rmp_coils': 'field_shot_phys_rmp_coils_tid',
    'shot_sessionlog': 'field_shot_sessionlog_target_id',
}

In [365]:
# Build one wide frame of shot fields, indexed by entity_id.
# All reads in this cell share a single connection that is closed when the
# `with` block exits, instead of leaving one unclosed `engine.connect()` behind
# for every query.
with engine.connect() as conn:
    # Taxonomy lookup (tid -> term name). It is identical for every `_tid`
    # field, so it is read once here rather than on each loop iteration.
    table = metadata.tables['taxonomy_term_data']
    stmt = select(table.c.tid, table.c.name)
    terms = pd.read_sql(stmt, con=conn).set_index('tid')

    dfs = []
    for name in table_names:
        table = metadata.tables[f'field_data_field_{name}']
        # Fall back to the default value column unless the field has an override.
        column = name_mapping.get(name, f'field_{name}_value')
        stmt = select(table.c.entity_id, table.c.bundle, table.columns[column])

        df = pd.read_sql(stmt, con=conn)
        # Keep only rows whose bundle is 'mast_shot', then index by entity_id.
        df = df.loc[df.bundle == 'mast_shot']
        df = df.drop('bundle', axis=1)
        df = df.set_index('entity_id')

        if '_tid' in column:
            # Resolve the term id to its name; the name is kept next to the id
            # in a column called `<field>_name`.
            df = pd.merge(df, terms, left_on=column, right_index=True)
            df = df.rename(dict(name=column.replace('_tid', '_name')), axis=1)

        dfs.append(df)

    # Campaign value per session log, indexed by the session log's entity_id.
    table = metadata.tables['field_data_field_sllog_campaign']
    stmt = select(table.c.entity_id, table.c.field_sllog_campaign_value)
    slog = pd.read_sql(stmt, con=conn).set_index('entity_id')

# Align all per-field frames on entity_id (one column group per field).
df = pd.concat(dfs, axis=1).sort_index()
# Attach the campaign through each shot's session-log reference. This is an
# inner merge, so shots without a matching session log are not kept.
df = pd.merge(df, slog, left_on='field_shot_sessionlog_target_id', right_index=True)

df.sample(50)

Unnamed: 0_level_0,field_shot_physics_div_config_value,field_shot_physics_heating_value,field_shot_physics_ip_range_value,field_shot_physics_shape_value,field_shot_phys_pellets_tid,field_shot_phys_pellets_name,field_shot_phys_rmp_coils_tid,field_shot_phys_rmp_coils_name,field_shot_preshot_value,field_shot_postshot_value,field_shot_owner_value,field_shot_reference_target_id,field_shot_scenario_tid,field_shot_scenario_name,field_shot_datetime_value,field_shot_sessionlog_target_id,field_sllog_campaign_value
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
30991,Conventional,,,,884.0,No,,,\nRepeat 22026 after adding D to the impurity ...,\nNo conclusive sign of Ne in the vessel.\n,,30987.0,,,2009-04-23 12:50:00,20160,M7
40738,,,,,884.0,No,888.0,No,[TF] Run 12 kA pulse to create a reference for...,Pulse ran successfully to provide a reference ...,,,,,2020-09-09 14:02:16,40718,MU1
33080,Conventional,,,,884.0,No,,,\nReload shot 20691. Disabled FA3 and FA4 and ...,\nNo H-mode. Good beam.\n,,29672.0,,,2010-01-22 10:22:00,20289,M7
27612,Conventional,,,,884.0,No,,,\nRepeat - SW only\n,\nBetter SW beam.\n,,27294.0,,,2007-06-29 10:35:00,19955,M6
22135,Conventional,,,,884.0,No,,,"\nprolonged G1-4 puff by 2ms, remove spike fro...",\n\n,,22134.0,,,2005-06-07 14:03:00,19615,M5
36860,Conventional,"2 Beams,SS Beam,SW Beam",700 kA,,884.0,No,,,\nrepeat with two beams for new neutron camera...,\nbeam late\n,,36852.0,,,2012-01-25 17:44:00,20550,M8
48759,,,,,884.0,No,888.0,No,[TF]. Restore 44483. TF 65kA Test Shot. Disabl...,Sparking recorded on camera and Rogowskii coil...,Internal,48712.0,,,2021-07-20 15:18:30,48754,MU1
33742,Conventional,,,,884.0,No,,,\nPF retest with only P2 and P3 left on\n,\nCCBV24-28 still awry after software work\n,,33490.0,,,2010-03-16 10:20:00,20327,M7
41576,,,,,884.0,No,888.0,No,[TF]. Restore amd repeat 41446.,Good match to 41446.,,,,,2020-10-05 12:53:17,41559,MU1
33610,Conventional,,,,884.0,No,,,\nFinal current dropped to 400 kA. Same decay ...,\nNo SS beam again. Some suspicion that a prob...,,33609.0,,,2010-03-04 13:17:00,20319,M7


In [382]:
name = 'node'
table = metadata.tables[name]
stmt = select(table)
pd.set_option('display.max_columns', None)
# Read on a scoped connection so it is closed as soon as the rows are loaded.
with engine.connect() as conn:
    df_node = pd.read_sql(stmt, con=conn)
df_node = df_node.set_index('nid')
df_node = df_node.loc[df_node.type == 'mast_shot']

# Attach the per-shot field values in `df` to their nodes (index-to-index join).
df_shot = df_node.join(df)

# Drop uninteresting columns
df_shot = df_shot.drop(['vid', 'type', 'language', 'uid', 'comment',
              'promote', 'sticky', 'translate', 'tnid', 'status',
              'created', 'changed'], axis=1)


# Shorten column names: strip the 'field_shot_' prefix and the '_value' / '_name' markers
df_shot = df_shot.rename({key: key.replace('field_shot_', '').replace('_value', '').replace('_name', '')
                          for key in df_shot.columns}, axis=1)
# NOTE(review): 'comissioner' is misspelled, but it is the column name used in the
# rest of the notebook and in the exported file, so it is left unchanged here.
df_shot = df_shot.rename(dict(title='shot_id', reference_target_id='reference_id',
                              owner='comissioner', field_sllog_campaign='campaign'), axis=1)

# Tidy up types
df_shot['shot_id'] = df_shot['shot_id'].astype(int)
# Nullable integer dtype: not every shot has a reference.
df_shot['reference_id'] = df_shot['reference_id'].astype('Int64')
df_shot['datetime'] = pd.to_datetime(df_shot.datetime)

# The raw id columns are no longer needed once their names/values are attached.
id_columns = [
    'sessionlog_target_id',
    'scenario_tid',
    'phys_rmp_coils_tid',
    'phys_pellets_tid',
]
df_shot = df_shot.drop(id_columns, axis=1)

# Get reference shot ID: reference_id holds index labels of this frame, so look
# up the shot_id stored on the referenced row.
ref_index = df_shot['reference_id'].dropna().index
ref_ids = df_shot['reference_id'].dropna().values
ref_shot_ids = df_shot['shot_id'].loc[ref_ids].values
df_shot.loc[ref_index, 'reference_shot_id'] = ref_shot_ids
df_shot['reference_shot_id'] = df_shot['reference_shot_id'].astype('Int64')


# Number the scenarios 1..n in order of first appearance. Missing values are
# removed explicitly instead of discarding position 0 of unique(): NaN is only
# first there when the first row has no scenario, and otherwise a real scenario
# would be dropped and NaN would receive an id.
scenario_names = df_shot.scenario.dropna().unique()
scenarios = pd.DataFrame(dict(
    scenario_id=np.arange(1, len(scenario_names) + 1, dtype='int64'),
    scenario=scenario_names,
))
# Merging on a column replaces the index with a fresh integer index.
df_shot = pd.merge(df_shot, scenarios, left_on='scenario', right_on='scenario', how='outer')

def null_bool(item):
    """Convert a 'No'/other label into a boolean, keeping missing values as ``None``.

    Parameters
    ----------
    item
        A single scalar cell value, e.g. a label string or a missing value.

    Returns
    -------
    bool or None
        ``None`` when ``item`` is missing (``None``, any NaN, ``pd.NA``, ``NaT``),
        ``False`` when it equals ``'No'``, and ``True`` for any other value.
    """
    # pd.isna instead of `item is np.nan`: the identity test only recognises the
    # np.nan singleton, so None or a different NaN object would be reported as True.
    if pd.isna(item):
        return None
    return item != 'No'

def comissioner_normalize(item):
    """Translate a raw owner label into its normalised organisation name.

    Returns ``'UKAEA'`` for ``'Internal'`` and ``'EuroFusion'`` for
    ``'Eurofusion'``. Every other input, including missing values, yields
    ``None``.
    """
    replacements = (
        ('Internal', 'UKAEA'),
        ('Eurofusion', 'EuroFusion'),
    )
    for raw_label, normalised in replacements:
        if item == raw_label:
            return normalised
    # Unrecognised labels are not passed through.
    return None

# Turn the RMP-coil and pellet labels into boolean flags (None where missing).
for flag_column in ('phys_rmp_coils', 'phys_pellets'):
    df_shot[flag_column] = df_shot[flag_column].map(null_bool)

# Replace the raw owner labels with normalised organisation names.
df_shot['comissioner'] = df_shot['comissioner'].map(comissioner_normalize)

# Final column names for the shot table.
final_names = {
    'reference_shot_id': 'reference_shot',
    'physics_ip_range': 'current_range',
    'physics_div_config': 'divertor_config',
    'physics_heating': 'heating',
    'physics_shape': 'plasma_shape',
    'preshot': 'preshot_description',
    'postshot': 'postshot_description',
    'phys_pellets': 'pellets',
    'phys_rmp_coils': 'rmp_coil',
    'datetime': 'timestamp',
}
df_shot = df_shot.rename(columns=final_names)

# Order by shot number, then preview a few random rows.
df_shot = df_shot.sort_values(by='shot_id')
df_shot.sample(10)

Unnamed: 0,shot_id,divertor_config,heating,current_range,plasma_shape,pellets,rmp_coil,preshot_description,postshot_description,comissioner,reference_id,scenario,timestamp,campaign,reference_shot,scenario_id
9366,21063,Conventional,,,,False,,"\nRestore 21061, but run with Ar puff prolonge...","\nBeams from 0.05s good, but VDE at 0.244s due...",,30042.0,,2008-12-10 16:52:00,M7,21061.0,
14850,28968,Conventional,Ohmic,,,False,,\nrepeat\n,"\nSeemed to stop charging at about 3800, then ...",,37543.0,,2013-06-10 10:43:00,M9,28967.0,
3040,14735,Conventional,,,,False,,\n0.5T test shot\n,\nOK\n,,23716.0,,2006-01-26 09:28:00,M6,14733.0,
4744,16441,Conventional,,,,False,,\nStandard ohmic shot\n,\nOkay\n,,25048.0,,2006-09-06 09:02:00,M6,16065.0,
3043,14738,Conventional,,,,False,,\n0.5T test shot\n,\nOK\n,,23720.0,,2006-01-26 10:52:00,M6,14737.0,
21217,29957,Conventional,"2 Beams,SS Beam,SW Beam",400 kA,,True,,\nRestore 29953. Include pellets. Change TS t...,\nOne breakdown on SW beam.\n,,38529.0,S8,2013-08-21 15:44:00,M9,29953.0,2.0
18049,42874,,,,,False,False,[Vacuum] reload 42850 - deselect gas valvle,LVPS mod bus alarm - LVPS tripped so shot term...,,,,2021-01-13 20:01:33,MU1,,
2149,13844,Conventional,,,,False,,\nTf test shot\n Reference => 10084.,\nok\n,,,,2005-09-07 10:58:00,M5,,
16056,40881,,,,,False,False,[D3] Try repeating pulse 40879 on D3.,Pulse ran successfully.,,,,2020-08-18 10:42:02,MU1,,
7178,18875,Conventional,,,,False,,\nStarting high betaT program. Restore 18578 w...,\nvery similar to 18578\n,,27559.0,,2007-07-17 17:36:00,M6,18578.0,


In [383]:
# Persist the cleaned shot table; the path is relative to the notebook's working directory.
df_shot.to_parquet(path='../data/shot_metadata.parquet')

In [384]:
# Sanity check: list the distinct divertor configurations present (NaN included).
df_shot.divertor_config.unique()

array(['Conventional', nan, 'Super-X', 'X Divertor', 'Snowflake',
       'Limiter'], dtype=object)

In [342]:
# Browse a whole table, here the taxonomy terms. Other tables can be inspected
# the same way by changing `name`, e.g. 'field_data_field_sllog_campaign' or
# 'field_data_field_shot_sessionlog'.
name = 'taxonomy_term_data'
table = metadata.tables[name]
stmt = select(table)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
# Scoped connection: closed as soon as the rows are loaded.
with engine.connect() as conn:
    # Stored as df_terms rather than df, so that running this cell does not
    # overwrite the shot-field frame `df` that the node cell joins against.
    df_terms = pd.read_sql(stmt, con=conn)
df_terms

Unnamed: 0,tid,vid,name,description,format,weight
0,315,5,NBI,,wysiwyg,112
1,316,5,Plasma Control System (PCS),,wysiwyg,7
2,317,5,DATAC,,wysiwyg,0
3,318,5,Power Supplies,,wysiwyg,132
4,319,5,Scheduler,,wysiwyg,0
...,...,...,...,...,...,...
1235,1867,4,MU02-FPP,,wysiwyg,1
1236,1868,4,MU02-MHD,,wysiwyg,5
1237,1869,4,MU02-EXH,,wysiwyg,2
1238,1870,4,MU02-FFP,,wysiwyg,0
