Test code for the Driverless AI speed experiment

In [92]:
import os
import json
import datetime

import pandas as pd
import numpy as np

import driverlessai
driverlessai.__version__

'1.10.1.3'

In [16]:
# Load the Driverless AI user name and password from a JSON file that lives
# one directory above the notebook.
with open(os.path.join('..', 'idpass.json')) as f:
    idpass = json.load(f)

In [17]:
def get_dai_client(daiaddress, daipassword, daiusername=None) -> 'driverlessai._core.Client':
    '''
    Connect to a Driverless AI server and return the client object.

    Parameters
    ----------
    daiaddress : str
        Address (URL) of the Driverless AI server.
    daipassword : str
        Password used to log in.
    daiusername : str, optional
        User name used to log in. When omitted, the value of ``idpass['id']``
        from the notebook-level credential dict is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    driverlessai._core.Client
        The object returned by ``driverlessai.Client(...)``.
    '''
    print('----- start server connection : get_dai_client -----')
    if daiusername is None:
        # Backward-compatible fallback to the notebook's global credentials.
        daiusername = idpass['id']
    # Open the connection to the Driverless AI server.
    dai = driverlessai.Client(address=daiaddress, username=daiusername, password=daipassword)
    return dai

def get_dataset(daiobj, dataname, dataurl, data_source='s3') -> 'driverlessai._datasets.Dataset':
    '''
    Return the dataset called ``dataname``, creating it on the server if needed.

    The names of the datasets listed by ``daiobj.datasets.list()`` are checked
    first. If ``dataname`` is among them, that dataset is fetched by key;
    otherwise a new dataset is created from ``dataurl``.

    Parameters
    ----------
    daiobj : driverlessai._core.Client
        Connected client; only its ``datasets`` attribute is used.
    dataname : str
        Dataset name to look for on the server.
    dataurl : str
        Location passed as ``data`` to ``datasets.create`` when the dataset is
        not on the server yet.
    data_source : str, optional
        Value passed as ``data_source`` to ``datasets.create``. Defaults to
        ``'s3'``, the value that used to be hard-coded.

    Returns
    -------
    driverlessai._datasets.Dataset
        Whatever ``datasets.get`` or ``datasets.create`` returns.
    '''
    print('----- start get data : get_dataset -----')
    # Datasets currently on the server, as {name: key}. If two datasets share
    # a name, the one listed last wins.
    uploaded_data = {i.name: i.key for i in daiobj.datasets.list()}
    print('Uploaded data name : key >> ', uploaded_data)

    if dataname in uploaded_data:
        print('Data is already uploaded in DAI')
        return daiobj.datasets.get(uploaded_data[dataname])

    print('Data is uploading to DAI.')
    return daiobj.datasets.create(data=dataurl, data_source=data_source)

def get_experiment(daiobj, dataobj, target_column, task, drop_columns, enable_gpus) -> 'driverlessai._experiments.Experiment':
    '''
    Start an experiment and return the resulting Experiment object.

    All arguments except ``daiobj`` are forwarded as keyword arguments to
    ``daiobj.experiments.create``.

    Parameters
    ----------
    daiobj : driverlessai._core.Client
        Connected client; only its ``experiments`` attribute is used.
    dataobj : driverlessai._datasets.Dataset
        Forwarded as ``train_dataset``.
    target_column : str
        Forwarded as ``target_column``.
    task : str
        Forwarded as ``task``.
    drop_columns : List[str]
        Forwarded as ``drop_columns``.
    enable_gpus : bool
        Forwarded as ``enable_gpus``.

    Returns
    -------
    driverlessai._experiments.Experiment
        Whatever ``experiments.create`` returns.
    '''
    print('----- start experiment : get_experiment -----')
    # Settings are passed straight through as keyword arguments.
    return daiobj.experiments.create(
        train_dataset=dataobj,
        target_column=target_column,
        task=task,
        drop_columns=drop_columns,
        enable_gpus=enable_gpus,
    )

In [18]:
# Load the experiment parameter table; each row describes one dataset to run
# (data name, S3 URL, target column, task, columns to drop, GPU flag, try count).
# NOTE(review): `expperiments` in the variable name and `spped_test` in the path
# look like typos; they are left untouched because later cells and the on-disk
# directory depend on these exact spellings.
df_expperiments_info = pd.read_csv('../Management/spped_test/Experiments_Params.csv')
df_expperiments_info

Unnamed: 0,try_n,data_name,s3url,target_column,task,drop_columns,enable_gpus
0,1,BostonHousing.csv,s3://h2oai-jp-public/sample_data/boston_house/...,MEDV,regression,,False
1,1,UCI_Credit_Card3.csv,s3://h2oai-jp-public/sample_data/UCI_CreditCar...,default_payment_next_month,classification,"ID,LIMIT_BAL",False


In [31]:
# Inspect the column dtypes pandas inferred for the parameter table.
df_expperiments_info.dtypes

try_n             int64
data_name        object
s3url            object
target_column    object
task             object
drop_columns     object
enable_gpus        bool
dtype: object

In [102]:
# Driverless AI server information.
# NOTE(review): the server address is hard-coded and uses plain HTTP; consider
# moving it next to the credentials in configuration.
dai_address = 'http://3.82.142.224'
dai_password = idpass['pass11013']

# Run every dataset listed in the parameter table `try_n` times and write one
# JSON result file per try into the current working directory.
for _, row in df_expperiments_info.iterrows():
    #**********  Experiment parameters  **********#
    # Dataset information
    data_name = row['data_name']
    start_time = datetime.datetime.now().strftime('%Y年%m月%d日%H時%M分%S秒')
    print('#####-----  開始時間: ', start_time, '  -----#####')
    print('#####-----  利用データ: ', data_name, '  -----#####')

    # The settings below depend only on the row, so they are parsed once per
    # dataset rather than once per try.
    s3url = row['s3url']  # S3 location used when the data is not on DAI yet
    target_column = row['target_column']
    task = row['task']    # 'regression', 'classification', or 'unsupervised'

    raw_drop_columns = row['drop_columns']
    if pd.isna(raw_drop_columns):
        # No columns to drop. pd.isna is used instead of `is np.nan` because an
        # identity test only matches one particular NaN object.
        drop_columns = []
    else:
        # Comma-separated string -> list; surrounding blanks and empty entries
        # (e.g. from a trailing comma) are discarded.
        drop_columns = [name.strip() for name in raw_drop_columns.split(',') if name.strip()]

    # Honour the GPU flag from the parameter table instead of forcing False.
    enable_gpus = bool(row['enable_gpus'])

    for exp_try in range(row['try_n']):
        print('#####-----  Try: ', exp_try, '  -----#####')

        print('*************** DAIへ接続 ***************')
        dai = get_dai_client(daiaddress=dai_address, daipassword=dai_password)
        print(type(dai))
        print('DAIバージョン: {}'.format(dai.server.version))

        print('*************** データの取得 ***************')
        ds = get_dataset(daiobj=dai, dataname=data_name, dataurl=s3url)

        print(type(ds))
        print('Dataサイズ(byte): {}'.format(ds.file_size))
        print('Dataサイズ(mega byte): {}'.format(ds.file_size/1024**2))
        print('Data shape: {}'.format(ds.shape))

        print('*************** Experimentの実施 ***************')
        ex = get_experiment(daiobj=dai, dataobj=ds,
                            target_column=target_column, task=task,
                            drop_columns=drop_columns, enable_gpus=enable_gpus)

        # Report the experiment object's type (this previously printed type(ds)).
        print(type(ex))
        print('学習時間（sec）：{}'.format(ex.run_duration))
        print('学習時間（min）：{}'.format(ex.run_duration/60))
        print('Experimentサイズ（byte）：{}'.format(ex.size))
        print('Experimentサイズ（mega byte）：{}'.format(ex.size/1024**2))
        print('********** Experiment Summary **********')
        ex.summary()

        save_dict = dict(Data_Name=data_name,
                         Try=row['try_n'],
                         # 0-based index of this try, so repeated runs of the
                         # same dataset can be told apart.
                         Try_Index=exp_try,
                         Datasize_mb=ds.file_size/1024**2,
                         N_Observation=ds.shape[0],
                         # Columns minus dropped columns minus one
                         # (presumably the target column).
                         N_features=ds.shape[1] - len(drop_columns) - 1,
                         Duration_min=ex.run_duration/60,
                         Experiment_Size_mb=ex.size/1024**2,
                         Acc_Time_Interpret=(ex.settings['accuracy'], ex.settings['time'], ex.settings['interpretability'])
                         )
        # start_time is fixed per dataset, so the try index is part of the file
        # name; otherwise every try would overwrite the previous try's result.
        result_path = 'speedtest_{}_try{}.json'.format(start_time, exp_try)
        with open(result_path, 'w') as f:
            json.dump(save_dict, f, indent=4)

#####-----  開始時間:  2022年03月07日17時30分45秒   -----#####
#####-----  利用データ:  BostonHousing.csv   -----#####
#####-----  Try:  0   -----#####
*************** DAIへ接続 ***************
----- start server connection : get_dai_client -----
<class 'driverlessai._core.Client'>
DAIバージョン: 1.10.1.3
*************** データの取得 ***************
----- start get data : get_dataset -----
Uploaded data name : key >>  {'UCI_Credit_Card3.csv': 'e04af12e-9dec-11ec-a1da-0242ac110002', 'walmart_ts_6_fcst_grp_train.csv': '4d2ad106-9b86-11ec-8e63-0242ac110002', 'walmart_ts_6_fcst_grp_test.csv': '4d2a15fe-9b86-11ec-8e63-0242ac110002', 'kaggle_train_index.csv': '7477e2b4-9369-11ec-ae6e-0242ac110002', 'kaggle_train2_int.csv': '757ae702-9368-11ec-ae6e-0242ac110002', 'BostonHousing.csv': '653b25c2-91de-11ec-8bed-0242ac110002'}
Data is already uploaded in DAI
<class 'driverlessai._datasets.Dataset'>
Dataサイズ(byte): 57768
Dataサイズ(mega byte): 0.05509185791015625
Data shape: (506, 14)
*************** Experimentの実施 ***************

In [78]:
# Scratch cell: recompute the logged metrics from variables left in the kernel
# by the experiment loop (data_name, row, ds, drop_columns, ex must already exist).
log_dataname = data_name
log_try_n = row['try_n']
log_datasize_mb = ds.file_size/1024**2     # source data size (MB)
log_n_observation = ds.shape[0]    # number of observations
log_n_features = ds.shape[1] - len(drop_columns) - 1    # number of features
log_duration_min = ex.run_duration/60    # run time (minutes)
log_expsize_mb = ex.size/1024**2     # Experiment size (MB)
log_accuracy_time_interpretability = ex.settings['accuracy'], ex.settings['time'], ex.settings['interpretability']   # accuracy, time, interpretability

print(log_datasize_mb)
print(log_n_observation)
print(log_n_features)
print(log_duration_min)
print(log_expsize_mb)
print(log_accuracy_time_interpretability)

0.05509185791015625
506
13
4.029468584060669
1073.3076963424683
(5, 4, 6)


In [86]:
# Collect the metrics of the most recent experiment into one dict.
# Relies on data_name, row, ds, drop_columns and ex already being defined in
# the kernel by the experiment loop.
save_dict = {
    'Data_Name': data_name,
    'Try': row['try_n'],
    'Datasize_mb': ds.file_size/1024**2,
    'N_Observation': ds.shape[0],
    'N_features': ds.shape[1] - len(drop_columns) - 1,
    'Duration_min': ex.run_duration/60,
    'Experiment_Size_mb': ex.size/1024**2,
    'Acc_Time_Interpret': (
        ex.settings['accuracy'],
        ex.settings['time'],
        ex.settings['interpretability'],
    ),
}
save_dict

{'Data_Name': 'BostonHousing.csv',
 'Try': 1,
 'Datasize_mb': 0.05509185791015625,
 'N_Observation': 506,
 'N_features': 13,
 'Duration_min': 4.029468584060669,
 'Experiment_Size_mb': 1073.3076963424683,
 'Acc_Time_Interpret': (5, 4, 6)}

In [101]:
# Timestamp (to the second) used to name the result file.
# Note: this rebinds the `start_time` name that the experiment loop also uses.
start_time = datetime.datetime.now().strftime('%Y年%m月%d日%H時%M分%S秒')
print(start_time)

# Write save_dict to speedtest_<timestamp>.json in the current working directory.
with open('speedtest_{}.json'.format(start_time), 'w') as f:
    json.dump(save_dict, f, indent=4)

2022年03月07日17時27分58秒


In [100]:
# Scratch check of the timestamp format used in the result file names.
datetime.datetime.now().strftime('%Y年%m月%d日%H時%M分%S秒')

'2022年03月07日17時27分40秒'