In [1]:
from azureml.core import Workspace

# Load the Azure ML workspace handle used by every later cell.
ws = Workspace.from_config()

# One "label: value" line per workspace attribute.
# NOTE(review): the subscription ID ends up in the saved cell output —
# consider clearing the output before sharing the notebook.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription ID: ' + ws.subscription_id,
    'Resource Group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

Workspace name: labuser15ml
Azure region: koreacentral
Subscription ID: f5bc93f2-df0a-4b1a-ab9e-2b0203fc7d26
Resource Group: rg15


In [2]:
from azureml.core import Experiment

# Create the experiment that the training runs below are logged to.
experiment = Experiment(
    workspace=ws,
    name='diabetes-experiment',
)

In [9]:
from azureml.opendatasets import Diabetes
from sklearn.model_selection import train_test_split

# Read the Diabetes open dataset in tabular form, convert it to a pandas
# DataFrame and drop every row that contains a missing value.
diabetes_dataset = Diabetes.get_tabular_dataset()
x_df = diabetes_dataset.to_pandas_dataframe().dropna()

# pop() removes the target column 'Y' from x_df and returns it, so x_df keeps
# only the remaining columns.
y_df = x_df.pop('Y')

# X: features, y: labels; *_train: training split, *_test: test split (20 %).
# The fixed random_state keeps the split reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    random_state=66,
)

In [11]:
import math
import os

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error  # basis for the RMSE logged below

try:
    # Standalone package. `sklearn.externals.joblib` was deprecated in
    # scikit-learn 0.21 and removed in 0.23, so prefer this import.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib

# Train one Ridge regression per regularisation strength. Each alpha gets its
# own run in the experiment, with the alpha value, the test RMSE and the
# pickled model recorded on that run.
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# joblib.dump does not create missing directories; make sure the target exists.
os.makedirs('outputs', exist_ok=True)

for alpha in alphas:
    run = experiment.start_logging()  # start recording a new run
    try:
        run.log('alpha_value', alpha)  # record the alpha used by this run

        model = Ridge(alpha=alpha)
        model.fit(X=X_train, y=y_train)

        # Root mean squared error on the held-out test split.
        y_pred = model.predict(X=X_test)
        rmse = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
        run.log('rmse', rmse)

        # NOTE: the 'apha' spelling is kept on purpose — artifacts already
        # stored on existing runs use this file name.
        model_name = 'model_apha_' + str(alpha) + '.pkl'  # .pkl: pickled sklearn model
        filename = 'outputs/' + model_name

        joblib.dump(value=model, filename=filename)  # write the model to disk
        run.upload_file(name=model_name, path_or_stream=filename)
    except Exception as exc:
        # Mark the run as failed instead of leaving it un-finalised, then
        # surface the original error to the notebook.
        run.fail(error_details=str(exc))
        raise
    else:
        run.complete()  # finish recording this run

    print(f'{alpha} exp completed')

0.1 exp completed
0.2 exp completed
0.3 exp completed
0.4 exp completed
0.5 exp completed
0.6 exp completed
0.7 exp completed
0.8 exp completed
0.9 exp completed
1 exp completed


In [12]:
# Bare expression: the notebook renders the experiment's summary table
# (name, workspace, report and docs links) as this cell's output.
experiment

Name,Workspace,Report Page,Docs Page
diabetes-experiment,labuser15ml,Link to Azure Machine Learning studio,Link to Documentation


In [14]:
# Find the run with the smallest error (minimum RMSE) and remember its id.
#
# Runs that never logged an 'rmse' metric (for example failed or interrupted
# runs) are skipped instead of aborting the search with a KeyError.

minimum_rmse_runid = None
minimum_rmse = None

for run in experiment.get_runs():
    run_rmse = run.get_metrics().get('rmse')
    if run_rmse is None:
        continue  # nothing to compare for this run

    # Strict '<' keeps the first run encountered when several share the minimum.
    if minimum_rmse is None or run_rmse < minimum_rmse:
        minimum_rmse = run_rmse
        minimum_rmse_runid = run.id  # avoids a get_details() round trip per run

if minimum_rmse_runid is None:
    # Fail with a clear message rather than a TypeError from 'str + None'.
    raise RuntimeError("No run in the experiment has logged an 'rmse' metric.")

print('Best run_id: ' + minimum_rmse_runid)
print('Best run_id rmse: ' + str(minimum_rmse))

Best run_id: ae268701-3a06-4278-a9d2-8f1602104748
Best run_id rmse: 56.60520331339142


In [17]:
from azureml.core import Run

# Re-attach to the best run by its id and list the files stored on it.
best_run = Run(
    experiment=experiment,
    run_id=minimum_rmse_runid,
)
best_run_files = best_run.get_file_names()
print(best_run_files)

['model_apha_0.1.pkl', 'outputs/model_apha_0.1.pkl']


In [18]:
# Download the best run's pickled model. The artifact is selected by its
# '.pkl' extension instead of relying on position 0 of the file listing,
# whose order is not under this notebook's control.
model_files = [name for name in best_run.get_file_names() if name.endswith('.pkl')]
if not model_files:
    raise FileNotFoundError('The best run has no uploaded .pkl model file to download.')
best_run.download_file(name=model_files[0])

In [19]:
import numpy as np
from azureml.core import Dataset

# Save the data used for the experiment (training features and labels) as
# comma-separated files in the working directory.
for csv_name, split in (('features.csv', X_train), ('labels.csv', y_train)):
    np.savetxt(csv_name, split, delimiter=',')

# Upload both files to the workspace's default datastore.
# NOTE(review): this cell's output reports `upload_files` as deprecated after
# SDK 1.0.69 in favour of `FileDatasetFactory.upload_directory`.
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=['./features.csv', './labels.csv'],
    target_path='diabetes-experiment/',
    overwrite=True,  # replace files that already exist at the target
)

Uploading an estimated of 2 files
Uploading ./features.csv
Uploaded ./features.csv, 1 files out of an estimated total of 2
Uploading ./labels.csv
Uploaded ./labels.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_bf174352721540b39dfcf1b7bff71aa4

"datastore.upload_files" is deprecated after version 1.0.69. Please use "FileDatasetFactory.upload_directory" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.
