In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, roc_auc_score

from utils import load, pprint, root_mean_squared_error

In [2]:
# Dataset folders follow the pattern check_<task>_<problem>; a problem suffix
# of 'r' selects regression, any other suffix is treated as classification.
folders = [
    'check_1_r',
    'check_2_r',
    'check_3_r',
    'check_4_c',
    'check_5_c',
    'check_6_c',
    'check_7_c',
    'check_8_c',
]

# Build, per dataset, the shell command that trains a model and the one that
# writes predictions. The commands are only assembled here; the print calls
# that would emit them as a shell script are left commented out below.
for folder in folders:
    # 'check_1_r' -> ['check', '1', 'r'] -> task='1', problem='r'
    task, problem = folder.split('_')[1:]
    if problem == 'r':
        mode = 'regression'
    else:
        mode = 'classification'
    output_dir = f'./t{task}'

    train_path = f'~/cnt/sdsj2018-automl/data/{folder}/train.csv'
    cmd_train = (
        f'mkdir {output_dir}; '
        f'python train.py --mode {mode} '
        f'--train-csv {train_path} --model-dir {output_dir}'
    )

    test_path = f'~/cnt/sdsj2018-automl/data/{folder}/test.csv'
    prediction_path = f'{output_dir}/prediction.csv'
    cmd_predict = (
        f'python predict.py --test-csv {test_path} '
        f'--prediction-csv {prediction_path} --model-dir {output_dir}'
    )

#     print(f'echo Task {task}')
#     print('echo ------------------------------------')
#     print(cmd_train)
#     print(cmd_predict)
#     print('echo ----------***************-----------')
#     print()

### Baseline

In [3]:
def score(solution, task):
    """Compute the evaluation metric of a stored solution on one task.

    Reads the solution's ``prediction.csv`` for the task and compares its
    ``prediction`` column with the ``target`` column of the test targets
    returned by ``load(task, 'test-target')``.

    Args:
        solution: Name of the solution directory under
            ``~/cnt/sdsj2018-automl/solution`` (e.g. ``'baseline'``).
        task: Task number; it is compared numerically against 4 to pick the
            metric and is interpolated into the ``t{task}`` directory name.

    Returns:
        RMSE when ``task < 4``, ROC AUC otherwise.
    """
    solution_path = f'~/cnt/sdsj2018-automl/solution/{solution}/t{task}/prediction.csv'
    prediction = pd.read_csv(solution_path, index_col=0)
    _, target = load(task, 'test-target')

    y_true = target['target']
    y_pred = prediction['prediction']
    # NOTE(review): presumably predictions and targets share the same row
    # order; nothing here aligns them by index -- confirm against `load`.

    # Both sklearn metrics take (y_true, y_pred); pass them in that order for
    # both branches so the two calls are consistent.
    if task < 4:
        return np.sqrt(mean_squared_error(y_true, y_pred))
    return roc_auc_score(y_true, y_pred)

In [4]:
# Print the baseline solution's metric for each of the eight tasks:
# RMSE for tasks 1-3, ROC AUC for tasks 4-8.
for task in range(1, 9):
    if task < 4:
        metric = 'RMSE'
    else:
        metric = 'ROC_AUC'
    value = score('baseline', task)
    print(f'Task {task}: {metric}={value}')

Task 1: RMSE=11.449004887101944
Task 2: RMSE=1.6338020824320976
Task 3: RMSE=118263.65765372704
Task 4: ROC_AUC=0.8618950450488363
Task 5: ROC_AUC=0.7717664972379902
Task 6: ROC_AUC=0.6551205066635908
Task 7: ROC_AUC=0.7269517697683701
Task 8: ROC_AUC=0.8627469794780536


### Trees
Total training + scoring time: ~30 min

In [7]:
# Print the 'trees' solution's metric for each task together with its
# difference from the baseline (trees minus baseline).
for task in range(1, 9):
    metric = 'ROC_AUC' if task >= 4 else 'RMSE'
    value = score('trees', task)
    baseline_value = score('baseline', task)
    diff = value - baseline_value
    print(
        f'Task {task}: {metric}={value} \n'
        f'Difference: {diff}\n'
    )

Task 1: RMSE=9.738509028456058 
Difference: -1.7104958586458867

Task 2: RMSE=1.6374022353879505 
Difference: 0.00360015295585292

Task 3: RMSE=118265.1477466668 
Difference: 1.4900929397554137

Task 4: ROC_AUC=0.7698038617332088 
Difference: -0.09209118331562749

Task 5: ROC_AUC=0.7737069437580545 
Difference: 0.0019404465200643095

Task 6: ROC_AUC=0.653936195438504 
Difference: -0.001184311225086776

Task 7: ROC_AUC=0.8340454679904876 
Difference: 0.10709369822211756

Task 8: ROC_AUC=0.8808520114337872 
Difference: 0.018105031955733653

