In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [2]:
# Directory holding the train/valid text files, relative to the working directory.
_DATA_ROOT = './data/open_data_train_valid'

# Reference only (not executed) - presumably the header layout of the data files:
# columns = ['id','loan_dt','label','tag']+['f%d'%(i+1)for i in range(6745)]
train_path_1 = _DATA_ROOT + '/train/train_1.txt'
# Another training shard, currently not loaded:
# train_path_5 = './data/open_data_train_valid/train/train_5.txt'
valid_path = _DATA_ROOT + '/valid.txt'
valid_id_path = _DATA_ROOT + '/valid_id.txt'

In [3]:
%%time
# Read the tab-separated training shard into a DataFrame.
train_1 = pd.read_csv(train_path_1, sep='\t')

CPU times: user 28.7 s, sys: 3.3 s, total: 32 s
Wall time: 34.4 s


In [5]:
%%time
# Read the tab-separated validation table and the separate validation-id file.
valid = pd.read_csv(valid_path, sep='\t')
valid_id = pd.read_csv(valid_id_path, sep='\t')

CPU times: user 32.8 s, sys: 1.2 s, total: 34 s
Wall time: 36.4 s


## Define the pipeline function

In [10]:
from sklearn.externals import joblib
import autosklearn.classification
import os

def _values_of(data):
    """Return ``data.values`` for pandas objects, or ``data`` itself otherwise."""
    return getattr(data, 'values', data)


def pipeline(flag, train_x, train_y, test_x, test_id):
    '''
    Fit (or reload) an auto-sklearn classifier, score the test set and write
    the result to ``./preds/auto_pred.csv``.

    input :
    flag(bool): True  -> load a previously saved model from ``automl.pkl``
                False -> fit a new AutoSklearnClassifier and save it to ``automl.pkl``
    train_x(array or DataFrame): training features (only read when flag is False)
    train_y(array or Series): training labels (only read when flag is False)
    test_x(array or DataFrame): test features; a DataFrame may still contain an
        ``id`` column, which is removed before prediction
    test_id(array, Series or None): ids written next to each probability; may be
        None only when test_x is a DataFrame that has an ``id`` column

    output :
    None. The CSV has the columns ``id`` and ``prob``, where ``prob`` is
    column 1 of ``predict_proba``. When flag is True and ``automl.pkl`` does
    not exist, a message is printed and nothing else happens.

    raises :
    ValueError: no ids are available (test_id is None and test_x has no ``id`` column)
    '''
    if flag and not os.path.exists('automl.pkl'):
        print('automl.pkl 不存在！')
        return

    # Resolve the ids and the feature matrix up front, so bad inputs are
    # reported before the (potentially hours-long) fit instead of after it.
    if isinstance(test_x, pd.DataFrame) and 'id' in test_x.columns:
        if test_id is None:
            test_id = test_x['id']
        test_x = test_x.drop(columns=['id'])
    if test_id is None:
        raise ValueError("test_id is required when test_x has no 'id' column")
    test_features = _values_of(test_x)
    ids = _values_of(test_id)  # plain values: avoids index alignment in the result frame

    if flag:
        # joblib.load unpickles the file: only load an automl.pkl you trust.
        automl = joblib.load('automl.pkl')
        print('模型加载完毕！')
    else:
        os.makedirs('tmp_folder', exist_ok=True)
        os.makedirs('output_folder', exist_ok=True)

        automl = autosklearn.classification.AutoSklearnClassifier(
            time_left_for_this_task=18000,
            per_run_time_limit=1800,
            ml_memory_limit=16480,
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            shared_mode=True,
            tmp_folder='./tmp_folder',
            # Must match the directory created above (the original literal
            # carried a trailing space and so named a different directory).
            output_folder='./output_folder',
        )
        print('开始训练！')
        automl.fit(_values_of(train_x), _values_of(train_y))
        print('保存模型')
        joblib.dump(automl, 'automl.pkl')  # persist the fitted model

    print(automl.show_models())
    pred = automl.predict_proba(test_features)

    test_result = pd.DataFrame({'id': ids, 'prob': pred[:, 1]}, columns=['id', 'prob'])

    os.makedirs('preds', exist_ok=True)
    test_result.to_csv('./preds/auto_pred.csv', index=False)  # save the predictions

In [None]:
# Pass pandas objects rather than bare ndarrays: pipeline() reads `.values` from
# the training inputs and strips the `id` column from `test_x` itself.
train_y = train_1.label
train_x = train_1.drop(columns=['id', 'label'])
test_id = valid.id
test_x = valid  # still contains `id`; pipeline() removes it before predicting

# Fail fast: a train/test feature mismatch would otherwise only surface at
# prediction time, after the whole fit has already run.
expected_features = list(valid.columns.drop('id'))
if list(train_x.columns) != expected_features:
    raise ValueError(
        'train/valid feature columns differ: %d train vs %d valid'
        % (train_x.shape[1], len(expected_features))
    )

pipeline(False, train_x, train_y, test_x, test_id)