In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import random
import sys

from sklearn.model_selection import ParameterSampler
from scipy.stats import randint as sp_randint
from scipy.stats import uniform

from functions import (
    under_over_sampler,
    classifier_train,
    classifier_train_manual,
    make_generic_df,
    get_xy_from_df,
    plot_precision_recall_vs_threshold,
    plot_precision_vs_recall,
)

from classification_methods import (
    random_forest_classifier,
    knn_classifier,
    # logistic_regression,
    # sgd_classifier,
    # ridge_classifier,
    # svm_classifier,
    # gaussian_nb_classifier,
    xgboost_classifier,
)

# stop warnings from sklearn
# https://stackoverflow.com/questions/32612180/eliminating-warnings-from-scikit-learn
def warn(*args, **kwargs):
    pass


import warnings

warnings.warn = warn

Using TensorFlow backend.


In [2]:
"test"

'test'

In [3]:
list(range(1, 8))

[1, 2, 3, 4, 5, 6, 7]

In [4]:
# INITIAL PARAMETERS

# select model by commenting/un-commenting classifier
classifier_list_all = [
    random_forest_classifier,
#     knn_classifier,
#     logistic_regression,
#     sgd_classifier,
#     ridge_classifier,
#     svm_classifier,
#     gaussian_nb_classifier,
#     xgboost_classifier,
]


# select over/under-sampling method
over_under_sampling_methods = [
#     "random_over",
    "random_under",
#     "random_under_bootstrap",
#     "smote",
#     "adasyn",
#     None,
]

# select which indices to use
index_list = [
#     list(range(0, 10)),
#     list(range(1, 10)),
#     list(range(1, 9)),
    list(range(1, 8)),
#     list(range(2, 8)),
#     list(range(3, 7)),
#     list(range(2, 9)),
#     list(range(2, 10)),
]

# select the scaler method
scaler_methods = [
#     "standard", 
    "min_max"
]

# select the imbalance ratio
imbalance_ratios = [
#     0.1,
    0.5,
#     0.8,
#     1
]

# select if the feature set is averaged or not
average_across_indices = [
    True,
#     False
]

# the integer that is used as the random number in the classifier
# parameter sampler
parameter_sampler_int = [
    11475, 11
]

# features used in model (includes 'failed' column, but this is dropped, don't worry)
feat_list = ['rms_current','failed']

# other default parameters that do not need to be touched
tool_list_all = [54]
tool_list_some = []

In [5]:
# k-fold splits
train_fold_1 = [
    "2018-11-21", 
    "2019-01-25", 
    "2019-01-28", # failures
    "2019-11-27", # failures
    "2019-01-23", # failures, from Jan without speed
    "2019-05-03",
    "2019-09-11", # failures
    "2019-09-13",
    ]

train_fold_2 = [
    "2019-01-29", # failures
    "2019-01-30", # failures
    "2019-02-01",
    "2019-02-08", # failures
    "2019-09-10",
    "2019-09-12",
    "2018-11-20",
    "2019-02-11",
    "2019-01-24", # i forgot this one earlier
    "2019-05-04",
    "2018-11-16",
    "2018-11-19",
]

train_fold_3 = [
    "2019-02-04", # failures
    "2019-02-05", 
    "2019-02-07", # failures
    "2019-05-06",
    "2019-01-22", # from Jan without speed 
    "2018-10-23",
    "2018-11-15", # failures 
    ]

train_folds = [train_fold_1, train_fold_2, train_fold_3]
train_dates_all = [date for sublist in train_folds for date in sublist]

In [6]:
file_folder = Path(
    "/home/tim/Documents/Checkfluid-Project/data/processed/_tables/low_levels_labels_created_2020-03-11"
)

file = file_folder / "low_level_labels_created_2020.03.11_v3_updated_2020.08.06.csv"

df = pd.read_csv(file)

# sort the values by date and index so that it is reproducible
df = df.sort_values(by=["unix_date", "tool", "index"])

# replace NaN's in failed columns with 0
df["failed"].fillna(
    0, inplace=True, downcast="int"
)  # replace NaN in 'failed' col with 0

# function to convert pandas column to datetime format
def convert_to_datetime(cols):
    unix_date = cols[0]
    value = datetime.fromtimestamp(unix_date)
    return value


# apply 'date_ymd' column to dataframe
df["date"] = df[["unix_date"]].apply(convert_to_datetime, axis=1)
# convert to a period, and then string
df["date_ymd"] = pd.to_datetime(df["date"], unit="s").dt.to_period("D").astype(str)


# create train set
df_train = df[df["date_ymd"].isin(train_dates_all)].reset_index(drop=True).copy()

In [7]:
parameters_sample_dict = {
    "no_tools": [0], # only select one tool (tool 54)
    "classifier_used": classifier_list_all,
    "average_across_index": average_across_indices,
    "uo_method": over_under_sampling_methods,
    "scaler_method": scaler_methods,
    "parameter_sampler_random_int": parameter_sampler_int,
    "imbalance_ratio": imbalance_ratios,
    "index_list": index_list,
}

In [8]:
no_iterations = 2
sampler_seed = 1

# generate the list of parameters to sample over
p_list = list(
    ParameterSampler(
        parameters_sample_dict, n_iter=no_iterations, random_state=sampler_seed
    )
)


In [9]:
p_list

[{'uo_method': 'random_under',
  'scaler_method': 'min_max',
  'parameter_sampler_random_int': 11475,
  'no_tools': 0,
  'index_list': [1, 2, 3, 4, 5, 6, 7],
  'imbalance_ratio': 0.5,
  'classifier_used': <function classification_methods.random_forest_classifier(parameter_sampler_random_int)>,
  'average_across_index': True},
 {'uo_method': 'random_under',
  'scaler_method': 'min_max',
  'parameter_sampler_random_int': 11,
  'no_tools': 0,
  'index_list': [1, 2, 3, 4, 5, 6, 7],
  'imbalance_ratio': 0.5,
  'classifier_used': <function classification_methods.random_forest_classifier(parameter_sampler_random_int)>,
  'average_across_index': True}]

In [10]:
for k, p in enumerate(p_list):

    # set random.seed
    random.seed(p["parameter_sampler_random_int"])

    # get specific parameters
    clf_name = str(p["classifier_used"]).split(" ")[1]

    tool_list = sorted(
        random.sample(tool_list_some, p["no_tools"])
        + [54])


    indices_to_keep = p["index_list"]
    to_avg = p["average_across_index"]
    uo_method = p["uo_method"]

    # if svm, need to prevent too large a dataset, thus will only use undersampling
    if clf_name == "svm_classifier":
        uo_method = random.sample(["random_under", "random_under_bootstrap"], 1)

    imbalance_ratio = p["imbalance_ratio"]
    scaler_method = p["scaler_method"]
    parameter_sampler_random_int = p["parameter_sampler_random_int"]
    clf_function = p["classifier_used"]

    # build dictionary to store parameter results and other info
    parameter_values = {
        "clf_name": clf_name,
        "tool_list": tool_list,
        "feat_list": feat_list,
        "indices_to_keep": indices_to_keep,
        "info_no_samples": None,
        "info_no_failures": None,
        "info_no_feat": len(feat_list),
        "to_average": to_avg,
        "uo_method": uo_method,
        "imbalance_ratio": imbalance_ratio,
        "scaler_method": scaler_method,
        "parameter_sampler_seed": parameter_sampler_random_int,
        "initial_script_seed": sampler_seed,
    }


    # prepare the data table
    X_train, y_train, df_ymd_only = get_xy_from_df(
        df_train,
        tool_list=tool_list,
        indices_to_keep=indices_to_keep,
        to_average=to_avg,
        generic_feat_list=feat_list,
    )

    # check if empty X_train
    len_data = len(y_train)
    print(len_data)
    # check if not enough labels in y_train
    no_label_failed = np.sum(y_train)

    seed_indexer = 0
#     while len_data < 20 or no_label_failed < 15:
#         random.seed(p["parameter_sampler_random_int"] + seed_indexer)
#         tool_list = sorted(
#             random.sample(tool_list_some, p["no_tools"])
#             + random.sample([54], random.randint(1, 2))
#         )

#         X_train, y_train, df_ymd_only = get_xy_from_df(
#             df_train,
#             tool_list=tool_list,
#             indices_to_keep=indices_to_keep,
#             to_average=to_avg,
#             generic_feat_list=feat_list,
#         )

#         parameter_values["tool_list"] = tool_list

#         len_data = len(y_train)
#         no_label_failed = np.sum(y_train)
#         seed_indexer += 1

    parameter_values["info_no_samples"] = len_data
    parameter_values["info_no_failures"] = no_label_failed

    # save the general parameters values
    df_gpam = pd.DataFrame.from_dict(parameter_values, orient="index").T

    # instantiate the model
    clf, classifier_parameters = clf_function(parameter_sampler_random_int)

    # save classifier parameters into dataframe
    df_cpam = pd.DataFrame.from_dict(classifier_parameters, orient="index").T

    # train the model
    try:
        result_dict, _, _ = classifier_train_manual(
            X_train,
            y_train,
            df_ymd_only,
            train_folds,
            clf,
            scaler_method=scaler_method,
            uo_sample_method=uo_method,
            imbalance_ratio=imbalance_ratio,
            train_on_all=False,
            print_results=False,
        )

        df_result_dict = pd.DataFrame.from_dict(result_dict, orient="index").T
        # df_result_dict.astype("float16").dtypes

        if k == 0:
            df_results = pd.concat([df_gpam, df_cpam, df_result_dict], axis=1)
        else:
            df_results = df_results.append(
                pd.concat([df_gpam, df_cpam, df_result_dict], axis=1)
            )

#         # save directory for when on the HPC
#         save_directory = Path('/home/tvhahn/scratch/_temp_random_search_results')
#         # save_directory = Path("/home/tim/Documents/Checkfluid-Project/notebooks/1.9-tvh-feat-table/temp_results")

#         file_save_name = "temp_result_{}_{}_{}.csv".format(
#             str(date_time), str(sys.argv[1]), str(sampler_seed)
#         )
#         if k % 10 == 0:
#             df_results.to_csv(save_directory / file_save_name, index=False)

    except ValueError as err:
        print(err)
        print("#!#!#!#!#! SKIPPING")
        pass
    except:
        pass

# df_results.to_csv(save_directory / file_save_name, index=False)

5551
too many values to unpack (expected 3)
#!#!#!#!#! SKIPPING
5551
too many values to unpack (expected 3)
#!#!#!#!#! SKIPPING


In [11]:
classifier_parameters

{'RandomForestClassifier_bootstrap': False,
 'RandomForestClassifier_class_weight': 'balanced',
 'RandomForestClassifier_max_depth': 92,
 'RandomForestClassifier_min_samples_leaf': 2,
 'RandomForestClassifier_n_estimators': 444,
 'RandomForestClassifier_random_state': 17678}

In [12]:
df = df_results
df.to_csv('results_1.csv')

NameError: name 'df_results' is not defined

In [None]:
df.shape

In [None]:
dfr = df[(df['roc_auc_min']>0.01) & 
         (df['auc_min']>0.01)]

In [None]:
dfr

In [16]:
# column prefixes
col_prefix = ['SGDClassifier', 
              'KNeighborsClassifier', 
              'LogisticRegression', 
              'SVC', 
              'RidgeClassifier',
              'RandomForestClassifier', 
              'XGB', 'LogisticRegression']

primary_cols_sorted = ['clf_name',
                         'tool_list',
                         'feat_list',
                         'indices_to_keep',
                         'info_no_samples',
                         'info_no_failures',
                         'info_no_feat',
                         'to_average',
                         'uo_method',
                         'imbalance_ratio',
                         'scaler_method',
                         'parameter_sampler_seed',
                         'initial_script_seed',
                      ]

display_table_columns = ['clf_name',
 'tool_list',
 'feat_list',
'parameter_sampler_seed',
'initial_script_seed',
 'indices_to_keep',
'uo_method',
'imbalance_ratio',
'to_average',
'scaler_method',
 'auc_max',
 'auc_min',
 'auc_score',
 'auc_std',
 'f1_max',
 'f1_min',
 'f1_score',
 'f1_std',
 'precision',
 'precision_max',
 'precision_min',
 'precision_std',
 'recall',
 'recall_max',
 'recall_min',
 'recall_std',
 'roc_auc_max',
 'roc_auc_min',
 'roc_auc_score',
 'roc_auc_std', 
'train_dates_removed',
'auc_min_fold_train','auc_min_fold_test'       
]

model_parameter_columns = ['RandomForestClassifier_bootstrap',
       'RandomForestClassifier_class_weight',
       'RandomForestClassifier_max_depth',
       'RandomForestClassifier_min_samples_leaf',
       'RandomForestClassifier_n_estimators',
       'RandomForestClassifier_random_state',]

In [17]:
dfr = dfr.groupby(["clf_name"]).apply(lambda x: x.sort_values(["auc_score"], ascending = False)).reset_index(drop=True)
dfr = dfr.groupby('clf_name').head(1)[display_table_columns + model_parameter_columns]
dfr = dfr.groupby('clf_name').head(1)
dfr = dfr.sort_values(["auc_score"], ascending = False)
# dfr.to_csv('best_results.csv', index=False)
dfr

NameError: name 'dfr' is not defined

In [18]:
dfr.columns

NameError: name 'dfr' is not defined

In [41]:
dfr[['auc_max', 'auc_min',
       'auc_score', 'auc_std', 'f1_max', 'f1_min', 'f1_score', 'f1_std',
       'precision', 'precision_max',]]

Unnamed: 0,auc_max,auc_min,auc_score,auc_std,f1_max,f1_min,f1_score,f1_std,precision,precision_max
0,0.402098,0.304732,0.366959,0.0441237,0.248705,0.15,0.187205,0.0438048,0.111855,0.162162
