 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Helper-Functions" data-toc-modified-id="Helper-Functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Helper Functions</a></span></li><li><span><a href="#Data-Management" data-toc-modified-id="Data-Management-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Data Management</a></span></li><li><span><a href="#Data-Cleaning" data-toc-modified-id="Data-Cleaning-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Data Cleaning</a></span></li><li><span><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Feature Engineering</a></span></li><li><span><a href="#Dimension-Reduction" data-toc-modified-id="Dimension-Reduction-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Dimension Reduction</a></span></li><li><span><a href="#Data-Preparation" data-toc-modified-id="Data-Preparation-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Data Preparation</a></span></li><li><span><a href="#Model-Training" data-toc-modified-id="Model-Training-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Model Training</a></span></li><li><span><a href="#Kaggle-Prediction" data-toc-modified-id="Kaggle-Prediction-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Kaggle Prediction</a></span></li></ul></div>

# Setup

In [None]:
# Global Data Science Packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Model Settings
SETTINGS = {
    # VERSION indicates the model version
    "VERSION": 9,
    
    # TARGET_POSITIVE_PERCENT determines the desired mix of positive labels in the training set.
    # Since the raw data only contains 5% positive values, this can be beneficial in two ways:
    # 1) Reduces the data set
    # 2) Balances the positive outcomes
    "TARGET_POSITIVE_PERCENT": 0.15,
    
    # Indicates the cross_validation count. Mostly this is just used to manage runtime.
    "SPLIT_COUNT": 3,
    
    # Shall we actually save submission, or are we testing the pipeline?
    "SUBMITTING": True
}

# Helper Functions

In [None]:
import datetime

# The logger class is used to save logs in an ordered manner
# After instantiation, call .log(string) to both print a string and
# append it to a master string. This master string will be used
# by the final submission function to save the log data.

class Logger():
    """Echoes messages to stdout and accumulates them in one timestamped string.

    The accumulated string is retrieved with get_log() so that it can be
    written out alongside a submission.
    """

    def __init__(self):
        # Master log; every call to log() appends one timestamped entry.
        self.log_str = ""

    def log(self, string):
        """Print `string` and append it to the master log.

        Each entry is stored as "\\n(hour::minute::second)\\n" followed by the message.
        Non-string values are converted with str(), so scores and counts can
        be logged directly.
        """
        # Coerce first: print() accepts any object, but the concatenation
        # below would raise TypeError for a non-string message.
        string = str(string)
        print(string)
        date = datetime.datetime.now()
        date_str = str(date.hour) + "::" + str(date.minute) + "::" + str(date.second)
        self.log_str = self.log_str + "\n(" + date_str + ")\n" + string

    def get_log(self):
        """Return everything logged so far as a single string."""
        return self.log_str

# Instantiate a global Logger: L
L = Logger()

In [None]:
import os
import pprint

# record_attempt is used at the end of the pipeline to do the following:
# 1) Setup a new timestamped directory
# 2) Save the master Log (we are referencing the global logger - bad programming!)
# 3) Pickle and save the submissions and the pipeline
# 4) Save CSVs of the final submission.

def record_attempt(columns, df_submission, pipeline):
    """Persist one submission attempt into a new timestamped directory.

    Writes, under "../submissions/<timestamp>/":
      * details__<timestamp>.txt  - SETTINGS, the pretty-printed pipeline and the global log
      * model_<col>__<timestamp>.csv - one "id,target" CSV per entry of `columns`
      * pipeline__<timestamp>.pkl and predictions__<timestamp>.pkl

    Parameters
    ----------
    columns : iterable of str
        Names of the prediction columns in `df_submission` to export as CSV
        ("id" is skipped if present).
    df_submission : pandas.DataFrame
        Must have "id" as its first column and exactly 892816 rows.
    pipeline : object
        Pretty-printed into the details file and pickled as-is.

    Raises
    ------
    AssertionError
        If `df_submission` does not have the expected shape/first column.
    OSError
        If the target directory already exists or cannot be created.
    """
    # NOTE(review): 892816 is presumably the row count of the competition test set - confirm.
    assert(df_submission.shape[0] == 892816)
    assert(df_submission.columns[0] == "id")

    date = datetime.datetime.now()
    date_str = "_".join(
        str(part)
        for part in (date.year, date.month, date.day, date.hour, date.minute, date.second)
    )

    # Make directory:
    directory = "../submissions/" + date_str + "/"
    os.makedirs(directory)

    # `with` guarantees the details file is closed even if a write fails.
    with open(directory + "details__" + date_str + ".txt", "w") as file:
        file.write("\n\nSETTINGS:")
        for key in SETTINGS:
            file.write("\n\t" + str(key) + " : " + str(SETTINGS[key]))

        file.write("\n\nPIPELINE:")
        file.write("\n" + str(pprint.pformat(pipeline)))

        # References the global logger instance L; the original `if (L):` guard
        # was always true for a Logger object, so it is dropped.
        file.write("\n\n" + L.get_log())

    for col in columns:
        if col != "id":
            df_submission[["id", col]].to_csv(
                directory + "model_" + col + "__" + date_str + ".csv",
                header=["id", "target"],
                index=False,
            )

    pd.to_pickle(pipeline, directory + "pipeline__" + date_str + ".pkl")
    pd.to_pickle(df_submission, directory + "predictions__" + date_str + ".pkl")

    print("Recorded to " + directory)
    

In [None]:
from sklearn.metrics import make_scorer

# gini and gini_normalized are used to score various models. These functions were provided by the Kaggle competition.
# gini_normalized divides the model's gini by the gini of a perfect ranking, so a perfect ranking scores 1.0. This is the same score you get on Kaggle.
# At the end we create gini_scorer, which can be used by sklearn estimators.

def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini score of `pred` as a ranking of `actual`.

    Rows are ordered by descending prediction, with ties broken by original
    position, and the cumulative share of `actual` captured along that
    ordering is compared with the (n + 1) / 2 baseline.

    Parameters
    ----------
    actual : sequence of numbers
        Ground-truth values (e.g. 0/1 labels).
    pred : sequence of numbers
        Predicted scores; only their ordering matters.
    cmpcol, sortcol : int
        Unused; kept so the signature stays compatible with existing callers.

    Returns
    -------
    float
        The Gini score. The result is NaN if `actual` sums to zero.
    """
    assert( len(actual) == len(pred) )
    # Builtin `float` replaces `np.float`, which was deprecated in NumPy 1.20
    # and removed in 1.24. The local is named `table` to avoid shadowing the
    # builtin `all`.
    table = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=float)
    # lexsort uses the LAST key as the primary one: descending prediction,
    # then ascending original position.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    totalLosses = table[:,0].sum()
    giniSum = table[:,0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def gini_normalized(a, p):
    """Return gini(a, p) scaled by the gini of a perfect ranking, gini(a, a)."""
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini
    
gini_scorer = make_scorer(gini_normalized)

# Data Management

In [None]:
# Load the data

df_train = pd.read_csv("../input/train.csv")
df_submit = pd.read_csv("../input/test.csv")

# Data Cleaning

In [None]:
# TransformerMixin is used to create pipeline steps
# Note, my Transformers rely on the data being in dataframes, and therefore have column names
# These CANNOT be used in the standard sklearn Pipeline. I may refactor this if I have time.

from sklearn.base import TransformerMixin

In [None]:
from sklearn.preprocessing import Imputer

# NumericImputer conducts 2 transformations:
# 1) Creates a new column "[feature]_filled" indicating that the original feature was not null (i.e. not -1)
# 2) Performs a simple mean imputation on the column

class NumericImputer(TransformerMixin):
    """Imputes the -1 sentinel in numeric columns and flags which rows were present.

    For every configured column, transform():
      * adds "<col>_filled", True where the original value was not -1;
      * replaces -1 using an sklearn Imputer fitted on that column
        (default strategy) during fit().

    Parameters
    ----------
    columns : list of str
        Names of the DataFrame columns to impute.
    """

    def __init__(self, columns):
        self.columns = columns

    @staticmethod
    def _as_column(series):
        # One-column 2-D view for sklearn. `.values` is required because
        # Series.reshape no longer exists in modern pandas.
        return series.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Fit one Imputer per configured column of DataFrame `X`; returns self."""
        self.imputers = {}
        for col in self.columns:
            self.imputers[col] = Imputer(missing_values=-1).fit(self._as_column(X[col]))
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with imputed columns and the added "_filled" flags."""
        df_return = X.copy()
        for col in self.columns:
            # The flag must be computed before the sentinel is overwritten below.
            df_return[col + "_filled"] = np.logical_not(df_return[col] == -1)
            # Flatten the (n, 1) imputer output so a plain 1-D column is assigned.
            df_return[col] = self.imputers[col].transform(self._as_column(X[col])).ravel()

        return df_return

In [None]:
from sklearn.preprocessing import LabelBinarizer

# CategoricalEncoder dummy-encodes categorical values (the first class of each column is dropped).
# I found a bug in the LabelBinarizer which made this a bit messy.

class CategoricalEncoder(TransformerMixin):
    """Dummy-encodes DataFrame columns with one LabelBinarizer per column.

    The first class of every column is treated as the reference level and gets
    no indicator column; each remaining class becomes "<col>_<class>".
    The original columns are removed from the output.

    Parameters
    ----------
    columns : list of str
        Names of the DataFrame columns to encode.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Fit one LabelBinarizer per configured column of DataFrame `X`; returns self."""
        self.encoders = {}
        for col in self.columns:
            self.encoders[col] = LabelBinarizer().fit(X[col])

        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with the configured columns replaced by indicator columns."""
        df_return = X.copy()
        for col in self.columns:
            encoder = self.encoders[col]
            # One row per indicator after the transpose.
            encodings = encoder.transform(X[col]).T
            # LabelBinarizer emits a single indicator for two classes but one
            # per class otherwise; in the latter case drop the first so both
            # layouts leave out the reference class.
            if encodings.shape[0] != 1:
                encodings = encodings[1:]
            # zip() pairs each remaining class with its indicator row. A column
            # with a single class at fit time has no non-reference class, so it
            # yields no indicator columns instead of raising IndexError on
            # classes_[1].
            for class_label, encoding in zip(encoder.classes_[1:], encodings):
                df_return[col + "_" + str(class_label)] = encoding

        return df_return[[name for name in df_return.columns if name not in self.columns]]

# Feature Engineering

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# CustomPoly is a PolynomialFeatures wrapper that works with DataFrame (this is probably overkill)

class CustomPoly(TransformerMixin):
    """Thin wrapper around PolynomialFeatures(interaction_only=True) that returns a DataFrame."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Fit the interaction-only expansion on `X`; returns self."""
        expander = PolynomialFeatures(interaction_only=True)
        self.poly = expander.fit(X)
        return self

    def transform(self, X, y=None):
        """Return the expanded features as a new DataFrame with default labels."""
        expanded = self.poly.transform(X)
        return pd.DataFrame(expanded)

# Dimension Reduction

In [None]:
from sklearn.decomposition import PCA

# CustomPCA is a PCA wrapper that works with DataFrames and allows the researcher to specify the minimum explained_variance_ratio

class CustomPCA(TransformerMixin):
    """PCA wrapper that keeps only components above a minimum explained-variance ratio.

    Parameters
    ----------
    variance_threshold : float
        A component is kept when its explained_variance_ratio_ is strictly
        greater than this value.
    """

    def __init__(self, variance_threshold = 0.01):
        self.variance_threshold = variance_threshold

    def fit(self, X, y=None):
        """Fit a full PCA on `X` and count the components passing the threshold; returns self."""
        self.pca = PCA().fit(X)
        self.num_cols = (self.pca.explained_variance_ratio_ > self.variance_threshold).sum()
        return self

    def transform(self, X, y=None):
        """Project `X` and return the first `num_cols` components as a DataFrame."""
        components = self.pca.transform(X)
        # Keep exactly num_cols leading components. The previous
        # np.arange(0, num_cols + 1) selection kept one extra component and
        # raised KeyError when every component passed the threshold.
        # Assumes sklearn orders components by decreasing explained variance,
        # so the passing components are the leading ones - confirm against the PCA docs.
        return pd.DataFrame(components[:, :self.num_cols])

# Data Preparation

In [None]:
from sklearn.preprocessing import StandardScaler

# Standard Scaler wrapper...

class CustomScaler(TransformerMixin):
    """StandardScaler wrapper that hands back a DataFrame with the input's column names."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Fit a StandardScaler on `X`; returns self."""
        scaler = StandardScaler()
        scaler.fit(X)
        self.scaler = scaler
        return self

    def transform(self, X, y=None):
        """Scale `X` and rebuild a DataFrame using X's column labels (the index is not carried over)."""
        scaled = self.scaler.transform(X)
        return pd.DataFrame(scaled, columns=X.columns)

# Model Training

In [None]:
# This cell block results in a dataframe: df_working
# df_working will contain all the positively labeled examples, plus
# enough randomly chosen negatively labeled examples to attain TARGET_POSITIVE_PERCENT

# Class balance of the raw training data
num_positives = df_train["target"].sum()
percent_positive = num_positives / len(df_train)

# Number of negatives needed so that positives make up TARGET_POSITIVE_PERCENT
# (the small epsilon guards against a zero setting)
desired_negatives = (
    num_positives
    * (1 - SETTINGS["TARGET_POSITIVE_PERCENT"])
    / (SETTINGS["TARGET_POSITIVE_PERCENT"] + 0.00001)
)

# Per-row keep probability that yields that many negatives on average
additional_percent = desired_negatives / (1 - percent_positive) / len(df_train)
additional_ndxs = np.random.random(len(df_train)) <= additional_percent

# Keep every positive row plus the randomly selected rows
working_ndxs = (df_train["target"] == 1) | additional_ndxs
df_working = df_train.loc[working_ndxs].reset_index(drop=True)

In [None]:
# Now create list holding for each column type 

# Column names ending in "bin" / "cat" mark binary / categorical features
bin_cols = [col for col in df_working.columns if col.endswith("bin")]
cat_cols = [col for col in df_working.columns if col.endswith("cat")]
# Everything else, apart from bookkeeping columns, is treated as numeric
num_cols = [
    col for col in df_working.columns
    if not (col in ("id", "target", "index") or col in bin_cols or col in cat_cols)
]
feature_cols = bin_cols + cat_cols + num_cols

In [None]:
# And validate

L.log("Total # of working observations: " + str(df_working.shape[0]))
L.log("Count of features: " + str(len(feature_cols)))
L.log("Percent of working data with target: " + str(100 * df_working["target"].sum() / df_working.shape[0]) + "%")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

# Create a "pipeline" - not an sklearn Pipeline, but a custom, hacky one.

# The hand-rolled pipeline definition consumed by train():
#   "transformers" - (name, transformer) pairs, fitted and applied in this order
#   "models"       - (name, estimator) pairs, appended by the cells below
#   "scorers"      - (name, function(y_true, y_pred)) pairs used for logging
pipeline = {
    "transformers": [
        ("NumericImputer", NumericImputer(columns = num_cols)),
        ("CategoricalEncoder", CategoricalEncoder(columns = cat_cols + bin_cols)),
        ("Scaler", CustomScaler()),
        # Polynomial expansion is currently disabled:
#         ("PolyNomialExpansion", CustomPoly()),
        ("PCA", CustomPCA(variance_threshold = 0.0001))
    ],
    "models": [],
    "scorers": [
        ("Gini", gini_normalized),
        ("MSE", mean_squared_error)
    ]
}

In [None]:
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Setup a load of classifiers

# Shared hyperparameters, reused by the classifier definitions in the cells below
TREE_DEPTH = 7           # max_depth of the tree-based models
NUM_ESTIMATORS = 50      # n_estimators of the ensemble models
MAX_FEATURES = 'auto'    # max_features of the tree-based models
OOB_SCORE = True         # oob_score of the bagged ensembles
VERBOSITY = True         # verbose flag of the estimators that accept one
N_JOBS = 3               # n_jobs of the estimators that can run in parallel
RANDOM_STATE = 777       # random_state shared by all estimators
WARM_START = False       # warm_start of the estimators that accept it

In [None]:
pipeline["models"].append(("ExtraTreeClassifier", ExtraTreeClassifier(
    criterion='gini',
    splitter='random',
    max_depth=TREE_DEPTH,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=MAX_FEATURES,
    random_state=RANDOM_STATE,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None
)))

In [None]:
pipeline["models"].append(("DecisionTreeClassifier", DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=TREE_DEPTH,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=MAX_FEATURES,
    random_state=RANDOM_STATE,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort=False
)))

In [None]:
pipeline["models"].append(("MLPClassifier", MLPClassifier(
    hidden_layer_sizes=(32,8,2),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    power_t=0.5,
    max_iter=200,
    shuffle=True,
    random_state=RANDOM_STATE,
    tol=0.0001,
    verbose=VERBOSITY,
    warm_start=WARM_START,
    momentum=0.9,
    nesterovs_momentum=True,
    early_stopping=False,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08
)))

In [None]:
pipeline["models"].append(("GaussianNB", GaussianNB(
    priors=None
)))

pipeline["models"].append(("BernoulliNB", BernoulliNB(
    alpha=1.0,
    binarize=0.0,
    fit_prior=True,
    class_prior=None
)))

In [None]:
pipeline["models"].append(("SGDClassifier", SGDClassifier(
    loss='hinge',
    penalty='elasticnet',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=None,
    tol=None,
    shuffle=True,
    verbose=VERBOSITY,
    epsilon=0.1,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    class_weight=None,
    warm_start=WARM_START,
    average=False,
    n_iter=None
)))

In [None]:
pipeline["models"].append(("RidgeClassifier", RidgeClassifier(
    alpha=1.0,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    class_weight=None,
    solver='auto',
    random_state=RANDOM_STATE
)))

In [None]:
pipeline["models"].append(("PassiveAggressiveClassifier", PassiveAggressiveClassifier(
    C=1.0,
    fit_intercept=True,
    max_iter=None,
    tol=None,
    shuffle=True,
    verbose=VERBOSITY,
    loss='hinge',
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    warm_start=WARM_START,
    class_weight=None,
    average=False,
    n_iter=None
)))

In [None]:
pipeline["models"].append(("RandomForestClassifier", RandomForestClassifier(
    n_estimators=NUM_ESTIMATORS,
    criterion='gini',
    max_depth=TREE_DEPTH,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=MAX_FEATURES,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=OOB_SCORE,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    verbose=VERBOSITY,
    warm_start=WARM_START,
    class_weight=None
)))

In [None]:
pipeline["models"].append(("GradientBoostingClassifier", GradientBoostingClassifier(
    loss='deviance', # Choosing 'deviance' as the alternative ('exponential' is effectivey Ada)
    learning_rate=0.1,
    n_estimators=NUM_ESTIMATORS,
    subsample=1.0,
    criterion='friedman_mse',
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_depth=TREE_DEPTH,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    init=None,
    random_state=RANDOM_STATE,
    max_features=MAX_FEATURES,
    verbose=VERBOSITY,
    max_leaf_nodes=None,
    warm_start=WARM_START,
    presort='auto'
)))

In [None]:
pipeline["models"].append(("ExtraTreesClassifier", ExtraTreesClassifier(
    n_estimators=NUM_ESTIMATORS,
    criterion='gini',
    max_depth=TREE_DEPTH,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=MAX_FEATURES,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True, # changed from False to allow oob
    oob_score=OOB_SCORE,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    verbose=VERBOSITY,
    warm_start=WARM_START,
    class_weight=None
)))

In [None]:
# pipeline["models"].append(("AdaBoostClassifier", AdaBoostClassifier(
#     base_estimator=None,
#     n_estimators=NUM_ESTIMATORS,
#     learning_rate=1.0,
#     algorithm='SAMME.R',
#     random_state=RANDOM_STATE
# )))

# pipeline["models"].append(("RadiusNeighborsClassifier", RadiusNeighborsClassifier(
#     radius=1.0,
#     weights='uniform',
#     algorithm='auto',
#     leaf_size=30,
#     p=2,
#     metric='minkowski',
#     outlier_label=None,
#     metric_params=None
# )))
# pipeline["models"].append(("KNeighborsClassifier", KNeighborsClassifier(
#     n_neighbors=5,
#     weights='uniform',
#     algorithm='auto',
#     leaf_size=30,
#     p=2,
#     metric='minkowski',
#     metric_params=None,
#     n_jobs=N_JOBS
# )))
# pipeline["models"].append(("MultinomialNB", MultinomialNB(
#     alpha=1.0,
#     fit_prior=True,
#     class_prior=None
# )))

# pipeline["models"].append(("GaussianProcessClassifier", GaussianProcessClassifier(
#     kernel=None,
#     optimizer='fmin_l_bfgs_b',
#     n_restarts_optimizer=0,
#     max_iter_predict=100,
#     warm_start=WARM_START,
#     copy_X_train=True,
#     random_state=RANDOM_STATE,
#     multi_class='one_vs_rest',
#     n_jobs=N_JOBS
# )))

# pipeline["models"].append(("BaggingClassifier", BaggingClassifier(
#     base_estimator=None,
#     n_estimators=NUM_ESTIMATORS,
#     max_samples=1.0,
#     max_features=1.0,
#     bootstrap=True,
#     bootstrap_features=False,
#     oob_score=OOB_SCORE,
#     warm_start=WARM_START,
#     n_jobs=N_JOBS,
#     random_state=RANDOM_STATE,
#     verbose=VERBOSITY
# )))

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import copy

# Setup a few dataframes:
# predictions - used to hold all our different classifier predictions
# test_values - will ultimately be a new dataset holding each model's out-of-fold prediction along with a target column

# One row per submission id; model prediction columns are added by train()
predictions = pd.DataFrame(data = df_submit["id"].tolist(), columns=["id"])
# One row per working id (same order as df_working), one column per model plus the target
test_values = pd.DataFrame(index = df_working["id"].tolist(), columns=[model[0] for model in pipeline["models"]] + ["target"])
# Rows were created in df_working order, so assign the target positionally.
# Passing `.values` avoids both chained assignment and pandas aligning the
# target Series (indexed 0..n-1) against the id-based index, and gives the
# column a numeric dtype instead of object.
test_values["target"] = df_working["target"].values

# get_prediction_helper strips the probability from each classifier / regression

def get_prediction_helper(estimator, X):
    """Return a score per row of `X` from whichever scoring method `estimator` exposes.

    Preference order: decision_function(X), then the second column of
    predict_proba(X), then predict(X).
    """
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(X)
    if hasattr(estimator, "predict_proba"):
        # Column 1 of the probability matrix
        return estimator.predict_proba(X).T[1]
    return estimator.predict(X)

In [None]:
from sklearn.metrics import roc_curve, auc 

def plot_roc(clf, X_train, Y_train, Y_train_pred, X_test, Y_test, Y_test_pred):
    """Draw train and test ROC curves (with their AUC in the legend) on one figure.

    Only the label/prediction arguments are used; `clf`, `X_train` and
    `X_test` are accepted but not read.
    """
    line_width = 2
    curve_specs = [
        ('Train ROC curve (area = %0.2f)', 'darkorange', Y_train, Y_train_pred),
        ('Test ROC curve (area = %0.2f)', 'darkred', Y_test, Y_test_pred),
    ]

    # Compute both curves before touching matplotlib
    curves = []
    for label_template, color, y_true, y_score in curve_specs:
        fpr, tpr, _thresholds = roc_curve(y_true, y_score)
        curves.append((label_template % auc(fpr, tpr), color, fpr, tpr))

    plt.figure(figsize=(5,5))
    for label, color, fpr, tpr in curves:
        plt.plot(fpr, tpr, color=color, lw=line_width, label=label)

    # Chance diagonal
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    ticks = np.arange(0.0, 1.05, 0.1)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xticks(ticks)
    plt.yticks(ticks)
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Setup a k-fold loop
def train():
    """Run the k-fold loop over the global pipeline.

    For each of SETTINGS["SPLIT_COUNT"] folds of df_working:
      1. deep-copies the pipeline's transformers and models, so every fold
         starts from unfitted objects;
      2. fits each transformer on the training part only and applies it to the
         train, test and (when SETTINGS["SUBMITTING"]) submission features;
      3. fits each model, logs every scorer on train and test predictions and
         plots the ROC curves;
      4. stores the min-max scaled test predictions in the global
         `test_values` (column = model name, rows = the fold's test ids);
      5. when submitting, stores min-max scaled submission predictions,
         weighted by the fold's test Gini, in the global `predictions` under
         "<model name>_<fold>".

    Returns None; results are written to the globals `test_values` and
    `predictions` and to the global logger `L`.
    """
    # Seeded so the fold assignment is reproducible between runs.
    kf = KFold(n_splits=SETTINGS["SPLIT_COUNT"], shuffle=True, random_state=RANDOM_STATE)

    # Per-fold copies of the pipeline pieces
    models = []
    transformers = []

    for iteration, (train_index, test_index) in enumerate(kf.split(df_working)):
        models.append(copy.deepcopy(pipeline["models"]))
        transformers.append(copy.deepcopy(pipeline["transformers"]))

        print("KFold iteration: " + str(iteration))
        train_X = df_working[feature_cols].iloc[train_index]
        train_Y = df_working["target"].iloc[train_index]
        test_X = df_working[feature_cols].iloc[test_index]
        test_Y = df_working["target"].iloc[test_index]
        test_ids = df_working["id"].iloc[test_index]
        submit_X = df_submit[feature_cols]

        for transformer_name, transformer in transformers[iteration]:
            print("\tTransformer: " + transformer_name)
            print("\t\tFitting")
            transformer.fit(train_X)
            print("\t\tTransforming Train")
            train_X = transformer.transform(train_X)
            print("\t\tTransforming Test")
            test_X = transformer.transform(test_X)
            if SETTINGS["SUBMITTING"]:
                print("\t\tTransforming Submission")
                submit_X = transformer.transform(submit_X)

        for base_name, model in models[iteration]:
            model_name = base_name + "_" + str(iteration)

            L.log("\tModel: " + model_name)
            print("\t\tFitting")
            model.fit(train_X, train_Y)
            print("\t\tPredicting Train")
            train_pred_Y = get_prediction_helper(model, train_X)
            for scorer_name, scorer in pipeline["scorers"]:
                L.log("\t\t\tTrain - " + scorer_name + ": " + str(scorer(train_Y, train_pred_Y)))
            print("\t\tPredicting Test")
            test_pred_Y = get_prediction_helper(model, test_X)
            for scorer_name, scorer in pipeline["scorers"]:
                L.log("\t\t\tTest - " + scorer_name + ": " + str(scorer(test_Y, test_pred_Y)))

            # Single .loc write instead of chained test_values[col][ids] = ...,
            # which may assign into a temporary copy and be lost.
            test_values.loc[test_ids.values, base_name] = (
                MinMaxScaler().fit_transform(test_pred_Y.reshape(-1, 1)).ravel()
            )

            plot_roc(model, train_X, train_Y, train_pred_Y, test_X, test_Y, test_pred_Y)

            if SETTINGS["SUBMITTING"]:
                print("\t\tPredicting Submission")
                submit_Y = MinMaxScaler().fit_transform(get_prediction_helper(model, submit_X).reshape(-1, 1))
                # Weight the fold's predictions by how well it ranked its test split
                prediction_attempt = submit_Y * gini_normalized(test_Y, test_pred_Y)
                # Flatten the (n, 1) array so a plain 1-D column is stored.
                predictions[model_name] = prediction_attempt.ravel()

In [None]:
train()

In [None]:
from sklearn.model_selection import train_test_split
# At this point, we have a table -> test values that holds the predicted values for each model (when testing) and a target. Time to get Meta!!

# Now we train a simple classifier on top of this hyper table

# Meta model: a ridge classifier stacked on the per-model out-of-fold predictions
clf = RidgeClassifier(class_weight="balanced")


meta_columns = [col for col in test_values.columns if col != "target"]
# test_values was allocated without data, so its columns start out as object
# dtype; cast explicitly so sklearn receives numeric features and integer
# labels. The casts fail loudly if a prediction or the target was never filled in.
X = test_values[meta_columns].astype(float)
Y = test_values["target"].astype(int)

# Seeded so the hold-out split (and the logged Gini) is reproducible
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y, test_size=0.33, random_state=RANDOM_STATE)
L.log("Training Meta Model")
clf.fit(X_train, Y_train)
probs = get_prediction_helper(clf, X_test)
# Rescale the raw scores to [0, 1] before scoring
probs = MinMaxScaler().fit_transform(probs.reshape(-1, 1)).T[0].tolist()
gini_score = gini_normalized(Y_test, probs)
L.log("\tGini - " + str(gini_score))

In [None]:
# If we are submitting, do the same for the prediction table

if SETTINGS["SUBMITTING"]:
    meta_model = pd.DataFrame(data = df_submit["id"].tolist(), columns=["id"])
    # Per-fold prediction columns written by train(): "<model>_<fold>"
    fold_columns = []
    for meta in meta_columns:
        raw_labels = [meta + "_" + str(num) for num in range(0, SETTINGS["SPLIT_COUNT"])]
        fold_columns.extend(raw_labels)
        # Mean of the model's predictions over all folds. The divisor is the
        # number of folds summed (previously SPLIT_COUNT - 1); skipna=False
        # keeps a missing fold visible as NaN instead of silently shrinking the sum.
        meta_model[meta] = predictions[raw_labels].sum(axis=1, skipna=False) / SETTINGS["SPLIT_COUNT"]

    # Blend only the per-fold model columns, so re-running this cell does not
    # fold earlier "mean_prediction"/"meta_prediction" columns back into the sum.
    # `.values` is needed because Series.reshape no longer exists in modern
    # pandas; `.ravel()` stores a plain 1-D column.
    predictions["mean_prediction"] = MinMaxScaler(feature_range=(0.00001, 0.99999)).fit_transform(
        predictions[fold_columns].sum(axis=1).values.reshape(-1, 1)
    ).ravel()

    # Score the averaged per-model predictions with the meta classifier
    X = meta_model[meta_columns]
    probs = get_prediction_helper(clf, X)
    predictions["meta_prediction"] = MinMaxScaler(feature_range=(0.00001, 0.99999)).fit_transform(
        probs.reshape(-1, 1)
    ).ravel()

    # Sorted curve of the final scores, as a quick sanity check of their distribution
    predictions["meta_prediction"].sort_values().reset_index(drop=True).plot()

# Kaggle Prediction

In [None]:
# Record the attempt

# The prediction columns below only exist when the pipeline ran in submission
# mode, so skip recording during a dry run instead of failing with a KeyError.
if SETTINGS["SUBMITTING"]:
    record_attempt(["mean_prediction", "meta_prediction", "RandomForestClassifier_0"], predictions, pipeline)
else:
    print("SETTINGS['SUBMITTING'] is False - nothing recorded")

In [None]:
# Look at the data

predictions