In [2]:
import boto3
import io
import sagemaker
# from sagemaker import Session
# from sagemaker import estimator
from sagemaker.amazon.common import write_numpy_to_dense_tensor
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role

import pickle, gzip, numpy, urllib.request, json

import psycopg2
import getpass
import sys
from os.path import expanduser
import os
import re

import boto3

import subprocess # Use subprocess to execute aws command line

import datetime
import time

import pandas as pd
from pandas import Series, DataFrame
import numpy as np

from collections import defaultdict

#sklearn
from sklearn import preprocessing
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from functools import reduce
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
# ROC Curve with logistic regression
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import tree

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools

# Make plotly work with Jupyter notebook
init_notebook_mode(connected=True)

import seaborn as sns

from IPython.display import Markdown as md
from IPython.display import Markdown, display

# hide warning messages
import warnings
warnings.filterwarnings('ignore')

  """)


In [3]:
role = get_execution_role()

s3 = boto3.client('s3')
sagemaker = boto3.client('sagemaker')
runtime = boto3.client('runtime.sagemaker')

In [4]:
# function for markdown print
def printmd(string):
    display(Markdown(string))

# Classes to be used in pipeline

In [5]:
class DFFunctionTransformer(TransformerMixin, BaseEstimator):
    # FunctionTransformer but for pandas DataFrames

    def __init__(self, *args, **kwargs):
        self.ft = FunctionTransformer(*args, **kwargs)

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        Xt = self.ft.transform(X)
        Xt = pd.DataFrame(Xt, index=X.index, columns=X.columns)
        return Xt


class DFFeatureUnion(TransformerMixin, BaseEstimator):
    # FeatureUnion but for pandas DataFrames

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        for (name, t) in self.transformer_list:
            t.fit(X, y)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xts = [t.transform(X) for _, t in self.transformer_list]
        Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xts)
        return Xunion


class DFImputer(TransformerMixin, BaseEstimator):
    # Imputer but for pandas DataFrames

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        self.imp = Imputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Ximp = self.imp.transform(X)
        Xfilled = pd.DataFrame(Ximp, index=X.index, columns=X.columns)
        return Xfilled


class DFStandardScaler(TransformerMixin, BaseEstimator):
    # StandardScaler but for pandas DataFrames

    def __init__(self):
        self.ss = None
        self.mean_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X)
        self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
        self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xss = self.ss.transform(X)
        Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
        return Xscaled


class DFRobustScaler(TransformerMixin, BaseEstimator):
    # RobustScaler but for pandas DataFrames

    def __init__(self):
        self.rs = None
        self.center_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        self.rs = RobustScaler()
        self.rs.fit(X)
        self.center_ = pd.Series(self.rs.center_, index=X.columns)
        self.scale_ = pd.Series(self.rs.scale_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xrs = self.rs.transform(X)
        Xscaled = pd.DataFrame(Xrs, index=X.index, columns=X.columns)
        return Xscaled


class ColumnExtractor(TransformerMixin, BaseEstimator):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xcols = X[self.cols]
        return Xcols


class ZeroFillTransformer(TransformerMixin, BaseEstimator):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xz = X.fillna(value=0)
        return Xz


class Log1pTransformer(TransformerMixin, BaseEstimator):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xlog = np.log1p(X)
        return Xlog


class DateFormatter(TransformerMixin, BaseEstimator):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xdate = X.apply(pd.to_datetime)
        return Xdate


class DateDiffer(TransformerMixin, BaseEstimator):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        beg_cols = X.columns[:-1]
        end_cols = X.columns[1:]
        Xbeg = X[beg_cols].as_matrix()
        Xend = X[end_cols].as_matrix()
        Xd = (Xend - Xbeg) / np.timedelta64(1, 'D')
        diff_cols = ['->'.join(pair) for pair in zip(beg_cols, end_cols)]
        Xdiff = pd.DataFrame(Xd, index=X.index, columns=diff_cols)
        return Xdiff


class DummyTransformer(TransformerMixin, BaseEstimator):

    def __init__(self):
        self.dv = None

    def fit(self, X, y=None):
        # assumes all columns of X are strings
        Xdict = X.to_dict('records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(Xdict)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xdict = X.to_dict('records')
        Xt = self.dv.transform(Xdict)
        cols = self.dv.get_feature_names()
        Xdum = pd.DataFrame(Xt, index=X.index, columns=cols)
        # drop column indicating NaNs
        nan_cols = [c for c in cols if '=' not in c]
        Xdum = Xdum.drop(nan_cols, axis=1)
        return Xdum


class MultiEncoder(TransformerMixin, BaseEstimator):
    # Multiple-column MultiLabelBinarizer for pandas DataFrames

    def __init__(self, sep=','):
        self.sep = sep
        self.mlbs = None

    def _col_transform(self, x, mlb):
        cols = [''.join([x.name, '=', c]) for c in mlb.classes_]
        xmlb = mlb.transform(x)
        xdf = pd.DataFrame(xmlb, index=x.index, columns=cols)
        return xdf

    def fit(self, X, y=None):
        Xsplit = X.applymap(lambda x: x.split(self.sep))
        self.mlbs = [MultiLabelBinarizer().fit(Xsplit[c]) for c in X.columns]
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xsplit = X.applymap(lambda x: x.split(self.sep))
        Xmlbs = [self._col_transform(Xsplit[c], self.mlbs[i])
                 for i, c in enumerate(X.columns)]
        Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xmlbs)
        return Xunion


class StringTransformer(TransformerMixin, BaseEstimator):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xstr = X.applymap(str)
        return Xstr


class ClipTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, a_min, a_max):
        self.a_min = a_min
        self.a_max = a_max

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xclip = np.clip(X, self.a_min, self.a_max)
        return Xclip


class AddConstantTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, c=1):
        self.c = c

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xc = X + self.c
        return Xc


class MultiColumnLabelEncoder(TransformerMixin, BaseEstimator):  
    """Transformer for applying label encoder on multiple columns.

    This transformer applies label encoding to columns in a dataset.
    """

    def __init__(self):
        self.d = defaultdict(LabelEncoder)

    def transform(self, X, **transform_params):
        """Transforms X to have columns label encoded.

        Args:
            X (obj): The dataset to transform. Can be dataframe or matrix.
            transform_params (kwargs, optional): Additional params.

        Returns:
            The transformed dataset with the label encoded columns.
        """
        X = X.fillna('NaN')  # fill null values with 'NaN'
        transformed = X.apply(lambda x: self.d[x.name].transform(x))
        return transformed

    def fit(self, X, y=None, **fit_params):
        """Fits transfomer over X.

        Needs to apply fit over the defaultdict so as to retain the
        label classes when transforming.
        """
        X = X.fillna('NaN')  # fill null values with 'NaN'
        X.apply(lambda x: self.d[x.name].fit(x))
        return self

# Loading the data

In [6]:
bucket = 'jornaya-ds-us-east-1-sagemaker'
train_key = 'amiao/sagemaker_poc/lincoln_tech/raw_data/lincoln_tech_enroll_train_60.csv'
train_data = s3.get_object(Bucket = bucket, Key = train_key)

val_key = 'amiao/sagemaker_poc/lincoln_tech/raw_data/lincoln_tech_enroll_val_40.csv'
val_data = s3.get_object(Bucket = bucket, Key = val_key)

train = pd.read_csv(io.BytesIO(train_data['Body'].read()))
val = pd.read_csv(io.BytesIO(val_data['Body'].read()))
full = train.append(val)

# Let's drop 'token' for now since it will not be used in our modeling process
full_orig = full.copy()
full = full.drop('token', axis=1)
full = full.reset_index(drop=True)

# Split the data

In [7]:
X = train.drop('lincoln_tech_predict_score', axis=1)
y = train['lincoln_tech_predict_score'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.4 , shuffle=False) #
# full_pipeline.fit(X_train, y_train)

In [8]:
# X.head()

In [9]:
# numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
# numeric_cols

In [10]:
OUTCOME = 'lincoln_tech_predict_score'
# NEAR_UNIQUE_FEATS = ['name_of_event', 'year_month_app', 'organization']
# DATE_FEATS = ['application_date', 'event_start_date', 'event_end_date']

DUMMY_FEATS = [':degree_level_sought']

LABELENCODER_FEATS = ['mobile_device_type_hashed'
                      , 'DeviceType'
                      , 'absent_start', 'call_center', 'has_email', 'has_phone', 'is_mobile_device']#'Browser', 'Platform', , 'url_domain', 'geoip_postal_code', ':high_school_graduation_year'

NUM_FEATS = [ 
 'consumer_five_minutes',
 'consumer_hour',
 'consumer_twelve_hours',
 'consumer_day',
 'consumer_week',
 'device_five_minutes',
 'device_hour',
 'device_twelve_hours',
 'device_day',
 'device_week',
 'fields_changed',
 'fields_interacted',
 'ip_five_minutes',
 'ip_hour',
 'ip_twelve_hours',
 'ip_day',
 'ip_week',
 'lead_age',
 'lead_duration',
 'pct_field_changes',
 'pct_field_interactions',
 'pct_fields_changed',
 'pct_fields_interacted',
 'total_entities',
 'device_count_30_days',
 'device_count_60_days',
 'email_count_30_days',
 'email_count_60_days',
 'lead_five_minutes',
 'lead_hour',
 'lead_twelve_hours',
 'lead_day',
 'lead_week']

In [11]:
filtered_cols = DUMMY_FEATS + LABELENCODER_FEATS + NUM_FEATS
len(filtered_cols)

41

In [12]:
# Preprocessing with a Pipeline
preprocessing_pipeline = Pipeline([
    ('features', DFFeatureUnion([

        ('categoricals', Pipeline([
            ('extract', ColumnExtractor(DUMMY_FEATS)),
            ('dummy', DummyTransformer())
        ])),
        ('multi_labels', Pipeline([
            ('extract', ColumnExtractor(LABELENCODER_FEATS)),
            ('transform_to_string', StringTransformer()),
            ('labencode', MultiColumnLabelEncoder()),
        ])),
        ('numerics', Pipeline([
            ('extract', ColumnExtractor(NUM_FEATS)),
            ('fill_with_mean', DFImputer(strategy='mean'))
#             ('log', Log1pTransformer())
        ]))
    ])),
    ('scale', DFStandardScaler())
])


In [35]:
X_train.head()

Unnamed: 0,token,created,email,provider_id,absent_pages,absent_start,Browser,Platform,call_center,consumer_five_minutes,...,lead_hour,lead_twelve_hours,lead_day,lead_week,url_domain,DeviceType,geoip_continent_code,geoip_postal_code,:high_school_graduation_year,:degree_level_sought
0,403E5217-C4C7-614A-D44F-CBA46F07C2C8,1432073097,jennifer.1509@yahoo.com,B4ECACA4-26E5-11E1-AC55-12313D0892A8,0,False,,,False,1,...,0,0,0,0,cna-training-centers.com,,,60641.0,2012,Diploma
1,E4E20615-AED8-9AEE-9BCE-58C90FE9AFD5,1432074747,sunnydays6619@gmail.com,,0,False,,,False,1,...,0,0,0,0,lm.careercourses.us,,,,1982,Certificate
2,0493A168-35AD-A39A-EB2F-CF495C60132F,1432076425,julia.mitchell71@yahoo.com,FEB3AAB8-5368-11E1-8A6C-12313D0892A8,0,False,,,False,1,...,0,0,0,0,comparetopschools.com,,,,1981,Associate
3,A8B56CDB-E235-CBBF-ABE1-B5D6AC4B3B96,1432076640,roselabrie95@yahoo.com,B4ECACA4-26E5-11E1-AC55-12313D0892A8,0,False,,,False,2,...,1,1,1,1,offers.degreesearch.org,,,,2013,Bachelors
4,286604F4-C9F1-0F35-0CFD-25B97B9B0E35,1432077714,lovelykiki89@hotmail.com,B4ECACA4-26E5-11E1-AC55-12313D0892A8,0,True,,,False,1,...,0,0,0,0,practicalnursing.org,,,10469.0,2006,Diploma


In [13]:
df = preprocessing_pipeline.fit_transform(X_train)

In [14]:
df.head()

Unnamed: 0,:degree_level_sought=Associate,:degree_level_sought=Bachelors,:degree_level_sought=Certificate,:degree_level_sought=Diploma,:degree_level_sought=Not Specified,mobile_device_type_hashed,DeviceType,absent_start,call_center,has_email,...,total_entities,device_count_30_days,device_count_60_days,email_count_30_days,email_count_60_days,lead_five_minutes,lead_hour,lead_twelve_hours,lead_day,lead_week
0,-0.173219,-0.068959,-0.279283,0.364062,-0.10454,0.077477,2.036975,-0.535813,-0.260474,0.196849,...,-0.477869,-0.404954,-0.436649,-0.5337717,-0.6099706,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302
1,-0.173219,-0.068959,3.580595,-2.746788,-0.10454,0.077477,2.036975,-0.535813,-0.260474,0.196849,...,0.0,-0.404954,-0.436649,-0.5337717,-0.6099706,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302
2,5.773035,-0.068959,-0.279283,-2.746788,-0.10454,-1.509166,2.036975,-0.535813,-0.260474,0.196849,...,1.084261,-0.404954,-0.436649,-0.2567257,0.2534542,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302
3,-0.173219,14.501304,-0.279283,-2.746788,-0.10454,0.870798,2.036975,-0.535813,-0.260474,0.196849,...,2.646391,-0.180579,-0.242591,6.151657e-17,-9.58594e-17,2.958304,2.841914,2.83426,2.832001,2.826633
4,-0.173219,-0.068959,-0.279283,0.364062,-0.10454,0.870798,2.036975,1.865948,-0.260474,0.196849,...,-0.477869,-0.404954,-0.436649,-0.5337717,-0.6099706,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302


In [15]:
len(df.columns)

45

In [16]:
df.head()

Unnamed: 0,:degree_level_sought=Associate,:degree_level_sought=Bachelors,:degree_level_sought=Certificate,:degree_level_sought=Diploma,:degree_level_sought=Not Specified,mobile_device_type_hashed,DeviceType,absent_start,call_center,has_email,...,total_entities,device_count_30_days,device_count_60_days,email_count_30_days,email_count_60_days,lead_five_minutes,lead_hour,lead_twelve_hours,lead_day,lead_week
0,-0.173219,-0.068959,-0.279283,0.364062,-0.10454,0.077477,2.036975,-0.535813,-0.260474,0.196849,...,-0.477869,-0.404954,-0.436649,-0.5337717,-0.6099706,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302
1,-0.173219,-0.068959,3.580595,-2.746788,-0.10454,0.077477,2.036975,-0.535813,-0.260474,0.196849,...,0.0,-0.404954,-0.436649,-0.5337717,-0.6099706,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302
2,5.773035,-0.068959,-0.279283,-2.746788,-0.10454,-1.509166,2.036975,-0.535813,-0.260474,0.196849,...,1.084261,-0.404954,-0.436649,-0.2567257,0.2534542,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302
3,-0.173219,14.501304,-0.279283,-2.746788,-0.10454,0.870798,2.036975,-0.535813,-0.260474,0.196849,...,2.646391,-0.180579,-0.242591,6.151657e-17,-9.58594e-17,2.958304,2.841914,2.83426,2.832001,2.826633
4,-0.173219,-0.068959,-0.279283,0.364062,-0.10454,0.870798,2.036975,1.865948,-0.260474,0.196849,...,-0.477869,-0.404954,-0.436649,-0.5337717,-0.6099706,-0.263867,-0.265366,-0.265776,-0.266097,-0.266302


# Distribute variables into four categories

# Building Pipeline

## Pipeline with Logistic Regression

In [17]:
# Preprocessing with a Pipeline
lr_pipeline = Pipeline([
    ('features', DFFeatureUnion([
#         ('dates', Pipeline([
#             ('extract', ColumnExtractor(DATE_FEATS)),
#             ('to_date', DateFormatter()),
#             ('diffs', DateDiffer()),
#             ('mid_fill', DFImputer(strategy='median'))
#         ])),
        ('categoricals', Pipeline([
            ('extract', ColumnExtractor(DUMMY_FEATS)),
            ('dummy', DummyTransformer())
        ])),
        ('multi_labels', Pipeline([
            ('extract', ColumnExtractor(LABELENCODER_FEATS)),
            ('transform_to_string', StringTransformer()),
            ('labencode', MultiColumnLabelEncoder()),
        ])),
        ('numerics', Pipeline([
            ('extract', ColumnExtractor(NUM_FEATS)),
            ('fill_with_mean', DFImputer(strategy='mean'))
#             ('log', Log1pTransformer())
        ]))
    ])),
    ('scale', DFStandardScaler()),
    ('lr_clf', LogisticRegression())
])


## Train the model using pipeline

In [18]:
lr_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', DFFeatureUnion(transformer_list=[('categoricals', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=[':degree_level_sought'])), ('dummy', DummyTransformer())])), ('multi_labels', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=['mobile_device_t...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

## Make predictions

In [19]:
y_pred = lr_pipeline.predict_proba(X_val)[:,1]

In [20]:
fpr_val, tpr_val, thresholds_val = roc_curve(y_val, y_pred)
roc_auc_score(y_val, y_pred)

0.5858486102626055

# Pipeline with AdaBoost

In [21]:
# Preprocessing with a Pipeline
ab_pipeline = Pipeline([
    ('features', DFFeatureUnion([
#         ('dates', Pipeline([
#             ('extract', ColumnExtractor(DATE_FEATS)),
#             ('to_date', DateFormatter()),
#             ('diffs', DateDiffer()),
#             ('mid_fill', DFImputer(strategy='median'))
#         ])),
        ('categoricals', Pipeline([
            ('extract', ColumnExtractor(DUMMY_FEATS)),
            ('dummy', DummyTransformer())
        ])),
        ('multi_labels', Pipeline([
            ('extract', ColumnExtractor(LABELENCODER_FEATS)),
            ('transform_to_string', StringTransformer()),
            ('labencode', MultiColumnLabelEncoder()),
        ])),
        ('numerics', Pipeline([
            ('extract', ColumnExtractor(NUM_FEATS)),
            ('fill_with_mean', DFImputer(strategy='mean'))
#             ('log', Log1pTransformer())
        ]))
    ])),
    ('scale', DFStandardScaler()),
    ('adaboost', AdaBoostClassifier(n_estimators=50,
                         learning_rate=0.2,
                         random_state=0))
])


In [22]:
ab_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', DFFeatureUnion(transformer_list=[('categoricals', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=[':degree_level_sought'])), ('dummy', DummyTransformer())])), ('multi_labels', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=['mobile_device_t...ithm='SAMME.R', base_estimator=None,
          learning_rate=0.2, n_estimators=50, random_state=0))])

In [23]:
y_pred = ab_pipeline.predict_proba(X_val)[:,1]

In [24]:
fpr_val, tpr_val, thresholds_val = roc_curve(y_val, y_pred)
roc_auc_score(y_val, y_pred)

0.6028906145277991

# Pipeline with Gradient Boosting

In [25]:
# Preprocessing with a Pipeline
gb_pipeline = Pipeline([
    ('features', DFFeatureUnion([
#         ('dates', Pipeline([
#             ('extract', ColumnExtractor(DATE_FEATS)),
#             ('to_date', DateFormatter()),
#             ('diffs', DateDiffer()),
#             ('mid_fill', DFImputer(strategy='median'))
#         ])),
        ('categoricals', Pipeline([
            ('extract', ColumnExtractor(DUMMY_FEATS)),
            ('dummy', DummyTransformer())
        ])),
        ('multi_labels', Pipeline([
            ('extract', ColumnExtractor(LABELENCODER_FEATS)),
            ('transform_to_string', StringTransformer()),
            ('labencode', MultiColumnLabelEncoder()),
        ])),
        ('numerics', Pipeline([
            ('extract', ColumnExtractor(NUM_FEATS)),
            ('fill_with_mean', DFImputer(strategy='mean'))
#             ('log', Log1pTransformer())
        ]))
    ])),
    ('scale', DFStandardScaler()),
    ('gb', GradientBoostingClassifier())
])


In [26]:
gb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', DFFeatureUnion(transformer_list=[('categoricals', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=[':degree_level_sought'])), ('dummy', DummyTransformer())])), ('multi_labels', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=['mobile_device_t...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])

In [27]:
y_pred = gb_pipeline.predict_proba(X_val)[:,1]

In [28]:
fpr_val, tpr_val, thresholds_val = roc_curve(y_val, y_pred)
roc_auc_score(y_val, y_pred)

0.610964771437985

# Pipeline with Gradient Boosting

In [29]:
# Preprocessing with a Pipeline
gd_pipeline = Pipeline([
    ('features', DFFeatureUnion([
#         ('dates', Pipeline([
#             ('extract', ColumnExtractor(DATE_FEATS)),
#             ('to_date', DateFormatter()),
#             ('diffs', DateDiffer()),
#             ('mid_fill', DFImputer(strategy='median'))
#         ])),
        ('categoricals', Pipeline([
            ('extract', ColumnExtractor(DUMMY_FEATS)),
            ('dummy', DummyTransformer())
        ])),
        ('multi_labels', Pipeline([
            ('extract', ColumnExtractor(LABELENCODER_FEATS)),
            ('transform_to_string', StringTransformer()),
            ('labencode', MultiColumnLabelEncoder()),
        ])),
        ('numerics', Pipeline([
            ('extract', ColumnExtractor(NUM_FEATS)),
            ('fill_with_mean', DFImputer(strategy='mean'))
#             ('log', Log1pTransformer())
        ]))
    ])),
    ('scale', DFStandardScaler()),
    ('gd', GradientBoostingClassifier())
])


## Grid Search

In [30]:
# param_grid = {
#     'pca__n_components': [5, 20, 30, 40, 50, 64],
#     'logistic__alpha': np.logspace(-4, 4, 5),
# }

In [31]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.1, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5),
    "min_samples_leaf": np.linspace(0.1, 0.5),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 1.0],
    "n_estimators":[10]
    }

In [32]:
cv = GridSearchCV(gd_pipeline, parameters)

In [33]:
# print(cv.best_score_)    
# print(cv.best_params_)  

In [34]:
cv.fit(X, y)

ValueError: Invalid parameter criterion for estimator Pipeline(memory=None,
     steps=[('features', DFFeatureUnion(transformer_list=[('categoricals', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=[':degree_level_sought'])), ('dummy', DummyTransformer())])), ('multi_labels', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(cols=['mobile_device_t...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
y_pred = cv.predict_proba(X_val)[:,1]

In [None]:
fpr_val, tpr_val, thresholds_val = roc_curve(y_val, y_pred)
roc_auc_score(y_val, y_pred)