In [39]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer, Imputer
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml, PMMLPipeline
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.preprocessing import ExpressionTransformer
from sklearn_pandas import DataFrameMapper

In [8]:
# Execute the project's helper scripts. After a %run finishes, the names the
# script defined at top level are loaded into this notebook's namespace.
# NOTE(review): presumably these scripts are the source of `path`, `fin_mod_df`,
# `trend_vars`, `to_logs`, `get_x_y` and `proba_scorer` used in later cells —
# confirm against the scripts themselves.
%run '../code/constants.py'
%run '../code/feature_selection.py'

In [9]:
# Load the modelling DataFrame from the pickle located at `path + fin_mod_df`.
# Both names are defined outside this cell (presumably in constants.py — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(path + fin_mod_df)

In [10]:
# Split the loaded frame into the feature matrix and the target.
X, y = get_x_y(df)

# Every feature column that is neither a trend variable nor a log-transformed
# variable, kept in the order in which it appears in X.
non_trans_vars = X.columns[~X.columns.isin(trend_vars + to_logs)].tolist()

# Reorder the columns: trend variables, then log variables, then the rest.
X = X[trend_vars + to_logs + non_trans_vars]

In [71]:
# Build the feature matrix from two column-wise mappers; FeatureUnion
# concatenates their outputs side by side.
featureU = FeatureUnion(transformer_list=[
    (
        'transformations',
        DataFrameMapper(features=[
            # First selected trend column minus the second one.
            (trend_vars, ExpressionTransformer("X[:, 0] - X[:, 1]")),
            # Median-impute the `to_logs` columns, then apply log(1 + x).
            (to_logs, make_pipeline(
                Imputer(strategy='median'),
                FunctionTransformer(np.log1p)
            )),
        ]),
    ),
    (
        'identity',
        # Remaining columns go through a ContinuousDomain decorator only.
        DataFrameMapper(features=[(non_trans_vars, ContinuousDomain())]),
    ),
])

In [93]:
# End-to-end model wrapped in a PMMLPipeline so it can be exported to PMML:
# feature assembly -> median imputation -> standardisation -> L2-regularised
# logistic regression. LogisticRegression is provided by sklearn.linear_model.
# Earlier experiments (PolynomialFeatures interactions, an elastic-net
# SGDClassifier) are no longer part of the pipeline.
pl = PMMLPipeline([
    ('featureUnion', featureU),
    # Only the `to_logs` branch of featureU imputes on its own; this step fills
    # whatever NaNs remain in the other columns.
    ('impute', Imputer(strategy='median')),
    ('standardize', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', max_iter=500, C=0.8)),
])

In [94]:
# Fit all pipeline stages on X and y. The bare expression makes the notebook
# display the fitted pipeline's repr as the cell output.
pl.fit(X, y)

PMMLPipeline(steps=[('featureUnion', FeatureUnion(n_jobs=1,
       transformer_list=[('transformations', DataFrameMapper(default=False, df_out=False,
        features=[(['pmml_variables.cbb_num_closures_one_year_ago', 'third_party.clarity_cbb.clear-bank-behavior.fis-chex-advisor.number-closures.three-years-ago'], ExpressionTransformer(expr='X[:, 0] - X[:, 1]')), ([...t='as_is', with_data=True,
         with_statistics=True))],
        input_df=False, sparse=False))],
       transformer_weights=None)),
       ('impute', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
       ('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)),
       ('clf', LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [95]:
# Predict on the same X the pipeline was fitted on, so the printed score is an
# in-sample figure. `proba_scorer` is a helper defined outside this notebook;
# it is handed the true labels, the predicted probabilities and roc_auc_score.
train_proba = pl.predict_proba(X)
print(proba_scorer(y, train_proba, roc_auc_score))

0.6236740052378797


In [96]:
# Persist the fitted pipeline to disk with joblib.
# NOTE(review): the recorded output below names '../outputs/CBB_model_v2.pkl',
# not this path, so that output appears to come from an earlier run — re-run
# the cell to refresh it.
joblib.dump(pl, '../outputs/pmml_mod.pkl')

['../outputs/CBB_model_v2.pkl']

In [98]:
# Export the fitted PMMLPipeline to a PMML file.
# NOTE(review): with_repr=True presumably embeds the pipeline's string
# representation in the generated document — confirm in the sklearn2pmml docs.
sklearn2pmml(pl, '../outputs/pmml_file.pmml', with_repr=True)

Done!