In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd, numpy as np

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [0]:
# Read the Dataiku dataset named 'train' into a Pandas dataframe
train_dataset = dataiku.Dataset("train")
df = train_dataset.get_dataframe()

In [0]:
#-----------------------------------------------------------------
# Dataset Settings
#-----------------------------------------------------------------

# Columns used for modelling: the label to predict, plus the numerical and
# categorical feature subsets that the preprocessing cells select by name.
SCHEMA = dict(
    target='high_value',
    features_num=['age', 'price_first_item_purchased', 'pages_visited'],
    features_cat=['gender', 'campaign'],
)

In [0]:
#-----------------------------------------------------------------
# Preprocessing on Training Set
#-----------------------------------------------------------------
# Fits the numerical and categorical transformers on the training data and
# assembles the feature matrix X and label vector Y. The fitted `trf_num`
# and `trf_cat` objects are reused (transform only) on the test set.

# Numerical variables: mean-impute missing values, then standardise
df_num = df[SCHEMA['features_num']]

trf_num = Pipeline([
    ('imp', SimpleImputer(strategy='mean')),
    ('sts', StandardScaler()),
])

x_num = trf_num.fit_transform(df_num)

# Categorical variables
# Take an explicit copy: assigning columns into a bare column-subset
# selection is the chained-assignment pattern that triggers pandas'
# SettingWithCopyWarning.
df_cat = df[SCHEMA['features_cat']].copy()
features = df_cat.columns

# DictVectorizer one-hot encodes string values but passes numeric values
# through as plain numbers, so non-object columns are cast to str first.
# NOTE(review): a NaN inside an object-dtype column is not cast here and
# would presumably reach DictVectorizer as a float -- confirm these
# columns have no missing values.
for feature in features:
    if df_cat[feature].dtype != 'object':
        df_cat[feature] = df_cat[feature].astype(str)

# One dict per row, keyed by column name
data = df_cat.to_dict(orient='records')

trf_cat = DictVectorizer(sparse=False)
x_cat = trf_cat.fit_transform(data)

# Concat: categorical columns first, then numerical ones. The test-set
# matrix must be assembled in the same order.
X = np.concatenate((x_cat, x_num), axis=1)
Y = df[SCHEMA['target']].values

In [0]:
#-------------------------------------------------------------------------
# TRAINING
#-------------------------------------------------------------------------
# Exhaustive grid search over random-forest hyper-parameters, scored by
# ROC AUC with 3-fold cross-validation. The grid below has
# 2 * 3 * 3 * 3 * 2 * 2 * 2 = 432 combinations, i.e. 1296 fits plus the
# final refit of the best candidate.

param_grid = {
    "max_depth"        : [3, None],
    "max_features"     : [1, 3, 5],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf" : [1, 3, 10],
    "bootstrap"        : [True, False],
    "criterion"        : ["gini", "entropy"],
    "n_estimators"     : [5, 10]
}

# Fixed seed: without it every forest is built from fresh randomness, so the
# selected hyper-parameters, the final model and the AUC reported at the end
# of the notebook change on each Restart & Run All.
RANDOM_STATE = 42

clf = RandomForestClassifier(random_state=RANDOM_STATE)
gs = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, scoring='roc_auc', cv=3)
gs.fit(X, Y)

# Keep only the best model (refitted by GridSearchCV) for scoring
clf = gs.best_estimator_

In [0]:
#-----------------------------------------------------------------
# Preprocessing on Test Set
#-----------------------------------------------------------------

# load 'test' dataset as a Pandas dataframe
df_test = dataiku.Dataset("to_assess_prepared").get_dataframe()

#-----------------------------------------------------------------
# Transform and score test set
#-----------------------------------------------------------------
# Only .transform() is called below: the imputer, scaler and vectorizer keep
# the parameters fitted on the training set.

# Preprocess numerical features
x_test_num = trf_num.transform( df_test[SCHEMA['features_num']] )

# Preprocess categorical features
# Take an explicit copy: assigning columns into a bare column-subset
# selection is the chained-assignment pattern that triggers pandas'
# SettingWithCopyWarning.
df_test_cat = df_test[SCHEMA['features_cat']].copy()
features = df_test_cat.columns

# Same str cast as on the training set, so values are one-hot encoded by
# the DictVectorizer instead of being treated as numbers
for feature in features:
    if df_test_cat[feature].dtype != 'object':
        df_test_cat[feature] = df_test_cat[feature].astype(str)
data = df_test_cat.to_dict(orient='records')

x_test_cat = trf_cat.transform( data )

# Concatenate in the same order as the training matrix: categorical first,
# then numerical
X_test = np.concatenate((x_test_cat, x_test_num), axis=1)

In [0]:
# Predicted class probabilities for each record of the test matrix
scores = clf.predict_proba(X=X_test)

In [0]:
#-----------------------------------------------------------------
# Reshape
#-----------------------------------------------------------------
# Wrap the probability array in a dataframe aligned on the test index,
# label its first two columns, then attach it to the test records.
proba_column_names = {0: 'proba_False', 1: 'proba_True'}
preds = pd.DataFrame(scores, index=df_test.index)
preds = preds.rename(columns=proba_column_names)
all_preds = df_test.join(preds)

In [0]:
# Preview: first five test records together with their predicted probabilities
all_preds.head(n=5)

In [0]:
# Compute AUC results
# The label column is read from SCHEMA rather than repeated as a literal, so
# evaluation always uses the same target the model was trained on.
target_col = SCHEMA['target']

# NOTE(review): astype(bool) maps every non-empty string and NaN to True --
# this assumes the target column is boolean or 0/1; confirm for this dataset.
auc = roc_auc_score(all_preds[target_col].astype(bool).values, all_preds['proba_True'].values)
auc