In [1]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd, numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [3]:
# Read the Dataiku dataset named 'train' into a Pandas dataframe
train_dataset = dataiku.Dataset("train")
df = train_dataset.get_dataframe()

In [4]:
#-----------------------------------------------------------------
# Dataset Settings
#-----------------------------------------------------------------

# Columns used for training: the target column name, plus the
# numerical and categorical feature subsets.
SCHEMA = dict(
    target='high_value',
    features_num=['age', 'price_first_item_purchased', 'pages_visited'],
    features_cat=['gender', 'campaign'],
)

In [5]:
#-----------------------------------------------------------------
# Preprocessing on Training Set
#-----------------------------------------------------------------

num_cols = SCHEMA['features_num']
cat_cols = SCHEMA['features_cat']

# Column subsets of the training frame, one per feature family
df_num = df[num_cols]
df_cat = df[cat_cols]

# Numerical variables: mean-impute missing values, then standardise
num_steps = [
    ('imp', SimpleImputer(strategy='mean')),
    ('sts', StandardScaler()),
]
trf_num = Pipeline(num_steps)

# Categorical variables: one-hot encode (handle_unknown="ignore" so that
# categories not seen during fit do not raise at transform time)
trf_cat = OneHotEncoder(handle_unknown="ignore")

# Route each feature family through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", trf_num, num_cols),
        ("cat", trf_cat, cat_cols),
    ]
)

In [6]:
#-------------------------------------------------------------------------
# TRAINING
#-------------------------------------------------------------------------

# Fixed seed for the random forest. Without it every run draws different
# bootstrap samples / feature subsets, so the selected hyper-parameters and
# any downstream score change between runs.
RANDOM_STATE = 42

# Full model: preprocessing followed by the classifier.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", RandomForestClassifier(random_state=RANDOM_STATE)),
    ]
)

# Hyper-parameter grid. The "clf__" prefix addresses the "clf" step of the
# pipeline above.
param_grid = {
    "clf__max_depth"        : [3, None],
    "clf__max_features"     : [1, 3, 5],
    "clf__min_samples_split": [2, 3, 10],
    "clf__min_samples_leaf" : [1, 3, 10],
    "clf__bootstrap"        : [True, False],
    "clf__criterion"        : ["gini", "entropy"],
    "clf__n_estimators"     : [10]
}

# Exhaustive search over param_grid with 3-fold cross-validation, ranked by
# ROC AUC; n_jobs=-1 uses all available cores.
gs = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, scoring='roc_auc', cv=3)

# Feature matrix (numerical + categorical columns) and target vector
X = df[SCHEMA['features_num'] + SCHEMA['features_cat']]
Y = df[SCHEMA['target']].values
gs.fit(X, Y)

# From here on `clf` is the best pipeline found by the search
clf = gs.best_estimator_

In [7]:
# #-----------------------------------------------------------------
# # Score Test Set
# #-----------------------------------------------------------------

# # load the 'to_assess_prepared' dataset (the test set) as a Pandas dataframe
# df_test = dataiku.Dataset("to_assess_prepared").get_dataframe()

# # Actually score the new records
# scores = clf.predict_proba(df_test)

# # Reshape
# preds = pd.DataFrame(scores, index=df_test.index).rename(columns={0: 'proba_False', 1: 'proba_True'})
# all_preds = df_test.join(preds)

# # Sample of the test dataset with predicted probabilities
# all_preds.head()

Unnamed: 0,customer_id,ip,age,price_first_item_purchased,gender,data_source,ip_geopoint,ip_country_code,pages_visited,campaign,high_value,proba_False,proba_True
0,0008dd99a0a,88.69.206.61,71,22.0,F,testing,POINT(8.5725 49.8609),DE,3.0,False,0.0,0.142661,0.857339
1,001261e788a,158.187.79.11,33,10.0,F,testing,POINT(-97.822 37.751),US,2.0,False,1.0,0.681608,0.318392
2,0022a1402ba,59.79.175.96,31,44.0,F,testing,POINT(120.1614 30.2936),CN,7.0,True,0.0,0.406576,0.593424
3,0029b2eb40a,199.246.197.53,37,57.0,M,testing,POINT(-79.3716 43.6319),CA,8.0,True,1.0,0.350281,0.649719
4,004fae62c5a,50.201.54.169,69,57.0,M,testing,POINT(-88.4397 41.7741),US,3.0,True,1.0,0.357284,0.642716


In [8]:
# # Compute AUC results
# auc = roc_auc_score(all_preds['high_value'].astype(bool).values, all_preds['proba_True'].values)
# auc

0.7807510135194108