In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

Read in training/validation/full data: 

In [12]:
train_df = pd.read_feather('train_df')
val_df = pd.read_feather('val_df')
full_df = pd.read_feather('full_data_df')

Fitting random forest model to the 7 day purchase labels:

In [5]:
rf_7day = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=7, class_weight='balanced_subsample')
rf_7day.fit(train_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1), train_df['purchase_7day'])

RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=7, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [7]:
y_train = train_df['purchase_7day']
y_val = val_df['purchase_7day']
y_fit = rf_7day.predict(train_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))
y_fit_prob = rf_7day.predict_proba(train_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))
y_pred = rf_7day.predict(val_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))
y_pred_prob = rf_7day.predict_proba(val_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))

In [8]:
print("Training AUC: %.3f" % roc_auc_score(y_train, y_fit_prob[:, 1]))
print("Validation AUC: %.3f" % roc_auc_score(y_val, y_pred_prob[:, 1]))

Training AUC: 0.937
Validation AUC: 0.922


Fitting random forest model to the 14 day purchase labels:

In [9]:
rf_14day = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=7, class_weight='balanced_subsample')
rf_14day.fit(train_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1), train_df['purchase_14day'])

RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=7, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [10]:
y_train = train_df['purchase_14day']
y_val = val_df['purchase_14day']
y_fit = rf_14day.predict(train_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))
y_fit_prob = rf_14day.predict_proba(train_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))
y_pred = rf_14day.predict(val_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))
y_pred_prob = rf_14day.predict_proba(val_df.drop(['user_id_hash', 'purchase_7day', 'purchase_14day'], axis=1))

In [11]:
print("Training AUC: %.3f" % roc_auc_score(y_train, y_fit_prob[:, 1]))
print("Validation AUC: %.3f" % roc_auc_score(y_val, y_pred_prob[:, 1]))

Training AUC: 0.929
Validation AUC: 0.913


Making predictions from the features computed on the full dataset:

In [14]:
y_pred_7 = rf_7day.predict(full_df.drop(['user_id_hash'], axis=1))
y_pred_prob_7 = rf_7day.predict_proba(full_df.drop(['user_id_hash'], axis=1))

In [15]:
y_pred_14 = rf_14day.predict(full_df.drop(['user_id_hash'], axis=1))
y_pred_prob_14 = rf_14day.predict_proba(full_df.drop(['user_id_hash'], axis=1))

In [16]:
predictions = np.concatenate([np.array(full_df['user_id_hash']).reshape(-1,1), y_pred_prob_7[:,1].reshape(-1,1), y_pred_prob_14[:,1].reshape(-1,1)], axis=1)
predictions = pd.DataFrame(predictions, columns=['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days'])

Creating prediction file for kaggle:

In [17]:
data_path = "/Users/christopherolley/data/Leanplum-data"

In [18]:
sample_submission = pd.read_csv("%s/sample_submission_2.csv" % data_path)

In [19]:
kaggle_predictions = pd.merge(pd.DataFrame(sample_submission['user_id_hash']), predictions, on='user_id_hash', how='left')

Fill any users we don't have any data on with zeros (low probability of purchase if didn't exist before two week test window)

In [20]:
kaggle_predictions.fillna(0.0, inplace=True)

In [21]:
kaggle_predictions.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.048074,0.050355
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.154817,0.187705
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.220782,0.248832
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.435151,0.449922
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0,0.0


In [22]:
kaggle_predictions.to_csv('kaggle_predictions_1.csv', index=False)