### Import packages

In [64]:
import pandas as pd
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns

### Set-up

For this expercise, we will utilize two files (listed below) from the 'Brazilian E-Commerce Public Dataset' originally posted on [Kaggle]( https://www.kaggle.com/olistbr/brazilian-ecommerce).

In [65]:
# full path of the 'orders' dataset
orders_file = 'https://bitbucket.org/vishal_derive/vcu-data-mining/raw/11253d9f443241b3ce5949802966a80de73af1db/data/olist_orders_dataset.csv'

# full path of the 'customer' dataset
cust_file = 'https://bitbucket.org/vishal_derive/vcu-data-mining/raw/11253d9f443241b3ce5949802966a80de73af1db/data/olist_customers_dataset.csv'

payments_file = 'https://bitbucket.org/vishal_derive/vcu-data-mining/raw/aedab41b6b30a19db9c72e5b82755a118f847d87/data/olist_order_payments_dataset.csv'


### Read data

In [77]:
def read_olist_data(file1, file2, file3, verbose):
    
    # read the data
    orders = pd.read_csv(file1)
    cust = pd.read_csv(file2)
    payments = pd.read_csv(file3)

    # drop unnecessary columns
    #drop_vars = ['order_approved_at', 'order_delivered_carrier_date', 
    #           'order_delivered_customer_date', 'order_estimated_delivery_date']

    
    # date-time conversion
    orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])
    orders['order_delivered_customer_timestamp'] = pd.to_datetime(orders['order_delivered_customer_date'])
    orders['order_estimated_delivery_timestamp'] = pd.to_datetime(orders['order_estimated_delivery_date'])

    # let's convert the order purchase timestamps into dates
    orders['order_purchase_date'] = orders['order_purchase_timestamp'].dt.date
    orders['order_delivered_customer_formdate'] = orders['order_delivered_customer_timestamp'].dt.date
    orders['order_estimated_delivery_formdate'] = orders['order_estimated_delivery_timestamp'].dt.date

     # extract day of week from the order date
    orders['order_dow'] = orders['order_purchase_timestamp'].dt.weekday_name
    
    # extract month from the order date
    orders['order_month'] = orders['order_purchase_timestamp'].dt.month

    
    # merge orders and cust dataframes
    orders_cust = pd.merge(orders, cust, on='customer_id', how='inner')
    orders_out = pd.merge(orders_cust, payments, on='order_id', how='inner')
    
    # apply filters to (a) discard (incomplete) data after 2018-8-22; see 06_pandas_wrangle.ipynb for the rationale
    #  and (b) keep 'delivered' orders only
    #  we do this here by using a boolean (True/False) mask
    mask = (orders_out['order_purchase_date'] <= date(2018, 8, 22)) & (orders_out['order_status'] == 'delivered')

    orders_out = orders_out[mask]
    
    # discard 'order_status' as we don't need it any more
    orders_out = orders_out.drop('order_status', axis=1)
    
    # let's drop hose columns that we need (for this exercise)
    keep_cols = ['customer_unique_id', 'customer_id','order_id','order_purchase_timestamp', 'order_delivered_customer_timestamp',
                 'order_estimated_delivery_timestamp', 'order_purchase_date', 'order_delivered_customer_formdate', 'order_estimated_delivery_formdate',
                 'order_dow', 'order_month', 'payment_installments', 'payment_type', 'payment_value']

    orders_out = orders_out[keep_cols].sort_values(['customer_unique_id', 'order_purchase_timestamp'])

    #if verbose:
       # print (f'{len(orders_out):,} records in the output  file.')
    
    return orders_out

In [121]:
orders = read_olist_data(orders_file, cust_file, payments_file, 1)

In [122]:
payments = pd.read_csv(payments_file)
payments[payments['order_id']=='31bc09fdbd701a7a4f9b55b5955b8687']

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
27995,31bc09fdbd701a7a4f9b55b5955b8687,3,voucher,1,37.88
39416,31bc09fdbd701a7a4f9b55b5955b8687,7,voucher,1,3.48
43160,31bc09fdbd701a7a4f9b55b5955b8687,4,voucher,1,40.33
45492,31bc09fdbd701a7a4f9b55b5955b8687,1,voucher,1,26.86
74317,31bc09fdbd701a7a4f9b55b5955b8687,5,voucher,1,22.28
82459,31bc09fdbd701a7a4f9b55b5955b8687,2,voucher,1,55.71
103860,31bc09fdbd701a7a4f9b55b5955b8687,6,voucher,1,77.99


## Binary Classification model

Let's build a model to predict whether a customer will make a purchase within the next month.

We will use *July 2018*, which is the most recent complete month, as the prediction window. In other words, we will use all available data prior to July 2018 to predict which customers will make a purchase in July 2018.

In [123]:
snapshot_date = date(2017, 6, 30)

In [124]:
#filter out only data before snapshot date
mask = orders['order_purchase_timestamp'].dt.date <= snapshot_date

df_raw = orders[mask]

len(orders), len(df_raw)

(100247, 15077)

In [125]:
df_raw.head()

Unnamed: 0,customer_unique_id,customer_id,order_id,order_purchase_timestamp,order_delivered_customer_timestamp,order_estimated_delivery_timestamp,order_purchase_date,order_delivered_customer_formdate,order_estimated_delivery_formdate,order_dow,order_month,payment_installments,payment_type,payment_value
27681,0000f46a3911fa3c0805444483337064,9b3932a6253894a02c1df9d19004239f,b33ec3b699337181488304f362a6b734,2017-03-10 21:05:03,2017-04-05 14:38:47,2017-04-07,2017-03-10,2017-04-05,2017-04-07,Friday,3,8,credit_card,86.22
74356,0005e1862207bf6ccc02e4228effd9a0,3b37fb626fdf46cd99d37ec62afa88ff,ae76bef74b97bcb0b3e355e60d9a6f9c,2017-03-04 23:32:12,2017-03-09 08:33:08,2017-04-06,2017-03-04,2017-03-09,2017-04-06,Saturday,3,3,credit_card,150.12
44831,00115fc7123b5310cf6d3a3aa932699e,064064dd94c43013786fc1e1a14d6374,91d1bf2b7745903cb0d17162d1bbd750,2017-01-21 21:58:35,2017-02-01 21:57:36,2017-03-06,2017-01-21,2017-02-01,2017-03-06,Saturday,1,1,credit_card,76.11
73161,0011805441c0d1b68b48002f1d005526,728e136fca2819b62d610743ae2904b8,fb5d8b462dc9570527eff204b8a1a57b,2017-04-24 13:36:48,2017-05-12 14:04:42,2017-05-25,2017-04-24,2017-05-12,2017-05-25,Monday,4,10,credit_card,297.14
36639,0011857aff0e5871ce5eb429f21cdaf5,b821d548247d9c319bd595260efe677e,58d34816de3da57e9e07d7b34aced1b8,2017-06-28 11:08:38,2017-07-06 17:11:46,2017-07-26,2017-06-28,2017-07-06,2017-07-26,Wednesday,6,3,credit_card,192.83


In [126]:
# get dummies

df_test = pd.get_dummies(df_raw, columns=['payment_type'])



In [127]:
# grab all binary columns that we just created
dummies = [x for x in df_test.columns if 'payment_type_' in x]

# aggreagte data to the customer-level, so that we have one record per customer
payment_type = df_test.groupby('customer_unique_id')[dummies].sum().reset_index()


In [128]:
payment_type

Unnamed: 0,customer_unique_id,payment_type_boleto,payment_type_credit_card,payment_type_debit_card,payment_type_voucher
0,0000f46a3911fa3c0805444483337064,0,1,0,0
1,0005e1862207bf6ccc02e4228effd9a0,0,1,0,0
2,00115fc7123b5310cf6d3a3aa932699e,0,1,0,0
3,0011805441c0d1b68b48002f1d005526,0,1,0,0
4,0011857aff0e5871ce5eb429f21cdaf5,0,1,0,0
5,00191a9719ef48ebb5860b130347bf33,0,1,0,0
6,001a2bf0e46c684031af91fb2bce149d,0,1,0,0
7,001f3c4211216384d5fe59b041ce1461,0,1,0,0
8,002043098f10ba39a4600b6c52fbfe3c,0,1,0,0
9,002b4cd83fabaffaa475f78ea5ef3e08,0,1,0,0


### Attribute #1: Order recency

For creating model attributes (features) we will have to restrict our data to the timeframe prior to the prediction window.

In [129]:


# check the max date

df_raw.order_purchase_timestamp.max()

# grab the max purchase date for each customer

cust_recency = df_raw.groupby('customer_unique_id')['order_purchase_timestamp'].max().reset_index()

# count the number of days between the most recent order date and the snapshot date

delta = snapshot_date - cust_recency['order_purchase_timestamp'].dt.date

# grab the number of days (as an integer) from the calculated deltas 

cust_recency['order_recency'] = delta.dt.days

# drop the date (we don't need it any more for this exercise)

cust_recency = cust_recency.drop('order_purchase_timestamp', axis=1)


# how many records (unique customers) do we have for this timeframe?

len(cust_recency)


13855

This is the number of customers who are included in the modeling dasetset for this specific time-frame.

Additional eligibility criteria can be applied to exclude customers (from the modeling dataset) who have not shopped in more than, e.g., one/two years.

### Attribute 2: Days defore/after delivery

In [130]:
# count the number of days between the most recent order date and the snapshot date

df_raw['delays'] = (df_raw['order_estimated_delivery_timestamp'].dt.date - df_raw['order_delivered_customer_timestamp'].dt.date).dt.days



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [131]:
order_delay = df_raw.groupby(['customer_unique_id'])['delays'].sum().reset_index()
order_delay.head()

Unnamed: 0,customer_unique_id,delays
0,0000f46a3911fa3c0805444483337064,2.0
1,0005e1862207bf6ccc02e4228effd9a0,28.0
2,00115fc7123b5310cf6d3a3aa932699e,33.0
3,0011805441c0d1b68b48002f1d005526,13.0
4,0011857aff0e5871ce5eb429f21cdaf5,20.0


### Attribute 3: Orders per Customer

In [132]:
# Count total records (i.e., orders) per customer

cust_orders = df_raw.groupby('customer_unique_id').size().reset_index().rename(columns={0: 'total_orders'})


### Attribute 4: Day of Week


In [133]:
# get dummies

df_raw = pd.get_dummies(df_raw, columns=['order_dow'])

# grab all binary columns that we just created
dummies = [x for x in df_raw.columns if 'order_dow_' in x]

# aggreagte data to the customer-level, so that we have one record per customer
cust_dow = df_raw.groupby('customer_unique_id')[dummies].sum().reset_index()



In [134]:
len(cust_dow)

13855

### New Attribute 5: Month

In [135]:
# get dummies

df_raw = pd.get_dummies(df_raw, columns=['order_month'])

# grab all binary columns that we just created
dummonth = [x for x in df_raw.columns if 'order_month_' in x]

# aggreagte data to the customer-level, so that we have one record per customer
cust_month = df_raw.groupby('customer_unique_id')[dummonth].sum().reset_index()

cust_month.head()

Unnamed: 0,customer_unique_id,order_month_1,order_month_2,order_month_3,order_month_4,order_month_5,order_month_6,order_month_10,order_month_12
0,0000f46a3911fa3c0805444483337064,0,0,1,0,0,0,0,0
1,0005e1862207bf6ccc02e4228effd9a0,0,0,1,0,0,0,0,0
2,00115fc7123b5310cf6d3a3aa932699e,1,0,0,0,0,0,0,0
3,0011805441c0d1b68b48002f1d005526,0,0,0,1,0,0,0,0
4,0011857aff0e5871ce5eb429f21cdaf5,0,0,0,0,0,1,0,0


### New Attribute 6: Purchase Dollar Volume

In [136]:
df_raw['order_total']=df_raw['payment_installments']*df_raw['payment_value']

dollar_volume = df_raw.groupby(['customer_unique_id'])['order_total'].sum().reset_index().sort_values('order_total',ascending=False)

dollar_volume.head()

Unnamed: 0,customer_unique_id,order_total
11796,da122df9eeddfedc1dc1f5349a1a690c,75716.3
11924,dc4802a71eae9be1dd28f5d788ceb526,55434.48
12923,eebb5dda148d3893cdaf5b5ca3040ccb,38114.72
5080,5d09b0d82126457e2a8ebfb9c9a1ffc4,37362.2
12882,edf81e1f3070b9dac83ec83dacdbb9bc,33558.08


### Combine all attributes into a single dataframe

In [137]:
# let's check the number of records in each dataframe first

cust_recency.shape, cust_orders.shape, cust_dow.shape, order_delay.shape, cust_month.shape

((13855, 2), (13855, 2), (13855, 8), (13855, 2), (13855, 9))

In [138]:
# combine (merge) `cust_recency` with `cust_orders` so that we have both attributes in one dataset

df = pd.merge(cust_recency, cust_orders, on='customer_unique_id') \
     .merge(cust_dow, on='customer_unique_id') \
     .merge(order_delay, on='customer_unique_id') \
     .merge(cust_month, on='customer_unique_id') \
     .merge(dollar_volume, on='customer_unique_id')

len(df)

13855

### Assign labels (aka the target variable or the dependent variable)

For this exercise, we will assume that the objective of the model is to predict whether a customer will make *at least one purchase* in the future (i.e., within the target window of the model).

**Step 1:** Isolate all orders that were placed within the prediction window.

In [139]:
df.head()

Unnamed: 0,customer_unique_id,order_recency,total_orders,order_dow_Friday,order_dow_Monday,order_dow_Saturday,order_dow_Sunday,order_dow_Thursday,order_dow_Tuesday,order_dow_Wednesday,delays,order_month_1,order_month_2,order_month_3,order_month_4,order_month_5,order_month_6,order_month_10,order_month_12,order_total
0,0000f46a3911fa3c0805444483337064,112,1,1,0,0,0,0,0,0,2.0,0,0,1,0,0,0,0,0,689.76
1,0005e1862207bf6ccc02e4228effd9a0,118,1,0,0,1,0,0,0,0,28.0,0,0,1,0,0,0,0,0,450.36
2,00115fc7123b5310cf6d3a3aa932699e,160,1,0,0,1,0,0,0,0,33.0,1,0,0,0,0,0,0,0,76.11
3,0011805441c0d1b68b48002f1d005526,67,1,0,1,0,0,0,0,0,13.0,0,0,0,1,0,0,0,0,2971.4
4,0011857aff0e5871ce5eb429f21cdaf5,2,1,0,0,0,0,0,0,1,20.0,0,0,0,0,0,1,0,0,578.49


In [140]:


# select orders that were placed between July 2017 and 2018

mask = (orders['order_purchase_timestamp'].dt.date > snapshot_date) & (orders['order_purchase_timestamp'].dt.date < date(2018,7,1))
target_events_raw = orders[mask]

len(target_events_raw)


72774

In [141]:
# confirm the min and max dates are within the month of August 2018

print (target_events_raw['order_purchase_timestamp'].min(), target_events_raw['order_purchase_timestamp'].max())

2017-07-01 00:04:15 2018-06-30 23:59:49


**Step 2:** Summarize data to get one record per customer.

In [142]:
# count the number of orders (we will convert this into a binary flag later)

target_events = target_events_raw.groupby('customer_unique_id').size().reset_index().rename(columns={0: 'purch'})

target_events.head()

Unnamed: 0,customer_unique_id,purch
0,0000366f3b9a7992bf8c76cfdf3221e2,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1
2,0000f6ccb0745a6a4b88665a16c9f078,1
3,0004aac84e0df4da2b147fca70cf8255,1
4,0004bd2a26a76fe21f786e4fbd80607f,1


In [143]:
print(f'Number of customers who made at least one purchase durnig the prediction window: {len(target_events):,}')

Number of customers who made at least one purchase durnig the prediction window: 67,776


**Step 3:** Merge this dataframe with the `attr` dataframe to create the modeling dataset.

In [144]:
# merge `target_events` with the dataframe that contains customer attributes

df = pd.merge(df, target_events, how='left', on='customer_unique_id')

df.shape

(13855, 21)

In [23]:
df.head()

Unnamed: 0,customer_unique_id,order_recency,total_orders,order_dow_Friday,order_dow_Monday,order_dow_Saturday,order_dow_Sunday,order_dow_Thursday,order_dow_Tuesday,order_dow_Wednesday,...,order_month_1,order_month_2,order_month_3,order_month_4,order_month_5,order_month_6,order_month_10,order_month_12,order_total,purch
0,0000f46a3911fa3c0805444483337064,112,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,689.76,
1,0005e1862207bf6ccc02e4228effd9a0,118,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,450.36,
2,00115fc7123b5310cf6d3a3aa932699e,160,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,76.11,
3,0011805441c0d1b68b48002f1d005526,67,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,2971.4,
4,0011857aff0e5871ce5eb429f21cdaf5,2,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,578.49,


In [145]:
df.purch.value_counts()

1.0    309
2.0     47
3.0      9
9.0      1
5.0      1
Name: purch, dtype: int64

We need to convert this variable into a binary flag.

**Step 4:** Set the target variable to 1 if at least one purchase was made, 0 otherwise.

In [146]:
# create the binary target flag

df['purch'] = [1 if x > 0 else 0 for x in df['purch']]

df.purch.value_counts()

0    13488
1      367
Name: purch, dtype: int64

In [147]:
# % distribution of the target flag

df.purch.value_counts() / len(df)

0    0.973511
1    0.026489
Name: purch, dtype: float64

There are two issues here: (1) the target event is extremely rare, and (2) the number of target events is also extremely small. There are several ways to circumvent these issues, but for now we will proceed with this dataset. 

In [148]:
# check the correlations

df.corr()['purch']

order_recency         -0.026816
total_orders           0.005163
order_dow_Friday      -0.006783
order_dow_Monday      -0.011389
order_dow_Saturday     0.006374
order_dow_Sunday      -0.000218
order_dow_Thursday     0.011081
order_dow_Tuesday     -0.004688
order_dow_Wednesday    0.013004
delays                 0.016203
order_month_1          0.002737
order_month_2         -0.021879
order_month_3         -0.010577
order_month_4         -0.007809
order_month_5          0.012584
order_month_6          0.021171
order_month_10         0.004326
order_month_12        -0.001401
order_total            0.009565
purch                  1.000000
Name: purch, dtype: float64

Before moving on to model building, let create a two-axis plot to view the purchase rates by recency.

### Model building

In [149]:
df.columns


Index(['customer_unique_id', 'order_recency', 'total_orders',
       'order_dow_Friday', 'order_dow_Monday', 'order_dow_Saturday',
       'order_dow_Sunday', 'order_dow_Thursday', 'order_dow_Tuesday',
       'order_dow_Wednesday', 'delays', 'order_month_1', 'order_month_2',
       'order_month_3', 'order_month_4', 'order_month_5', 'order_month_6',
       'order_month_10', 'order_month_12', 'order_total', 'purch'],
      dtype='object')

In [150]:
# set-up

preds = df.columns[1:-1]

X = df[preds]
y = df['purch']

In [151]:
len(y)

13855

Tri-fold partitioning is recommended as long as there's sufficient sample size avaiable. In this example, since the number of target events is very small, we will perform a two-fold paritioning of the modeling sample.

Note that we could increase the size of the prediction window to capture more purchase events. However, for this exercise we will continue with the current prediction window (of one month).

#### Two-fold partition

In [31]:
# split the dataframe into train(50%) and test(50%)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=314)

len(X_train), len(X_test)

(6927, 6928)

In [152]:
# Shuffle the dataset  
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.5 * len(df))

# Split your dataset 
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [153]:
##split the data without using sklearn function
X_train=train_set[['order_recency', 'total_orders',
       'order_dow_Friday', 'order_dow_Monday', 'order_dow_Saturday',
       'order_dow_Sunday', 'order_dow_Thursday', 'order_dow_Tuesday',
       'order_dow_Wednesday']]
y_train = train_set['purch']

X_test=test_set[['order_recency', 'total_orders',
       'order_dow_Friday', 'order_dow_Monday', 'order_dow_Saturday',
       'order_dow_Sunday', 'order_dow_Thursday', 'order_dow_Tuesday',
       'order_dow_Wednesday']]
y_test = test_set['purch']

In [154]:
# target distribution in the trainig sample

y_train.value_counts()

0    6737
1     190
Name: purch, dtype: int64

In [155]:
# target distribution in the test sample

y_test.value_counts()

0    6751
1     177
Name: purch, dtype: int64

In [156]:
from sklearn.linear_model import LogisticRegression

# define the model object ('liblinear' is recommended for small datasets)
clf = LogisticRegression(solver='liblinear', random_state=314)

# train (fit) the model using the training sample
clf.fit(X_train, y_train)

# make predictions on the test sample
y_preds = clf.predict(X_test)

### Model accuracy

In [157]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_preds)

0.9744515011547344

Why does the model appear to be very strong in spite of only two predictors that didn't look correlated with the target?

In [158]:
# check model's predictions against actual value of the target event

pd.crosstab(y_test, y_preds)

col_0,0
purch,Unnamed: 1_level_1
0,6751
1,177


The model is predicting 0 (no purchase) for all customers! Accuracy score is not a reliable measure while dealing with rare events. Let's take a look at a few additional metrics. (We will learn about these metrics in the class later.)

Area under the ROC curve (AUC)

In [159]:
from sklearn.metrics import roc_auc_score

# calculate the probabilities on the test sample
y_scores = clf.predict_proba(X_test)[:, 1]

# calcualte AUC
roc_auc_score(y_test, y_scores)

0.5449487709291028

In [160]:
clf.predict_proba(X_test)[:,1].max()

0.05253128100452113

Precision

In [161]:
from sklearn.metrics import average_precision_score

average_precision = average_precision_score(y_test, y_scores)

average_precision

0.028944342421054405

## Decision Tree Model

In [50]:
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

from sklearn.metrics import roc_auc_score



In [52]:
# Create Decision Tree classifer object
cltree = DecisionTreeClassifier()

# Train Decision Tree Classifer
cltree = cltree.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = cltree.predict(X_test)

In [54]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9740184757505773


In [55]:
# calculate the probabilities on the test sample
y_scores = cltree.predict_proba(X_test)[:, 1]

# calcualte AUC
roc_auc_score(y_test, y_scores)

0.5251165210153974

## Export results

#### 1. Scored dataset (with all model attributes)

In [48]:
scores_all = clf.predict_proba(df[preds])[:, 1]

df['prob_to_purchase'] = scores_all

df.head()

Unnamed: 0,customer_unique_id,order_recency,total_orders,order_dow_Friday,order_dow_Monday,order_dow_Saturday,order_dow_Sunday,order_dow_Thursday,order_dow_Tuesday,order_dow_Wednesday,purch,prob_to_purchase
0,0000366f3b9a7992bf8c76cfdf3221e2,51,1,0,0,0,0,1,0,0,0,0.003737
1,0000b849f77a49e4a4ce2b2a4ca5be3f,54,1,0,1,0,0,0,0,0,0,0.002449
2,0000f6ccb0745a6a4b88665a16c9f078,261,1,0,0,0,0,1,0,0,0,0.001617
3,0004aac84e0df4da2b147fca70cf8255,228,1,0,0,0,0,0,1,0,0,0.00232
4,0004bd2a26a76fe21f786e4fbd80607f,86,1,0,0,0,0,1,0,0,0,0.003251


In [86]:
csv_file_zipped = 'purch_model_scores.csv.gz'

df.to_csv(csv_file_zipped, compression='gzip')

#### 2. Export the model

In [50]:
import joblib

model_pkl_file = '../output/purch_model.joblib'

joblib.dump(clf, model_pkl_file)

['../output/purch_model.joblib']

##### Load the model

In [51]:
clf_loaded = joblib.load(model_pkl_file)

clf_loaded

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=314, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [172]:
# use the model to score a "new" data frame

clf_loaded.predict_proba(X_test)

NameError: name 'clf_loaded' is not defined

In [None]:
# TESTCODE modify this to count number of times customer ordered on weekend. Like weekday count.
df_raw.loc[(df_raw['order_dow'] == 'Saturday') | (df_raw['order_dow'] == 'Sunday'), 'weekend'] = 'sure'

df_raw.loc[(df_raw['order_dow'] != 'Saturday') & (df_raw['order_dow'] != 'Sunday'), 'weekend'] = 'nah'

df_raw.head()

##view certain columns
test = df_raw[df_raw['customer_unique_id']=='0000366f3b9a7992bf8c76cfdf3221e2']
test[['customer_unique_id','payment_installments','payment_value','order_total']]