In [65]:
# coding: utf-8

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import time

def log(text, t_start=None):
    if t_start is None:
        print(text)
    else:
        elapsed_time = round(time.time() - t_start, 2)
        print(text + "\t(" + str(elapsed_time) + "s)")

### Loading files

In [3]:
t = time.time()
customers = pd.read_csv("data/customers.csv")
products = pd.read_csv("data/products.csv")
x_train = pd.read_csv("data/X_train.csv")
y_train = pd.read_csv("data/y_train.csv")
x_test = pd.read_csv("data/X_test.csv")
y_test = pd.read_csv("data/y_test.csv")
log("files loaded", t)

files loaded	(8.82s)


### Statistics

In [116]:
def col_stats(col):
    print "\n----- " + col + " -----"
    xx = {}
    for i, o in x_train.loc[:50000:7].iterrows():
        if o[col] not in xx.keys():
            xx[o[col]] = [0., 0.]
        if y_train.loc[i, ["ReturnQuantityBin"]][0] == 0.0:
            xx[o[col]][0] += 1.
        else:
            xx[o[col]][1] += 1.

    for val, (zeros, ones) in xx.items():
        print val, "\t", (ones/(zeros+ones)), "returns"

for col in x_train.columns:
    if "label" in col.lower():
        col_stats(col)


----- OrderStatusLabel -----
Expédié 	0.206495870083 returns

----- OrderTypelabel -----
DIRECT 	0.207155222158 returns
EXCHANGE 	0.184834123223 returns

----- SeasonLabel -----
Automne/Hiver 	0.212977834886 returns
Printemps/Eté 	0.162486368593 returns

----- PayementModeLabel -----
Carte bancaire 	0.202168590416 returns
Gratuit 	0.188034188034 returns
iDeal 	0.16 returns
PayPal 	0.237723214286 returns
Chèque 	0.235897435897 returns

----- CustomerTypeLabel -----
Nouveau 	0.185685541822 returns
Fidélisé 	0.226255458515 returns

----- DeviceTypeLabel -----
ND 	0.206495870083 returns

----- PricingTypeLabel -----
Promo Sans CP 	0.199652777778 returns
Vente Privée 	0.212740384615 returns
Promo Avec CP 	0.250505050505 returns
Plein Tarif 	0.202788649706 returns


### Applying mask

Here we select the desired columns. We join (just as in SQL) the `orders` table with the `customers` and the `products`. For each of those two tables, we only select a few columns. We also remove unwanted columns (`columns_text`) from `orders`. Finally, there is some data transformation to use prices as floats and turn string columns into separate ones.

In [122]:
def funk_mask(orders, customers, products):
    columns_ext = ["OrderCreationDate", "OrderNumber", "VariantId",
                   "CustomerId", "OrderCreationDate", "OrderShipDate",
                   "BillingPostalCode"
                  ]
    
    product_columns = ["VariantId", "SeasonLabel"]
    
    customers_columns = ["CustomerId", "Gender"]
    
    m = pd.merge(orders,
                 products.loc[:, product_columns],
                 on='VariantId',
                 suffixes=('_pr', ''))
    m = pd.merge(m,
                 customers.loc[:, customers_columns],
                 on='CustomerId',
                 suffixes=('_cs', ''))
        
    # Remove columns from input array
    x1 = m.loc[:, [xx for xx in m.columns if xx not in columns_ext]]
    # Convert UnitPMPEur column to floats (price of a unit, in euros)
    x1.UnitPMPEUR = [np.float(x.replace(",", ".")) for x in x1.UnitPMPEUR]
    # Select columns that contains string values
    columns2bin = [x for x in x1.columns if x1[x].dtype == np.dtype('O')]
    # Convert those to numerical indicators
    x2 = pd.get_dummies(x1.loc[:, columns2bin])
    # Extract other columns without modyfing them
    x3 = x1.loc[:, [xx for xx in x1.columns if xx not in columns2bin]]
    # Rebuild data
    res = pd.concat([x3, x2], axis=1)
    # Fill holes with 0s
    res = res.fillna(0)
    return res

t = time.time()
x1 = funk_mask(x_train, customers, products)
x2 = funk_mask(x_test, customers, products)
log("applied mask", t)

applied mask	(5.73s)


In [119]:
print "Training dataset shape:", x1.shape
print "Test dataset shape:    ", x2.shape

Training dataset shape: (509414, 23)
Test dataset shape:     (455104, 22)


### Building classifier

In [127]:
def compute(name, clf, training_slice=100000, submit=False):
    print "\n" + name + ":"
    clf.fit(x1.iloc[:training_slice],
            y_train.ReturnQuantityBin[:training_slice])
    
    predict_train = clf.predict_proba(
        x1.loc[:training_slice, x1.columns])
    score_train = roc_auc_score(
        y_train.ReturnQuantityBin.iloc[:training_slice + 1],
        predict_train[:, 1])
    print "Train score:", score_train

    predict_test = clf.predict_proba(
        x1.loc[training_slice:2 * training_slice, x1.columns]
        .fillna(0))
    score_test = roc_auc_score(
        y_train.ReturnQuantityBin.iloc[training_slice:2 * training_slice + 1],
        predict_test[:,1])
    print "Test score: ", score_test

    if submit:
        y_submit = clf.predict_proba(x2.loc[:, x1.columns].fillna(0))
        np.savetxt('y_pred.txt', y_submit[:,1], fmt='%f')
    
    return score_train, score_test

def test_classifiers():
    for (name, clf) in [
        ("tree", DecisionTreeClassifier()),
        ("forest", RandomForestClassifier()),
        ("logistic regression", LogisticRegression())
    ]:
        compute(name, clf)

def test_columns(training_slice=50000):
    
    strain, stest = compute("", DecisionTreeClassifier())
    
    for col in x1.columns:
        xx1 = x1.loc[:, [xx for xx in x1.columns if xx != col]]
        clf = DecisionTreeClassifier()
        clf.fit(xx1.iloc[:training_slice],
                y_train.ReturnQuantityBin[:training_slice])
        predict_test = clf.predict_proba(
        xx1.loc[training_slice:2 * training_slice, xx1.columns].fillna(0))
        score_test = roc_auc_score(y_train.ReturnQuantityBin.iloc[training_slice:2 * training_slice + 1],
                                   predict_test[:,1])
        print col, "\t", stest - score_test

t = time.time()
test_columns()
# test_classifiers()
log("run tests", t)


:
Train score: 0.996301066698
Test score:  0.500029803463
LineItem 	-0.000636213181821
TotalLineItems 	0.00431433107195
Quantity 	0.0018399627899
UnitPMPEUR 	-0.00104463708065
OrderNumCustomer 	0.00242143023711
IsOnSale 	0.00241663882896
OrderStatusLabel_Expédié 	0.00362865898416
OrderTypelabel_DIRECT 	0.00252004692218
OrderTypelabel_EXCHANGE 	0.00208523026926
SeasonLabel_pr_Automne/Hiver 	0.00266136315274
SeasonLabel_pr_Printemps/Eté 	0.000534972100385
PayementModeLabel_BankTransfer_DE 	0.00224898803955
PayementModeLabel_BankTransfer_IBAN 	0.00236023915386
PayementModeLabel_Carte bancaire 	0.00187553463081
PayementModeLabel_Chèque 	0.00132186389786
PayementModeLabel_Cod_DHL 	-4.31151732618e-05
PayementModeLabel_DotPay 	0.0023011818961
PayementModeLabel_Gratuit 	0.0036354193313
PayementModeLabel_PayPal 	0.00207601117321
PayementModeLabel_iDeal 	0.00118731771364
PayementModeLabel_maestro 	0.00159094414231
CustomerTypeLabel_Fidélisé 	8.76398335087e-05
CustomerTypeLabel_Nouveau 	0.001411