# AlphaShoe

SD210 Challenge

 - [scoreboard and submissions](http://datachallenge.enst.fr/)
 - [starting kit](http://nbviewer.jupyter.org/urls/dl.dropboxusercontent.com/s/hmrfrurkyoohi3v/moussab%20djerrab%20-%20DataChallenge_ShoeReturns.ipynb)
 - [training data](https://www.dropbox.com/sh/uo4oudw43j45mp3/AACA0UqkitNKSWdE_7fs2Wbla?dl=0)
 - [dictionnary](https://www.dropbox.com/sh/uo4oudw43j45mp3/AACA0UqkitNKSWdE_7fs2Wbla?dl=0&preview=dictionnary.xlsx)

### Importations

In [1]:
# coding: utf-8

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
#deep
from sklearn.neural_network import MLPClassifier

from matplotlib import pyplot as plt
import datetime
import pandas as pd
import numpy as np
import time

%matplotlib inline

def log(text, t_start=None):
    if t_start is None:
        print(text)
    else:
        elapsed_time = round(time.time() - t_start, 2)
        print(text + "\t(" + str(elapsed_time) + "s)")


### Loading files

In [2]:
t = time.time()
customers = pd.read_csv("data/customers.csv")
products = pd.read_csv("data/products.csv")
x_train = pd.read_csv("data/X_train.csv")
y_train = pd.read_csv("data/y_train.csv")
x_test = pd.read_csv("data/X_test.csv")
y_test = pd.read_csv("data/y_test.csv")
log("files loaded", t)

files loaded	(48.54s)


### Data cleaning

In [3]:
# SizeAdviceDescription
SizeAdviceDescriptionCleaner = {}
SizeAdviceDescriptionCleaner['nan'] = 0
SizeAdviceDescriptionCleaner['Ce mod\xc3\x83\xc2\xa8le chausse normalement'] = 0
SizeAdviceDescriptionCleaner['Mod\xc3\x83\xc2\xa8le confortable, convient aux pieds larges'] = -.5
SizeAdviceDescriptionCleaner['Mod\xc3\x83\xc2\xa8le \xc3\x83\xc2\xa9troit, convient aux pieds fins'] = .5
SizeAdviceDescriptionCleaner['Prenez votre pointure habituelle'] = 0
SizeAdviceDescriptionCleaner['Chaussant particuli\xc3\x83\xc2\xa8rement g\xc3\x83\xc2\xa9n\xc3\x83\xc2\xa9reux. Nous vous conseillons de choisir deux tailles en dessous de votre pointure habituelle.'] = -2
SizeAdviceDescriptionCleaner['Chaussant petit. Si vous \xc3\x83\xc2\xaates habituellement entre deux pointures, nous vous conseillons de choisir une demi taille au-dessus de votre pointure habituelle.'] = .5
SizeAdviceDescriptionCleaner['Prenez une taille au-dessus de sa pointure !'] = 1
SizeAdviceDescriptionCleaner['Prenez une taille au-dessus de votre pointure habituelle'] = 1
SizeAdviceDescriptionCleaner['Prenez une taille en dessous de sa pointure !'] = -1
SizeAdviceDescriptionCleaner['Prenez une taille en dessous de votre pointure habituelle'] = -1

# BirthDate
def age(birthdate):
    if type(birthdate) == type(" "):
        return 2016 - int(birthdate[:4])
    return None

# OrderCreationDate and SeasonLabel
def order_season(orderdate):
    month = int(orderdate[5:7])
    if month >= 4 and month <= 9:
        return "Printemps/Et\xc3\x83\xc2\xa9"
    return "Automne/Hiver"


def build_df(x):
    """Builds a pandas DataFrame with clean columns from a read CSV"""
    
    t = time.time()
    m = None
    
    # join
    m = pd.merge(x, products, how='left', on='VariantId', suffixes=('_pr', ''))
    m = pd.merge(m, customers, how='left', on='CustomerId', suffixes=('_cs', ''))
    
    # converting UnitPMPEUR
    m.UnitPMPEUR = m["UnitPMPEUR"].map(lambda row: float(row.replace(',', '.')))
    
    # building news columns
    m["MatchGender"] = m["Gender"] == m["GenderLabel"]
    m["MatchSeason"] = m["SeasonLabel_pr"] == m["SeasonLabel"]
    m["OrderSeason"] = m["OrderCreationDate"].map(order_season)
    m["MatchOrderSeason"] = m["OrderSeason"] == m["SeasonLabel"]
    
    # cleaning
    m["SizeAdviceDescription"] = m["SizeAdviceDescription"].map(SizeAdviceDescriptionCleaner)
    m["BirthDate"] = m["BirthDate"].map(age)
        
    # removing useless columns
    blacklist = ['VariantId', 'CustomerId', 'OrderNumber', 'LineItem',
                 'ProductColorId', 'BrandId', 'SupplierColor', 'OrderShipDate',
                 'ProductId', 'BillingPostalCode', 'FirstOrderDate',
                 'OrderStatusLabel', 'MinSize', 'MaxSize', 'OrderSeason',
                 'OrderCreationDate', 'SubtypeLabel', 'ProductType'
                ]
    whitelist = None
    if blacklist is not None:
        m = m.drop(blacklist, axis=1)
    if whitelist is not None:
        for col in m.columns:
            if col not in whitelist:
                m = m.drop([col], axis=1)

    print "dataframe shape:", m.shape
    log("dataframe built", t)
    return m

### Statistics

In [4]:
def returns_frequency(x, y, col, step):
    """Returns the returns frequencies for each value of a column"""
    
    counter = 0
    
    # counting occurences of each column value
    occurrences = {}
    for i, o in x.loc[::step].iterrows():
        counter += 1
        if str(o[col]) not in occurrences.keys():
            occurrences[str(o[col])] = [0., 0.]
        if y.loc[i, ["ReturnQuantityBin"]][0] == 0.0:
            occurrences[str(o[col])][0] += 1.
        else:
            occurrences[str(o[col])][1] += 1.
    
    # computing the returns frequency, stored in `recap`
    recap, values = [], []
    for val, (zeros, ones) in occurrences.items():
        values.append(((ones / (zeros + ones))))
        recap.append((val, values[-1]))
    recap.sort(key=lambda row: row[0])
    
    # computing variance and relative variance
    var = np.var(values)
    rel = var / len(values)
    
    return recap, var, rel, counter


def column_stats(x, y, col, step, verbose):
    """Computes the statistics for one column"""
    
    print "\n----- " + col + " -----"
    recap, var, rel, counter = returns_frequency(x, y, col, step)
    
    if verbose:
        for (val, freq) in recap:
            print val, "\t", freq, "returns"

    print "variance:", round(var, 5), "\tvalues count:", len(recap)
    return recap, var, rel, counter


def compute_statistics(x, y, blacklist=[], whitelist=[], step=100, verbose=False):
    ignored = []
    labels_g, labels_n = [], []
    scattering_g, scattering_n = [], []      # variances
    r_scattering_g, r_scattering_n = [], []  # variance divided by the number of differents values
    
    counter_cols = 0
    counter_rows = 0
    
    for col in x.columns:
        if col not in blacklist and x[col].dtype in ["object", "bool"]:
            counter_cols += 1
            labels_g.append(col)
            recap, var, rel, counter_rows = column_stats(x, y, col, step, verbose)
            scattering_g.append(var)
            r_scattering_g.append(rel)
        elif col not in blacklist and x[col].dtype in ["float64", "int64"]:
            counter_cols += 1
            labels_n.append(col)
            recap, var, rel, counter_rows = column_stats(x, y, col, step, verbose)
            scattering_n.append(var)
            r_scattering_n.append(rel)
            if col in whitelist:
                plot_dots([float(v[0]) for v in recap], [v[1] for v in recap], col)
        else:
            ignored.append(col)
    
    print "\n\nanalyzed", counter_cols, "columns and", counter_rows, "rows"
    print "\nignored columns:", ignored
    plot_barchart(labels_g, scattering_g, r_scattering_g, "general columns")
    plot_barchart(labels_g, scattering_g, r_scattering_g, "numerical columns")
    

def plot_barchart(labels, values_l, values_r, title):
    fig, ax = plt.subplots(figsize=(16, 10))
    ind = np.arange(len(values_l))
    width = .35
    ax.bar(ind - width/2, values_l, width)
    ax.bar(ind + width/2, values_r, width)
    ax.set_xticks(ind)
    ax.set_xticklabels(labels)
    plt.xticks(rotation=70)
    plt.title(title)
    plt.show()
    

def plot_dots(xs, ys, xlabel):
    plt.figure(figsize=(8, 5))
    plt.xlabel(xlabel)
    plt.ylabel("returns frequency")
    plt.plot(xs, ys, 'o')
    plt.show()

In [6]:
df_stats = build_df(x_train)

dataframe shape: (1067290, 34)
dataframe built	(13.09s)


In [None]:
compute_statistics(df_stats, y_train, blacklist=[], whitelist=df_stats.columns, step=100)

In [None]:
"""
TO REMOVE
def prune(x, columns, variability, threshold):
    to_remove = []
    for i, col in enumerate(columns):
        if variability[i] < threshold:
            to_remove.append(col)
    return x.drop(to_remove, axis=1)

df_train_pruned = prune(df_train, labels, variability, 0.001)
"""

### Classification

In [5]:
def mask(m):
    columns2bin = [col for col in m.columns if m[col].dtype == 'object']
    other_cols = m.drop(columns2bin, axis=1)
    new_cols = pd.get_dummies(m.loc[:, columns2bin])
    res = pd.concat([other_cols, new_cols], axis=1)
    res = res.fillna(0)
    print "new shape:", res.shape
    return res

def compute(name, clf, x1, x2, slc=100000):
    print "\n-----", name, "-----"
    clf.fit(x1.iloc[:slc], y_train.ReturnQuantityBin[:slc])
    
    predict_train = clf.predict_proba(x1.iloc[:slc])
    score_train = roc_auc_score(y_train.ReturnQuantityBin[:slc], predict_train[:, 1])
    print "train score:", score_train
    
    predict_test = clf.predict_proba(x1.iloc[slc:2 * slc])
    score_test = roc_auc_score(y_train.ReturnQuantityBin[slc:2 * slc], predict_test[:, 1])
    print "test score:", score_test
    return score_train, score_test

def compute_all(x1, x2, slc=100000):
    """Tries different classifiers and returns the best one (best test score)"""
    t = time.time()
    best_index, best_score = None, None
    
    print "train shape:\t", x1.shape, "\t", y_train.shape
    print "test shape:\t", x2.shape, "\t", y_test.shape
    
    classifiers = [("random forest", RandomForestClassifier()),
                   ("decision tree", DecisionTreeClassifier()),
                   ("logistic regression", LogisticRegression()),
                    ("DEEP",MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100, 10), random_state=1))]
    
    for i, (name, clf) in enumerate(classifiers):
        score_train, score_test = compute(name, clf, x1, x2, slc)
        if best_score is None or score_test > best_score:
            best_index, best_score = i, score_test
    
    log("\nbest classifier: " + classifiers[best_index][0], t)
    return classifiers[best_index][1]

def output(clf, x1, x2):
    t = time.time()
    y_tosubmit = clf.predict_proba(x2.loc[:, x1.columns].fillna(0))
    
    timestamp = '{0:%Y_%m_%d_%H_%M_%S}'.format(datetime.datetime.now())
    filename = "ypred_{0}.txt".format(timestamp)
    np.savetxt(filename, y_tosubmit[:,1], fmt='%f')
    
    f = open("predictions.txt", 'a')
    f.write(timestamp + '\n' + repr(clf).replace('\n          ', '') + '\n\n')
    f.close()
    
    print "shape:", y_tosubmit.shape
    log("generated output at " + filename, t)

### Computation test loop

In [6]:
df_test = build_df(x_test)

dataframe shape: (800468, 34)
dataframe built	(10.21s)


In [7]:
df_train = build_df(x_train)

dataframe shape: (1067290, 34)
dataframe built	(12.95s)


In [11]:
t = time.time()
x1 = mask(df_train)
x2 = mask(df_test)
log("applied mask", t)

new shape: (1067290, 158)
new shape: (800468, 166)
applied mask	(7.8s)


In [11]:
clf = compute_all(x1, x2)

train shape:	(1067290, 158) 	(1067290, 4)
test shape:	(800468, 166) 	(800468, 4)

----- random forest -----
train score: 0.994864576917
test score: 0.570640507496

----- decision tree -----
train score: 0.999397896888
test score: 0.533917674453

----- logistic regression -----
train score: 0.639062743106
test score: 0.632971765689

----- DEEP -----
train score: 0.651498651424
test score: 0.6356969107

best classifier: DEEP	(118.65s)


In [12]:
output(clf, x1, x2)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


shape: (800468, 2)
generated output at ypred_2018_04_15_22_43_37.txt	(19.02s)


### Principal component analysis (PCA)

Best scores reached with `n_components` at 96:
 - train score: 0.6388630967451936
 - test score:  0.6331956289779485

Above, scores are deacreasing.

**Note:** the result from `.transform()` is a Numpy array. Therefore the slicing is different.


In [None]:
from sklearn.decomposition import PCA

def try_pca(data, n_components):
    print "\n----- PCA", n_components, "-----"
    
    pca = PCA(n_components=n_components)
    pca.fit(data)
    x = pca.transform(data)
    clf = LogisticRegression()
    slc = 100000
    clf.fit(x[:slc, :], y_train.ReturnQuantityBin[:slc])
    
    predict_train = clf.predict_proba(x[:slc, :])
    score_train = roc_auc_score(y_train.ReturnQuantityBin[:slc], predict_train[:, 1])
    print "train score:", score_train
    
    predict_test = clf.predict_proba(x[slc:2 * slc, :])
    score_test = roc_auc_score(y_train.ReturnQuantityBin[slc:2 * slc], predict_test[:, 1])
    print "test score:", score_test
    
    return score_train, score_test, pca

for n in range(1, 100, 5):
    try_pca(x1, n)

### Cross validation prediction

This is an _attempt_, scores did not met expectation.

In [None]:
from sklearn.model_selection import cross_val_predict

def cross_val(name, clf, slc=100000):
    print "\n-----", name, "-----"
    predict_train = cross_val_predict(clf, x1.iloc[:slc], y_train.ReturnQuantityBin[:slc], cv=10, method='predict_proba')
    score_train = roc_auc_score(y_train.ReturnQuantityBin[:slc], predict_train[:, 1])
    print "train score:", score_train
    return score_train

cross_val("LogisticRegression", LogisticRegression())

### Dataset randomization

In [9]:
def shuffle(x, y, steps=10, slc=100000, plot=True):
    scores_train, scores_test = [], []
    best_clf, best_score = None, None
    
    z = x.copy(deep=True)
    z["ReturnQuantityBin"] = y.ReturnQuantityBin
    
    for k in range(steps):
        u = z.sample(frac=1)
        v = u.loc[:, ["ReturnQuantityBin"]]
        u = u.drop(["ReturnQuantityBin"], axis=1)
        
        #clf = LogisticRegression()
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100, 10), random_state=1)
        clf.fit(u.iloc[:slc], v.ReturnQuantityBin[:slc])
        
        predict_train = clf.predict_proba(u.iloc[:slc])
        score_train = roc_auc_score(v.ReturnQuantityBin[:slc], predict_train[:, 1])
    
        predict_test = clf.predict_proba(u.iloc[slc:2 * slc])
        score_test = roc_auc_score(v.ReturnQuantityBin[slc:2 * slc], predict_test[:, 1])
        
        if best_clf is None or score_test > best_score:
            best_clf, best_score = clf, score_test
        
        if plot:
            print "test", k, "\ttrain:", score_train, "\ttest:", score_test
        
        scores_train.append(score_train)
        scores_test.append(score_test)
    
    if plot:
        plt.figure(figsize=(16, 10))
        plt.xlabel("train score")
        plt.ylabel("test score")
        plt.plot(scores_train, scores_test, '+')
        plt.show()
    
    return scores_train, scores_test, best_clf, best_score

In [17]:
output(best_clf, x1, x2)

NameError: name 'best_clf' is not defined

In [None]:
def try_slice(plot=True, steps=10):

    slices, scores, classifiers = [], [], {}
    best_slice, best_score = None, None

    for slc in range(10000, 200001, 10000):
        sc_train, sc_test, clf, score = shuffle(x1, y_train, slc=slc, steps=steps, plot=False)
        slices.append(slc)
        scores.append(score)
        if best_slice is None or best_score < score:
            best_slice, best_score = slc, score
        classifiers[slc] = clf
        print "slice", slc, "\t", score

    if plot:
        plt.figure(figsize=(16, 10))
        plt.xlabel("slice")
        plt.ylabel("score")
        plt.plot(slices, scores, '-o')
        plt.show()
    
    return classifiers[best_slice]

best_clf = try_slice()

slice 10000 	0.658323403452
slice 20000 	0.659244885548
slice 30000 	0.658527744854
slice 40000 	0.659320759677
slice 50000 	0.661305405909
slice 60000 	0.662496890081
slice 70000 	0.659767449335
slice 80000 	0.659044412524
slice 90000 	0.66011393161
slice 100000 	0.658210195611
slice 110000 	0.658148778931


__Some interpretation__

Tests showed that the slice does not really impact the score. However, the more data are processed the more stabilized reults get. One then may select let's say `50000` as a default slice and then perform much more tests to try to, by chance, find a good one.

In [15]:
sc_train, sc_test, clf, score = shuffle(x1, y_train, slc=50000, steps=10, plot=False)

In [16]:
output(clf, x1, x2)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


shape: (800468, 2)
generated output at ypred_2018_04_16_03_41_32.txt	(22.94s)


In [None]:
from sklearn.model_selection import cross_val_score

for max_depth in range(1, 201, 5):
    clf = RandomForestClassifier(max_depth=max_depth)
    score = cross_val_score(clf, x1, y_train, cv=10)
    print score =

clf = svm.SVC(kernel='linear', C=1)
>>> scores = cross_val_score(clf, iris.data, iris.target, cv=5)
>>> scores                                              
array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])