# Model to forecast inventory demand based on historical sales data. 

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import time
import random
import pickle

## Model accuracy is RMSLE

In [2]:
def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))

## Load Training Data 
The size of the training data is quite large (~4 GB). Large datasets require significant amount of memory to process. Instead, we will sample the data randomly for our initial data analysis and visualization. 

In [None]:
def load_samp_data(filename='train.csv', columns=[], load_pkl=1):
    """ 
      Function returns a dataframe containing the training data sampled randomly. 
      The data is also stored in a pickle file for later processing.
    """
    if load_pkl:
        inputfile = open('train_samp_data.pkl', 'rb')
        data = pickle.load(inputfile)
        inputfile.close()
        return data
    
    chunksize= 10 ** 6
    datasize = 74180464 #datasize = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
    samplesize = 3*10 ** 4 # samples per chunk of data read from the file.
    
    data = pd.DataFrame([],columns=columns)
    chunks = pd.read_csv(filename, iterator=True, chunksize=chunksize)
    for chunk in chunks:
        chunk.columns = columns
        data = data.append(chunk.sample(samplesize)) 
    
    # write data to a pickle file.
    outputfile = open('train_samp_data.pkl','wb')
    pickle.dump(data,outputfile)
    outputfile.close()
    
    return data
 
load_pkl = 0
columns = ['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id', 'saleunit_curr_wk', 'saleamt_curr_wk', 'retunit_next_week', 'retamt_next_wk', 'y_pred_demand']
tic = time.time()
train_data_samp = load_samp_data('train.csv', columns, load_pkl)
toc = time.time()
print '**'
print 'Time to load: ', toc-tic, 'sec'
print 
print train_data_samp.describe()
print '**'
print train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']]

## Preliminary evaluation using Linear Regression

In [None]:
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
features_train = train_data_samp[['week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client_id', 'prod_id']].values
labels_train = train_data_samp[['y_pred_demand']].values

# Split the data samples into train and test.
X_train, X_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.33, random_state=42)

# Linear regression
tic = time.time()
clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
pred[pred<0] = 0
tac = time.time()
print '----------'
print 'Time:', tac-tic, 'RMSLE (LinearRegression):', rmsle(pred, y_test)
print '----------'


## Preliminary evaluation using gradient boosting (xgboost)

In [7]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [8]:

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import RandomizedSearchCV

from scipy.stats import randint as sp_randint
from operator import itemgetter





clf = RandomForestClassifier(n_estimators=30)
# specify parameters and distributions to sample from
param_dist = {"max_depth": [10, None],
              "max_features": sp_randint(1, 6),
              "min_samples_split": sp_randint(1, 6),
              "min_samples_leaf": sp_randint(1, 6),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, n_jobs=2)
start = time.time()
random_search.fit(X_train, np.ravel(y_train))

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.grid_scores_)
random_search.best_score_ 
pred = random_search.predict(X_test)
pred[pred<0] = 0
print 'Time:', tac-tic, 'RMSLE (RF):', rmsle(pred, np.ravel(y_test))
print '----------'

Time: 0.00601291656494 RMSLE (LinearRegression): 0.935833411032
----------
RandomizedSearchCV took 1190.15 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.237 (std: 0.003)
Parameters: {'bootstrap': False, 'min_samples_leaf': 3, 'min_samples_split': 5, 'criterion': 'entropy', 'max_features': 5, 'max_depth': 10}

Model with rank: 2
Mean validation score: 0.236 (std: 0.002)
Parameters: {'bootstrap': False, 'min_samples_leaf': 2, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': 5, 'max_depth': 10}

Model with rank: 3
Mean validation score: 0.236 (std: 0.003)
Parameters: {'bootstrap': True, 'min_samples_leaf': 5, 'min_samples_split': 3, 'criterion': 'gini', 'max_features': 5, 'max_depth': 10}

Time: 0.00601291656494 RMSLE (RF): 0.786554235216
----------


In [None]:
clientnameid_data = pd.read_csv('cliente_tabla.csv')
townstate_data = pd.read_csv('town_state.csv')
print clientnameid_data.head()
print '----'
print townstate_data.head()

## Data Cleaning
There are duplicate client ids in cliente_table, which means one client id may have multiple client name that are very similar. We will cluster them based on a hash function and use a clustering algorithm to evaluate similarity.  

In [None]:
import re
def hash_eval(s):
    hash_base = 4
    s = re.sub('[., ]', '', s)
    seqlen = len(s)
    n = seqlen - 1
    h = 0
    for c in s:
        h += ord(c) * (hash_base ** n)
        n -= 1
    curhash = h
    return curhash

# In the client table, same clients are assigned different client ID. We create a new client table where clients are assigned unique ID. 
clientid_hash = dict()
new_client_id = [-1]   
for idx, s in enumerate(clientnameid_data.NombreCliente):
    t = hash_eval(s)
    clientid_hash.setdefault(t, []).append(clientnameid_data.Cliente_ID[idx])
    if t in clientid_hash:
        a = clientid_hash[t]
        new_client_id.append(a[0])

# In the agency table, same agencies (town, state) are assigned different agency ID. We create a new agency table where agencies (town, state) are assigned unique ID. 
agencyid_hash = dict()
new_agency_id = [-1]   
for idx, s in enumerate(townstate_data.Town+townstate_data.State):
    t = hash_eval(s)
    agencyid_hash.setdefault(t, []).append(townstate_data.Agencia_ID[idx])
    if t in agencyid_hash:
        a = agencyid_hash[t]
        new_agency_id.append(a[0])


In [None]:
clientnameid_data['New_Cliente_ID'] = new_client_id[1:]
townstate_data['New_Agencia_ID'] = new_agency_id[1:]

In [None]:
print clientnameid_data.head(10)
print '---'
print townstate_data.head()
print '---'
print train_data_samp.head(10)


In [None]:
print train_data_samp.head(10)
print '------'
for idx, cid in enumerate(train_data_samp.client_id):
    train_data_samp.client_id.values[idx] = clientnameid_data.New_Cliente_ID[train_data_samp.client_id.values[idx] == clientnameid_data.Cliente_ID.values].values[0]
    train_data_samp.sales_depot_id.values[idx] = townstate_data.New_Agencia_ID[train_data_samp.sales_depot_id.values[idx] == townstate_data.Agencia_ID.values].values[0]
print '-----'
print train_data_samp.head()


## Load Test Data

In [None]:
test_data = pd.read_csv('test.csv')
test_data.columns = ['id', 'week_num', 'sales_depot_id', 'sales_chan_id', 'route_id', 'client id', 'prod_id']
test_labels = pd.read_csv('sample_submission.csv')
test_data = test_data.drop('id', 1)
print test_data.head()

In [None]:
g = sns.PairGrid(data_t)
g.map(plt.scatter)

In [None]:
a = [[1, 2, 3, 4]]

In [None]:
print a

In [None]:
np.array(a)

In [None]:
print np.array(a)

In [None]:
a = np.array(a)

In [None]:
a

In [None]:
print a.reshape(-1,)