In [46]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
from copy import deepcopy
from numpy.random import randint
import random
import itertools 
from operator import itemgetter

from sklearn.dummy import DummyClassifier
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

import sys
sys.path.insert(0,'..')

from vf_portalytics.model import PredictionModel

In [83]:
def make_dataset(random_state, n_informative, collumn_names, **kwargs):
    """Build a synthetic regression dataset for one group of rows.

    The features come from ``sklearn.datasets.make_regression`` (1000 samples,
    one feature per entry of ``collumn_names``). The dataset generated with
    ``random_state == 1`` is noise free with a bias of 10; every other one uses
    a noise of 10 and a bias of 1000.

    The columns ``yearweek`` and ``original_product_dimension_44`` are then
    overwritten (or appended when they are not part of ``collumn_names``) with
    random integers, so they carry no regression signal.

    Parameters
    ----------
    random_state : int
        Seed for ``make_regression`` and for the random week / pack type
        columns, so the same seed always yields the same dataset.
    n_informative : int
        Requested number of informative features, capped at the number of
        generated features.
    collumn_names : sequence of str
        Names given to the generated feature columns.
    **kwargs
        Extra constant columns added to the frame, e.g. ``account_banner='A'``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame and the regression target.
    """
    collumn_names = list(collumn_names)
    n_features = len(collumn_names)

    x, y = make_regression(
        n_samples=1000,
        n_features=n_features,
        noise=0 if random_state == 1 else 10,
        bias=10 if random_state == 1 else 1000,
        n_informative=min(n_informative, n_features),
        random_state=random_state,
    )
    x = pd.DataFrame(x, columns=collumn_names).assign(**kwargs)

    # A dedicated, seeded generator keeps these columns reproducible; the global
    # `random` / `numpy.random` state used previously was never seeded.
    rng = np.random.RandomState(random_state)
    x['yearweek'] = rng.randint(1, 54, len(x))
    # pack_type (original_product_dimension_44) 0: 'Can', 1: 'Bottle'
    x['original_product_dimension_44'] = rng.randint(0, 2, len(x))

    return x, pd.Series(y)

def make_dict(pack_types=(0,), weeks=range(1, 54), seed=None):
    """Create a random lookup dictionary keyed by ``(week, pack_type)``.

    By default only pack type 0 is covered (weeks 1..53), so every pair with
    another pack type is absent from the dictionary; callers that look keys up
    with ``dict.get(key, 0)`` therefore get 0 for those pairs.

    Parameters
    ----------
    pack_types : iterable of int, optional
        Pack types to generate keys for. Defaults to ``(0,)``.
    weeks : iterable of int, optional
        Week numbers to generate keys for. Defaults to ``range(1, 54)``.
    seed : int or None, optional
        When given, values are drawn from a private ``random.Random(seed)`` so
        the result is reproducible. When ``None`` the module-level ``random``
        state is used.

    Returns
    -------
    dict
        Maps every ``(week, pack_type)`` pair to a value drawn (with
        replacement) from 500 evenly spaced points in ``[-2.5, 2.5]``.
    """
    keys = list(itertools.product(weeks, pack_types))
    candidates = np.linspace(-2.5, 2.5, num=500)
    rng = random if seed is None else random.Random(seed)
    values = rng.choices(candidates, k=len(keys))
    return dict(zip(keys, values))

## Generate data and lookup dictionary

In [84]:
# Seed both global generators so the randomly drawn values (and therefore the
# lookup dictionary) are the same on every "Restart & Run All".
SEED = 5
random.seed(SEED)
np.random.seed(SEED)

collumn_names = ['promoted_price', 'consumer_length',
                 'yearweek', 'original_product_dimension_44', 'product_volume_per_sku']

x1, y1 = make_dataset(1, 5, collumn_names, account_banner='A', product_desc='X')
x2, y2 = make_dataset(2, 3, collumn_names, account_banner='B', product_desc='Y')
# create one more group that has no dedicated sub-model, to exercise the estimator's fallback
x3, y3 = make_dataset(3, 1, collumn_names, account_banner='C', product_desc='Z')

# combine into one dataset; ignore_index=True already yields a fresh 0..n-1 index
total_x = pd.concat([x1, x2, x3], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3], axis=0, ignore_index=True)
# Split into train and test (by index label, so x and y stay aligned)
train_index, test_index = train_test_split(total_x.index, random_state=SEED)
train_x, train_y = total_x.loc[train_index, :], total_y.loc[train_index]
test_x, test_y = total_x.loc[test_index, :], total_y.loc[test_index]

# create dictionary "predicted_market_volumes" - "lookup_dict"
lookup_dict = make_dict()

In [85]:
# Display the training features to check the generated and constant group columns
train_x

Unnamed: 0,promoted_price,consumer_length,yearweek,original_product_dimension_44,product_volume_per_sku,account_banner,product_desc
2528,1.240198,-0.588810,1,1,-0.258654,C,Z
2828,-1.721130,-0.633242,23,1,-0.251092,C,Z
2137,-2.306269,1.947770,39,0,0.895523,C,Z
2637,-0.303963,-0.400043,8,1,-0.559406,C,Z
135,1.529248,0.686483,42,1,-0.081570,A,X
...,...,...,...,...,...,...,...
2121,-0.504628,0.043220,18,0,-1.403318,C,Z
1424,-0.722067,0.466792,31,0,-3.326870,B,Y
1725,-0.389445,-0.658218,18,0,1.398478,B,Y
2254,0.350929,-0.856347,48,0,-0.523437,C,Z


In [86]:
# Display the (week, pack_type) -> value lookup dictionary created above
lookup_dict

{(1, 0): -1.9188376753507015,
 (2, 0): 0.4759519038076152,
 (3, 0): 0.9669338677354706,
 (4, 0): -2.029058116232465,
 (5, 0): 2.129258517034068,
 (6, 0): 2.18937875751503,
 (7, 0): 0.7264529058116231,
 (8, 0): -1.1372745490981964,
 (9, 0): 1.6282565130260522,
 (10, 0): 0.3156312625250499,
 (11, 0): -2.18937875751503,
 (12, 0): 2.0090180360721437,
 (13, 0): -0.9168336673346693,
 (14, 0): 1.3376753507014025,
 (15, 0): 1.2575150300601199,
 (16, 0): -0.5060120240480963,
 (17, 0): -0.8166332665330662,
 (18, 0): 2.4699398797595187,
 (19, 0): 0.5961923847695392,
 (20, 0): -0.1653306613226455,
 (21, 0): 0.6763527054108214,
 (22, 0): -1.2274549098196392,
 (23, 0): -0.4258517034068139,
 (24, 0): 0.876753507014028,
 (25, 0): -0.1753507014028055,
 (26, 0): -0.9769539078156313,
 (27, 0): -0.3557114228456917,
 (28, 0): 0.6663326653306614,
 (29, 0): -1.0671342685370742,
 (30, 0): 0.6763527054108214,
 (31, 0): -0.8166332665330662,
 (32, 0): 0.22545090180360727,
 (33, 0): -2.299599198396794,
 (34, 0): 

In [87]:
import pandas as pd
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin


class FeatureSubsetTransform(BaseEstimator, TransformerMixin):
    """Append transformed versions of the ``promoted_price`` column to a frame.

    The wrapped ``transformer`` is fitted on, and applied to, the
    ``promoted_price`` column only. The columns named in ``group_cols`` are
    left out of the transformation and re-attached to the result unchanged.
    """

    def __init__(self, group_cols=None, transformer=None):
        """Build a feature transformer.

        Parameters
        ----------
        group_cols : sequence of str, optional
            Columns that identify the group of a row.
        transformer : object, optional
            Transformer with ``fit``/``transform`` and either
            ``get_feature_names_out`` or ``get_feature_names``
            (e.g. ``PolynomialFeatures``).
        """
        self.transformer = transformer
        self.group_cols = group_cols

    def _group_col_list(self):
        """Return the grouping columns as a list (empty when none were given)."""
        return list(self.group_cols or ())

    def _price_feature_names(self):
        """Return the output column names the transformer produces for the price column."""
        # get_feature_names was removed from scikit-learn in favour of
        # get_feature_names_out; support both so older pickles keep working.
        if hasattr(self.transformer, 'get_feature_names_out'):
            names = self.transformer.get_feature_names_out(['promoted_price'])
        else:
            names = self.transformer.get_feature_names(['promoted_price'])
        return list(names)

    def fit(self, X, y=None):
        """Drop the columns that are being used to group the data and fit the transformer"""
        x_in = X.drop(columns=self._group_col_list())
        self.transformer = self.transformer.fit(X=x_in[['promoted_price']])
        return self

    def transform(self, X):
        """Return ``X`` with the extra price features appended.

        The output holds the non-group columns of ``X``, then the transformed
        price columns (without the bias column ``'1'`` and without the
        duplicate of ``promoted_price`` itself), then the group columns.
        ``X`` itself is not modified.
        """
        group_cols = self._group_col_list()
        x_in = X.drop(columns=group_cols)

        # transform the promoted_price column and restore index / column labels
        transformed_price = pd.DataFrame(
            data=self.transformer.transform(X=x_in[['promoted_price']]),
            index=x_in.index,
            columns=self._price_feature_names(),
        )
        # errors='ignore': a transformer configured without a bias term has no '1' column
        transformed_price = transformed_price.drop(columns=['1', 'promoted_price'], errors='ignore')

        transformed_x = pd.concat([x_in, transformed_price], axis=1)
        if group_cols:
            transformed_x[group_cols] = X[group_cols]
        return transformed_x


class FeatureSubsetModel(BaseEstimator, RegressorMixin):
    """Regression model that delegates to one sub-model per group of rows.

    Rows are grouped by the value combination of ``group_cols``. Each group is
    fitted / predicted by the estimator stored under that key in
    ``sub_models``; groups without an entry fall back to a constant-zero
    regressor. Sub-models only see the columns whose name starts with
    ``'promoted_price'``.
    """

    def __init__(self, lookup_dict=None, group_cols=None, sub_models=None):
        """
        Build regression model for subsets of feature rows matching particular combination of feature columns.

        Parameters
        ----------
        lookup_dict : dict, optional
            Maps ``(yearweek, original_product_dimension_44)`` to the predicted
            market volume; missing keys count as 0.
        group_cols : sequence of str, optional
            Columns whose value combination identifies a group.
        sub_models : dict, optional
            Maps a group key to the estimator used for that group. ``fit``
            stores the fitted estimators back into this dictionary.
        """
        self.lookup_dict = lookup_dict
        self.group_cols = group_cols
        self.sub_models = sub_models

    @staticmethod
    def _default_model():
        """Return the fallback estimator for groups without a sub-model: always predicts 0."""
        # DummyClassifier(constant=0) only honours `constant` with
        # strategy='constant', so it did not predict 0; a constant regressor does.
        from sklearn.dummy import DummyRegressor
        return DummyRegressor(strategy='constant', constant=0)

    @staticmethod
    def _price_features(frame):
        """Return the columns of ``frame`` whose name starts with 'promoted_price'."""
        return frame[[col for col in frame if col.startswith('promoted_price')]]

    def _with_market_volume(self, X):
        """Return a copy of ``X`` with a 'predicted_market_volume' column from the lookup dictionary."""
        lookup = self.lookup_dict or {}
        volumes = [
            lookup.get(key, 0)
            for key in zip(X['yearweek'], X['original_product_dimension_44'])
        ]
        # assign() copies, so the caller's frame is left untouched
        return X.assign(predicted_market_volume=volumes)

    def fit(self, X, y=None):
        """
        Partition the training data, X, into groups for each unique combination of values in
        ``self.group_cols`` columns. For each group, train the appropriate model specified in
        ``self.sub_models``.

        Only the price related columns are used for fitting; ``X`` is not
        modified. ``y`` must be indexed like ``X``.
        """
        if self.sub_models is None:
            self.sub_models = {}
        group_cols = list(self.group_cols)

        for gp_key, x_group in X.groupby(by=group_cols):
            # Find the sub-model for this group key
            gp_model = self.sub_models.get(gp_key)
            if gp_model is None:
                gp_model = self._default_model()

            # Drop the feature values for the group columns, since these are same for all rows
            # and so don't contribute anything into the prediction.
            x_in = x_group.drop(columns=group_cols)
            y_in = y.loc[x_in.index]

            # Fit the submodel with subset of rows and only columns related to price
            self.sub_models[gp_key] = gp_model.fit(X=self._price_features(x_in), y=y_in.values)
        return self

    def predict(self, X, y=None):
        """
        Predict every group with its sub-model and combine the results.

        Per row the result is ``market_share * predicted_market_volume *
        consumer_length / product_volume_per_sku``, clipped below at 0, where
        ``market_share`` is the sub-model prediction (0 for groups that have no
        stored sub-model).

        Returns a ``pandas.Series`` indexed like ``X``. When the index of ``X``
        is unique, the rows are returned in the order of ``X``; rows that
        ``groupby`` leaves out (missing group values) are then NaN.
        """
        # work on a copy that carries the looked-up market volume
        X = self._with_market_volume(X)
        group_cols = list(self.group_cols)
        sub_models = self.sub_models or {}
        results = []

        for gp_key, x_group in X.groupby(by=group_cols):
            x_in = x_group.drop(columns=group_cols)
            gp_model = sub_models.get(gp_key)

            if gp_model is None:
                # Group never seen during fit: there is no fitted estimator to call
                predicted_market_share = pd.Series(0.0, index=x_in.index)
            else:
                # predict market share only using price related data
                predicted_market_share = pd.Series(
                    data=gp_model.predict(X=self._price_features(x_in)),
                    index=x_in.index,
                )

            results.append(
                predicted_market_share
                .mul(x_group['predicted_market_volume'])
                .mul(x_group['consumer_length'])
                .div(x_group['product_volume_per_sku'])
                .clip(lower=0)
            )

        if not results:
            # pd.concat refuses an empty list
            return pd.Series(index=X.index, dtype=float)

        predictions = pd.concat(results, axis=0)
        if X.index.is_unique:
            # groupby yields rows group by group; restore the caller's row order
            # so positional comparisons against y are valid
            predictions = predictions.reindex(X.index)
        return predictions

# Create pipeline

In [88]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


# Columns whose value combination identifies a group of rows
subset_cols = ('account_banner', 'product_desc')

# One estimator per known group; groups without an entry use the estimator's fallback
sub_models = dict([
    (('A', 'X'), LinearRegression()),
    (('B', 'Y'), DecisionTreeRegressor()),
])

# Step 1 adds degree-2 polynomial price features, step 2 predicts group by group
pipeline = Pipeline(steps=[
    (
        'transform',
        FeatureSubsetTransform(group_cols=subset_cols, transformer=PolynomialFeatures(2)),
    ),
    (
        'estimate',
        FeatureSubsetModel(lookup_dict=lookup_dict, group_cols=subset_cols, sub_models=sub_models),
    ),
])


# Create VF Model Wrapper and Save pipeline

In [89]:
# one_hot_encode=False is required: it prevents the wrapper from one-hot encoding
# the categorical features in the input data
model_wrapper = PredictionModel("my_test_model", path='/tmp', one_hot_encode=False)
model_wrapper.model = pipeline

# Save the feature names (not strictly required, since all preprocessing is
# done inside the pipeline). Every feature maps to its own empty list.
model_wrapper.features = {
    feature_name: []
    for feature_name in (
        # grouping features
        'account_banner',
        'product_desc',
        # other features
        'promoted_price',
        'consumer_length',
        'yearweek',
        'original_product_dimension_44',
        'product_volume_per_sku',
    )
}
model_wrapper.target = {'target': []}
model_wrapper.ordered_column_list = sorted(model_wrapper.features)

# Train the whole pipeline, then persist the wrapper
model_wrapper.model.fit(train_x, train_y)
model_wrapper.save()



# Load Pre-Saved Model 

In [90]:
# Re-create the wrapper from the files saved under /tmp.
# Don't specify one_hot_encode here because it will be looked up from the pickle file
saved_model = PredictionModel('my_test_model', path='/tmp')
# Show the restored pipeline to confirm that both steps were loaded
saved_model.model

Pipeline(memory=None,
         steps=[('transform',
                 FeatureSubsetTransform(group_cols=('account_banner',
                                                    'product_desc'),
                                        transformer=PolynomialFeatures(degree=2,
                                                                       include_bias=True,
                                                                       interaction_only=False,
                                                                       order='C'))),
                ('estimate',
                 FeatureSubsetModel(group_cols=('account_banner',
                                                'product_desc'),
                                    lookup_dict={(1, 0): -1.9188376753507015,
                                                 (2, 0): 0.4759519038076152,
                                                 (3, 0): 0...
                                                ('B', 'Y'): DecisionTreeRegressor

# Test the results

In [91]:
# Check on the first group only that the pipeline does what we expect.
# groupby sorts its keys, so the first group is the smallest key combination.
# Take explicit copies: these frames are modified further down, and a copy
# avoids writing into a slice of the full dataset.
groups = train_x.groupby(by=list(subset_cols))
_, train_x = next(iter(groups))
train_x = train_x.copy()

groups = test_x.groupby(by=list(subset_cols))
_, test_x = next(iter(groups))
test_x = test_x.copy()

# keep the targets aligned with the selected rows
train_y = train_y.loc[train_x.index]
test_y = test_y.loc[test_x.index]

In [92]:
# predict with the pipeline restored from disk; test_x still carries the group
# columns here, which the pipeline needs to pick the sub-model
pipeline_predicted = saved_model.model.predict(test_x)

In [93]:
# drop the columns that declare the group since we use only one group for the test.
# Rebinding (instead of inplace=True) never writes into a slice, and
# errors='ignore' keeps the cell harmless when it is run a second time.
test_x = test_x.drop(columns=list(subset_cols), errors='ignore')
train_x = train_x.drop(columns=list(subset_cols), errors='ignore')

In [94]:
# Degree-2 polynomial expansion of the price column, fitted on the training rows
transformer = PolynomialFeatures(degree=2).fit(train_x[['promoted_price']])

def transform_data(data):
    """Return ``data`` with the extra polynomial price features appended.

    Applies the module-level ``transformer`` to the ``promoted_price`` column,
    drops the bias column ``'1'`` and the duplicate ``promoted_price`` column
    from its output, and concatenates the remaining columns to ``data``.
    ``data`` itself is not modified.
    """
    # The transformer was fitted on the price column alone, so that is the only
    # input feature name it may be given (passing all of data.columns worked
    # only while promoted_price happened to be the first column).
    if hasattr(transformer, 'get_feature_names_out'):
        feature_names = transformer.get_feature_names_out(['promoted_price'])
    else:
        # scikit-learn releases that predate get_feature_names_out
        feature_names = transformer.get_feature_names(['promoted_price'])

    transformed_price = pd.DataFrame(
        data=transformer.transform(data[['promoted_price']]),
        index=data.index,
        columns=list(feature_names),
    )
    transformed_price = transformed_price.drop(columns=['1', 'promoted_price'])
    return pd.concat([data, transformed_price], axis=1)
# Expand the price column of both splits with the same fitted transformer
train_transformed, test_transformed = map(transform_data, (train_x, test_x))

# Names of the raw price column and of every feature derived from it
price_collumns = list(
    filter(lambda name: name.startswith('promoted_price'), test_transformed.columns)
)

In [95]:
# Fit a plain linear model on the price related columns only ...
model = LinearRegression()
model.fit(train_transformed[price_collumns], train_y)

# ... and predict the market share for the test rows, keeping their index
predicted_market_share = pd.Series(
    model.predict(test_transformed[price_collumns]),
    index=test_transformed.index,
)

In [96]:
# Look up the market volume of every test row; unknown (week, pack type) pairs count as 0
test_x['predicted_market_volume'] = [
    lookup_dict.get(week_and_pack, 0)
    for week_and_pack in zip(test_x['yearweek'], test_x['original_product_dimension_44'])
]

# Same formula as the pipeline's estimator: share * volume * length / volume per sku, floored at 0
directly_predicted = (
    predicted_market_share
    * test_x['predicted_market_volume']
    * test_x['consumer_length']
    / test_x['product_volume_per_sku']
).clip(lower=0)

KeyError: (2, 1)

In [97]:
# Put both predictions side by side (aligned on the row index) to compare them
pd.DataFrame({'directly_predicted': directly_predicted, 'pipeline_predicted': pipeline_predicted})

Unnamed: 0,directly_predicted,pipeline_predicted
602,0.000000,0.000000
347,33.441769,0.000000
194,0.000000,-0.000000
791,0.000000,0.409094
795,0.000000,-0.000000
...,...,...
444,116.559406,0.000000
280,0.000000,0.000000
342,50.614277,0.000000
601,9.828138,1.188017
