In [145]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
from copy import deepcopy
from numpy.random import randint
import random
import itertools 

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

import sys
sys.path.insert(0,'..')

from vf_portalytics.model import PredictionModel

In [146]:
def make_dataset(random_state, n_informative, collumn_names, **kwargs):
    """Build a synthetic regression dataset as a ``(features, target)`` pair.

    The numeric features come from ``sklearn.datasets.make_regression`` (1000
    samples, one feature per entry of ``collumn_names``). The ``yearweek`` and
    ``pack_type`` columns are then (over)written with random integer codes.

    Parameters
    ----------
    random_state : int or None
        Seed passed to ``make_regression``. The value ``1`` selects a
        noise-free dataset with bias 10; any other value uses noise 10 and
        bias 1000. The same seed also drives the ``yearweek`` / ``pack_type``
        draws, so the whole dataset is reproducible when a seed is given.
    n_informative : int
        Requested number of informative features; capped at the number of
        feature columns.
    collumn_names : iterable of str
        Names assigned to the generated feature columns.
    **kwargs
        Extra constant columns added through ``DataFrame.assign``
        (e.g. ``account_banner='A'``).

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame and the regression target.
    """
    feature_names = list(collumn_names)
    n_features = len(feature_names)

    features, target = make_regression(
        n_samples=1000,
        n_features=n_features,
        noise=0 if random_state == 1 else 10,
        bias=10 if random_state == 1 else 1000,
        n_informative=min(n_informative, n_features),
        random_state=random_state,
    )
    x = pd.DataFrame(features, columns=feature_names).assign(**kwargs)

    # Use a generator seeded from ``random_state`` instead of the global RNGs so
    # that re-running the cell reproduces the same data. The extra seed word
    # keeps this stream distinct from the one make_regression builds from the
    # bare seed.
    rng = np.random.RandomState(None if random_state is None else [random_state, 1])
    x['yearweek'] = rng.randint(0, 53, len(x))
    # pack_type 0: 'Can', 1: 'Bottle'
    x['pack_type'] = rng.choice([0, 1], size=len(x))

    return x, pd.Series(target)

def make_dict(random_state=None):
    """Create a random lookup table keyed by ``(yearweek, pack_type)``.

    Keys are all combinations of the weeks ``0..52`` and the pack types
    ``0`` and ``1``. Each value is drawn (with replacement) from 500 evenly
    spaced numbers between -2.5 and 2.5.

    Parameters
    ----------
    random_state : int or None, optional
        Seed for the draw. ``None`` (the default) uses the global ``random``
        module state, as before; an integer makes the table reproducible.

    Returns
    -------
    dict
        Mapping ``(yearweek, pack_type) -> value``.
    """
    rng = random if random_state is None else random.Random(random_state)
    keys = list(itertools.product(range(53), (0, 1)))
    candidates = np.linspace(-2.5, 2.5, num=500)
    values = rng.choices(candidates, k=len(keys))
    return dict(zip(keys, values))

In [147]:
# Names given to the five generated feature columns
collumn_names = [
    'price',
    'promo_week_length',
    'yearweek',
    'pack_type',
    'vol_per_sku',
]

# One synthetic dataset per (account_banner, product_desc) group
x1, y1 = make_dataset(
    random_state=1, n_informative=5, collumn_names=collumn_names,
    account_banner='A', product_desc='X',
)
x2, y2 = make_dataset(
    random_state=2, n_informative=3, collumn_names=collumn_names,
    account_banner='B', product_desc='Y',
)

# Stack both groups; ignore_index=True already yields a fresh 0..n-1 index
total_x = pd.concat([x1, x2], ignore_index=True)
total_y = pd.concat([y1, y2], ignore_index=True)

# Split the row labels, then select the matching rows of features and target
train_index, test_index = train_test_split(total_x.index, random_state=5)
train_x = total_x.loc[train_index]
train_y = total_y.loc[train_index]
test_x = total_x.loc[test_index]
test_y = total_y.loc[test_index]

# Lookup table ("predicted_market_volumes") keyed by (yearweek, pack_type)
lookup_dict = make_dict()

In [227]:
import pandas as pd
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin


class FeatureSubsetTransform(BaseEstimator, TransformerMixin):
    """Apply a transformer to every column except the grouping columns.

    The grouping columns are removed before the wrapped transformer sees the
    data and are appended unchanged to the transformed output.
    """

    def __init__(self, group_cols=None, transformer=None):
        """Build a feature transformer.

        Parameters
        ----------
        group_cols : sequence of str, optional
            Columns that identify the group of a row; they bypass the
            transformer.
        transformer : object
            Transformer exposing ``fit``, ``transform`` and either
            ``get_feature_names_out`` or ``get_feature_names``.
        """
        self.transformer = transformer
        self.group_cols = group_cols

    def _group_col_list(self):
        """Return the grouping columns as a list (empty when none are set)."""
        return list(self.group_cols or ())

    def _feature_names(self, input_columns):
        """Return the output column names of the fitted transformer."""
        # scikit-learn 1.2 removed ``get_feature_names``; prefer the replacement
        # and fall back to the old name for older installations.
        if hasattr(self.transformer, 'get_feature_names_out'):
            return self.transformer.get_feature_names_out(input_columns)
        return self.transformer.get_feature_names(input_columns)

    def fit(self, X, y=None):
        """Drop the columns that are being used to group the data and fit the transformer."""
        x_in = X.drop(columns=self._group_col_list())
        self.transformer = self.transformer.fit(X=x_in)
        return self

    def transform(self, X):
        """Transform the non-group columns and re-attach the group columns.

        Returns a DataFrame that keeps the index of ``X``; its columns are the
        transformer's output features followed by the grouping columns.
        """
        group_cols = self._group_col_list()
        x_in = X.drop(columns=group_cols)
        # transform the data
        transformed = self.transformer.transform(X=x_in)
        # convert data back into a labelled frame aligned with the input rows
        transformed_x = pd.DataFrame(
            data=transformed,
            index=x_in.index,
            columns=self._feature_names(x_in.columns),
        )
        if group_cols:
            transformed_x[group_cols] = X[group_cols]
        return transformed_x


class FeatureSubsetModel(BaseEstimator, RegressorMixin):
    """Regression model that delegates to one sub-model per group of rows."""

    def __init__(self, lookup_dict=None, group_cols=None, sub_models=None):
        """
        Build regression model for subsets of feature rows matching particular combination of feature columns.

        Parameters
        ----------
        lookup_dict : dict
            Mapping ``(yearweek, pack_type) -> predicted market volume``.
        group_cols : sequence of str
            Columns whose value combination selects the sub-model.
        sub_models : dict
            Mapping group key -> estimator with ``fit`` and ``predict``.
        """
        self.lookup_dict = lookup_dict
        self.group_cols = group_cols
        self.sub_models = sub_models

    def _with_market_volume(self, X):
        """Return a copy of ``X`` with a ``predicted_market_volume`` column.

        Each row's value is looked up in ``self.lookup_dict`` under the key
        ``(yearweek, pack_type)``. The caller's frame is left untouched.
        """
        # A plain per-row lookup works for any number of rows, including zero
        # and one, and needs no extra import.
        keys = zip(X['yearweek'], X['pack_type'])
        volumes = [self.lookup_dict[key] for key in keys]
        return X.assign(predicted_market_volume=volumes)

    def fit(self, X, y=None):
        """
        Partition the training data, X, into groups for each unique combination of values in
        ``self.group_cols`` columns. For each group, train the appropriate model specified in
        ``self.sub_models``.

        ``y`` must support ``.loc`` with the index labels of ``X``. Raises
        ``KeyError`` when a group has no entry in ``self.sub_models`` or a
        ``(yearweek, pack_type)`` pair is missing from ``self.lookup_dict``.
        """
        X = self._with_market_volume(X)
        group_cols = list(self.group_cols)

        for gp_key, x_group in X.groupby(by=group_cols):
            # Find the sub-model for this group key
            gp_model = self.sub_models[gp_key]

            # Drop the feature values for the group columns, since these are same for all rows
            # and so don't contribute anything into the prediction.
            x_in = x_group.drop(columns=group_cols)
            y_in = y.loc[x_in.index]

            # Fit the submodel with subset of rows
            self.sub_models[gp_key] = gp_model.fit(X=x_in.values, y=y_in.values)
        return self

    def predict(self, X, y=None):
        """
        Same as ``self.fit()``, but call the ``predict()`` method for each submodel and return the results.

        Each sub-model output is multiplied by ``predicted_market_volume`` and
        ``promo_week_length``, divided by ``vol_per_sku`` and clipped at 0.

        Returns a ``pandas.Series`` indexed like ``X``. When the index of ``X``
        is unique the rows are returned in the same order as ``X``, so the
        result can be compared positionally with the true target.
        """
        X = self._with_market_volume(X)
        group_cols = list(self.group_cols)
        results = []

        for gp_key, x_group in X.groupby(by=group_cols):
            gp_model = self.sub_models[gp_key]
            x_in = x_group.drop(columns=group_cols)

            predicted_market_share = pd.Series(
                index=x_in.index, data=gp_model.predict(X=x_in.values))

            result = predicted_market_share.mul(
                x_group['predicted_market_volume']).mul(
                x_group['promo_week_length']).div(
                x_group['vol_per_sku']).clip(lower=0)

            results.append(result)

        if not results:
            # No rows means no groups; pd.concat would raise on an empty list.
            return pd.Series(index=X.index, dtype=float)

        predictions = pd.concat(results, axis=0)
        if X.index.is_unique:
            # groupby emits rows group by group; restore the caller's row order.
            predictions = predictions.reindex(X.index)
        return predictions

In [228]:
# Create a pipeline and perform cross validation using both methods
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


# Columns whose value combination selects the sub-model
subset_cols = ('account_banner', 'product_desc')

# One regressor per (account_banner, product_desc) group
sub_models = {
    ('A', 'X'): LinearRegression(),
    ('B', 'Y'): DecisionTreeRegressor(),
}

# Step 1 expands the non-group features to degree-2 polynomial terms;
# step 2 fits and applies the per-group regressors.
pipeline = Pipeline(steps=[
    (
        'transform',
        FeatureSubsetTransform(
            transformer=PolynomialFeatures(degree=2),
            group_cols=subset_cols,
        ),
    ),
    (
        'estimate',
        FeatureSubsetModel(
            sub_models=sub_models,
            group_cols=subset_cols,
            lookup_dict=lookup_dict,
        ),
    ),
])

pipeline.fit(X=train_x, y=train_y)

Pipeline(memory=None,
         steps=[('transform',
                 FeatureSubsetTransform(group_cols=('account_banner',
                                                    'product_desc'),
                                        transformer=PolynomialFeatures(degree=2,
                                                                       include_bias=True,
                                                                       interaction_only=False,
                                                                       order='C'))),
                ('estimate',
                 FeatureSubsetModel(group_cols=('account_banner',
                                                'product_desc'),
                                    lookup_dict={(0, 0): 0.7464929859719436,
                                                 (0, 1): -2.2094188376753507,
                                                 (1, 0): -...
                                                                             fit_