In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

#### Column Extractor

In [3]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Select a fixed set of columns from a DataFrame.

    Inherits from ``BaseEstimator`` (right-most base, per scikit-learn
    convention) so the transformer supports ``get_params``/``set_params``,
    ``clone`` and estimator-tag resolution when used inside a ``Pipeline``.

    Parameters
    ----------
    cols : label or list of labels
        Column selector passed unchanged to ``X[cols]``.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.cols]``.

        Raises
        ------
        AssertionError
            If ``X`` is not a ``pandas.DataFrame``.
        KeyError
            If a requested column is missing (raised by pandas).
        """
        assert isinstance(X, pd.DataFrame)
        return X[self.cols]

#### Groupby_Avg_Featurizer

In [36]:
class Groupby_Avg_Featurizer(TransformerMixin, BaseEstimator):
    """Append a per-group average of ``value_col`` to a DataFrame.

    The average is computed on the frame passed to ``transform`` itself
    (nothing is learned in ``fit``) and joined back onto every row of the
    corresponding group.

    Parameters
    ----------
    group_col : list of str
        Columns that define the groups.
    value_col : str
        Column to average.
    output_col : str
        Name of the new column holding the group average.
    weight_col : str, optional
        Column of weights. When given, a weighted average
        ``sum(value * weight) / sum(weight)`` is computed; otherwise a
        plain mean.
    """

    def __init__(self, group_col, value_col, output_col, weight_col=None):
        self.group_col = group_col
        self.value_col = value_col
        self.weight_col = weight_col
        self.output_col = output_col

    def fit(self, X, y=None):
        """No-op; accepts ``y`` so callers that invoke ``fit(X, y)`` work."""
        return self

    def transform(self, X):
        """Return ``X`` left-merged with the per-group average.

        Raises
        ------
        AssertionError
            If ``X`` is not a DataFrame or a parameter has the wrong type.
        ZeroDivisionError
            If the weights of a group sum to zero (raised by ``np.average``).
        """
        assert isinstance(X, pd.DataFrame)
        assert isinstance(self.group_col, list)
        assert isinstance(self.value_col, str)
        if self.weight_col: assert isinstance(self.weight_col, str)
        assert isinstance(self.output_col, str)

        if self.weight_col:
            # dict.fromkeys de-duplicates while keeping order, in case the
            # value column is also used as the weight column.
            used_cols = list(dict.fromkeys([self.value_col, self.weight_col]))

            def group_avg(group):
                return np.average(group[self.value_col], weights=group[self.weight_col])
        else:
            used_cols = [self.value_col]

            def group_avg(group):
                return np.average(group[self.value_col])

        # Selecting only the needed columns keeps the grouping keys out of the
        # frames handed to ``apply``. A scalar-returning apply yields an
        # unnamed Series, which ``reset_index`` exposes as column ``0``.
        gb = (
            X.groupby(self.group_col)[used_cols]
            .apply(group_avg)
            .reset_index()
            .rename(columns={0: self.output_col})
        )

        return pd.merge(X, gb, on=self.group_col, how='left')
        

In [39]:
class Groupby_Std_Featurizer(TransformerMixin, BaseEstimator):
    """Append a per-group standard deviation of ``value_col`` to a DataFrame.

    The statistic is computed on the frame passed to ``transform`` itself
    (nothing is learned in ``fit``) and joined back onto every row of the
    corresponding group.

    Parameters
    ----------
    group_col : list of str
        Columns that define the groups.
    value_col : str
        Column whose spread is measured.
    output_col : str
        Name of the new column holding the group standard deviation.
    weight_col : str, optional
        Column of weights. When given, the weighted standard deviation is
        computed; otherwise ``np.std`` of the group's values.
    """

    def __init__(self, group_col, value_col, output_col, weight_col=None):
        self.group_col = group_col
        self.value_col = value_col
        self.weight_col = weight_col
        self.output_col = output_col

    def fit(self, X, y=None):
        """No-op; accepts ``y`` so callers that invoke ``fit(X, y)`` work."""
        return self

    @staticmethod
    def _weighted_std(df, values, weights):
        """Return the weighted standard deviation of one group.

        ``values`` and ``weights`` are column names of ``df``. The result is
        the square root of the weighted mean squared deviation from the
        weighted mean.
        """
        values, weights = df[values].values, df[weights].values
        average = np.average(values, weights=weights)
        variance = np.average((values - average) ** 2, weights=weights)
        return np.sqrt(variance)

    def transform(self, X):
        """Return ``X`` left-merged with the per-group standard deviation.

        Raises
        ------
        AssertionError
            If ``X`` is not a DataFrame or a parameter has the wrong type.
        ZeroDivisionError
            If the weights of a group sum to zero (raised by ``np.average``).
        """
        assert isinstance(X, pd.DataFrame)
        assert isinstance(self.group_col, list)
        assert isinstance(self.value_col, str)
        if self.weight_col: assert isinstance(self.weight_col, str)
        assert isinstance(self.output_col, str)

        if self.weight_col:
            # dict.fromkeys de-duplicates while keeping order, in case the
            # value column is also used as the weight column.
            used_cols = list(dict.fromkeys([self.value_col, self.weight_col]))

            def group_std(group):
                return self._weighted_std(group, self.value_col, self.weight_col)
        else:
            used_cols = [self.value_col]

            def group_std(group):
                return np.std(group[self.value_col])

        # A scalar-returning apply yields an unnamed Series, which
        # ``reset_index`` exposes as column ``0``.
        gb = (
            X.groupby(self.group_col)[used_cols]
            .apply(group_std)
            .reset_index()
            .rename(columns={0: self.output_col})
        )

        return pd.merge(X, gb, on=self.group_col, how='left')

#### Lag Value Transformer

In [66]:
class Lag_Value_Transformer(TransformerMixin, BaseEstimator):
    """Append lagged copies of ``value_col`` as new columns.

    For each lag ``p`` in ``period`` a column ``'<value_col>_lag_<p>'`` is
    added. It holds the value observed ``p`` time steps earlier for the same
    ``index_col`` key, or NaN when no such earlier row exists.
    """

    # ``period=[1]`` is a shared default list; it is only read, never mutated.
    def __init__(self, value_col, time_col, index_col, period=[1]):
        '''
        value_col: name of the column to lag
        time_col: an integer-indexed column to indicate timestamp
        index_col: a list of columns that, together with time_col, form the merge key
        period: list of integer lags to generate
        '''
        self.value_col = value_col
        self.time_col = time_col
        self.index_col = index_col
        self.period = period

    def fit(self, X, y=None):
        """No-op; accepts ``y`` so callers that invoke ``fit(X, y)`` work."""
        return self

    def transform(self, X):
        """Return ``X`` with one extra lag column per entry of ``period``.

        Raises
        ------
        AssertionError
            If ``X`` is not a DataFrame or a parameter has the wrong type.
        """
        assert isinstance(X, pd.DataFrame)
        assert isinstance(self.value_col, str)
        assert isinstance(self.time_col, str)
        assert isinstance(self.index_col, list)
        assert isinstance(self.period, list)

        merge_keys = self.index_col + [self.time_col]
        for p in self.period:
            # Explicit copy: the shifted frame is modified below, and writing
            # into a bare slice of X triggers SettingWithCopyWarning.
            shifted = X.loc[:, merge_keys + [self.value_col]].copy()
            # Moving the timestamp forward by p aligns each value with the
            # row that sits p steps later.
            shifted[self.time_col] = shifted[self.time_col] + p
            lag_col_name = '{}_lag_{}'.format(self.value_col, p)
            shifted = shifted.rename(columns={self.value_col: lag_col_name})
            X = pd.merge(X, shifted, on=merge_keys, how='left')
        return X
        
        

In [80]:
class Groupby_Sum_Transformer(TransformerMixin, BaseEstimator):
    '''
    Compute a per-group sum on the training frame and join it onto new data.

    With ``weight_col`` the statistic is the weighted sum
    ``sum(value_col * weight_col)`` of each group; without it, the plain sum
    of ``value_col``. The sums are learned in ``fit`` and left-merged onto the
    frame given to ``transform``, so groups unseen during ``fit`` get NaN.

    group_col: list of column names defining the groups
    value_col: name of the column to sum
    output_col: name of the new column
    weight_col: optional name of the weight column
    '''
    def __init__(self, group_col, value_col, output_col, weight_col=None):
        self.group_col = group_col
        self.value_col = value_col
        self.weight_col = weight_col
        self.output_col = output_col
        # Per-group sums learned by fit(); None until fit() has been called.
        self.gb = None

    def _check_params(self):
        """Assert that the constructor parameters have the expected types."""
        assert isinstance(self.group_col, list)
        assert isinstance(self.value_col, str)
        if self.weight_col: assert isinstance(self.weight_col, str)
        assert isinstance(self.output_col, str)

    def fit(self, X, y=None):
        """Learn the per-group sums from ``X`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        """
        assert isinstance(X, pd.DataFrame)
        self._check_params()

        if self.weight_col:
            # dict.fromkeys de-duplicates while keeping order, in case the
            # value column is also used as the weight column.
            used_cols = list(dict.fromkeys([self.value_col, self.weight_col]))

            def group_sum(group):
                return np.dot(group[self.value_col].values, group[self.weight_col].values)
        else:
            used_cols = [self.value_col]

            def group_sum(group):
                # Plain sum of the values (the previous code computed np.std
                # here, a copy-paste slip).
                return np.sum(group[self.value_col].values)

        # A scalar-returning apply yields an unnamed Series, which
        # ``reset_index`` exposes as column ``0``.
        self.gb = (
            X.groupby(self.group_col)[used_cols]
            .apply(group_sum)
            .reset_index()
            .rename(columns={0: self.output_col})
        )
        return self

    def transform(self, X):
        """Return ``X`` left-merged with the sums learned in ``fit``.

        Raises
        ------
        AssertionError
            If ``X`` is not a DataFrame or a parameter has the wrong type.
        RuntimeError
            If called before ``fit``.
        """
        assert isinstance(X, pd.DataFrame)
        self._check_params()
        if self.gb is None:
            raise RuntimeError(
                "Groupby_Sum_Transformer is not fitted yet; call fit() or "
                "fit_transform() before transform()."
            )

        return pd.merge(X, self.gb, on=self.group_col, how='left')

In [106]:
class ClipTransformer(TransformerMixin, BaseEstimator):
    """Clip the values of one column (or several) to ``[a_min, a_max]``.

    Parameters
    ----------
    a_min, a_max : scalar
        Lower and upper bounds forwarded to ``np.clip``.
    col : label or list of labels
        Column(s) of the DataFrame to clip.
    """

    def __init__(self, a_min, a_max, col):
        self.a_min = a_min
        self.a_max = a_max
        self.col = col

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``col`` clipped; ``X`` is left untouched."""
        # assumes X is a DataFrame
        # Work on a copy so the caller's frame is not modified as a side
        # effect of running the transformer.
        X_clipped = X.copy()
        X_clipped[self.col] = np.clip(X[self.col].values, self.a_min, self.a_max)
        return X_clipped

In [87]:
class DFFeatureUnion(TransformerMixin, BaseEstimator):
    """FeatureUnion but for pandas DataFrames.

    Every transformer is applied to the same input frame and the resulting
    frames are merged column-wise on their index.

    Parameters
    ----------
    transformer_list : list of (name, transformer) tuples
        Transformers exposing ``fit(X, y)`` and ``transform(X)``; each
        ``transform`` must return a DataFrame.
    """

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        """Fit every transformer on ``(X, y)`` and return ``self``."""
        for (name, t) in self.transformer_list:
            t.fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` with each transformer and index-merge the results.

        Raises
        ------
        TypeError
            If ``transformer_list`` is empty (raised by ``reduce``).
        """
        # ``reduce`` is not a builtin on Python 3; import it locally so the
        # class does not depend on a notebook-level import.
        from functools import reduce

        # assumes X is a DataFrame
        Xts = [t.transform(X) for _, t in self.transformer_list]
        Xunion = reduce(
            lambda left, right: pd.merge(left, right, left_index=True, right_index=True),
            Xts,
        )
        return Xunion

### TEST

In [97]:
# Load the "projections" sheet of the sample sales workbook straight from GitHub.
sales = pd.read_excel(
    "https://github.com/chris1610/pbpython/blob/master/data/sales-estimate.xlsx?raw=True",
    sheet_name="projections",
)
sales.head(20)

Unnamed: 0,Account,Name,State,Rep,Manager,Current_Price,Quantity,New_Product_Price
0,714466,Trantow-Barrows,MN,Craig Booker,Debra Henley,500,100,550
1,737550,"Fritsch, Russel and Anderson",MN,Craig Booker,Debra Henley,600,90,725
2,146832,Kiehn-Spinka,TX,Daniel Hilton,Debra Henley,225,475,255
3,218895,Kulas Inc,TX,Daniel Hilton,Debra Henley,290,375,300
4,412290,Jerde-Hilpert,WI,John Smith,Debra Henley,375,400,400
5,740150,Barton LLC,WI,John Smith,Debra Henley,550,100,600
6,141962,Herman LLC,CA,Cedric Moss,Fred Anderson,400,200,425
7,163416,Purdy-Kunde,CA,Cedric Moss,Fred Anderson,450,150,475
8,239344,Stokes LLC,WA,Cedric Moss,Fred Anderson,550,75,610
9,307599,"Kassulke, Ondricka and Metz",NV,Wendy Yule,Fred Anderson,275,450,300


In [120]:
# Hand-picked rows used as the training split; .copy() detaches it from `sales`.
train = sales.iloc[[0,2,4,9,10]].copy()
train

Unnamed: 0,Account,Name,State,Rep,Manager,Current_Price,Quantity,New_Product_Price
0,714466,Trantow-Barrows,MN,Craig Booker,Debra Henley,500,100,550
2,146832,Kiehn-Spinka,TX,Daniel Hilton,Debra Henley,225,475,255
4,412290,Jerde-Hilpert,WI,John Smith,Debra Henley,375,400,400
9,307599,"Kassulke, Ondricka and Metz",NV,Wendy Yule,Fred Anderson,275,450,300
10,688981,Keeling LLC,NV,Wendy Yule,Fred Anderson,300,250,350


In [121]:
# Hand-picked rows used as the held-out split; .copy() detaches it from `sales`.
test = sales.iloc[[1,3,5,7,8,11]].copy()
test

Unnamed: 0,Account,Name,State,Rep,Manager,Current_Price,Quantity,New_Product_Price
1,737550,"Fritsch, Russel and Anderson",MN,Craig Booker,Debra Henley,600,90,725
3,218895,Kulas Inc,TX,Daniel Hilton,Debra Henley,290,375,300
5,740150,Barton LLC,WI,John Smith,Debra Henley,550,100,600
7,163416,Purdy-Kunde,CA,Cedric Moss,Fred Anderson,450,150,475
8,239344,Stokes LLC,WA,Cedric Moss,Fred Anderson,550,75,610
11,729833,Koepp Ltd,NV,Wendy Yule,Fred Anderson,350,100,375


In [100]:
# Per-state revenue (sum of Current_Price * Quantity), learned on the training rows.
gb_sum_transformer = Groupby_Sum_Transformer(
    group_col=['State'],
    value_col='Current_Price',
    output_col='revenue',
    weight_col='Quantity',
)
train_1 = gb_sum_transformer.fit_transform(train)
train_1

Unnamed: 0,Account,Name,State,Rep,Manager,Current_Price,Quantity,New_Product_Price,revenue
0,714466,Trantow-Barrows,MN,Craig Booker,Debra Henley,500,100,550,50000
1,146832,Kiehn-Spinka,TX,Daniel Hilton,Debra Henley,225,475,255,106875
2,412290,Jerde-Hilpert,WI,John Smith,Debra Henley,375,400,400,150000
3,307599,"Kassulke, Ondricka and Metz",NV,Wendy Yule,Fred Anderson,275,450,300,198750
4,688981,Keeling LLC,NV,Wendy Yule,Fred Anderson,300,250,350,198750


In [101]:
# Join the revenue learned on `train` onto `test`; the merge is a left join,
# so states that were not in the fitted data end up with NaN revenue.
test_1 = gb_sum_transformer.transform(test)
test_1

Unnamed: 0,Account,Name,State,Rep,Manager,Current_Price,Quantity,New_Product_Price,revenue
0,737550,"Fritsch, Russel and Anderson",MN,Craig Booker,Debra Henley,600,90,725,50000.0
1,218895,Kulas Inc,TX,Daniel Hilton,Debra Henley,290,375,300,106875.0
2,740150,Barton LLC,WI,John Smith,Debra Henley,550,100,600,150000.0
3,163416,Purdy-Kunde,CA,Cedric Moss,Fred Anderson,450,150,475,
4,239344,Stokes LLC,WA,Cedric Moss,Fred Anderson,550,75,610,
5,729833,Koepp Ltd,NV,Wendy Yule,Fred Anderson,350,100,375,198750.0


In [118]:
# Clip Current_Price into [300, 500] directly on `train` (in place).
clipped_prices = np.clip(train.loc[:, 'Current_Price'].values, 300, 500)
train.loc[:, 'Current_Price'] = clipped_prices
train

Unnamed: 0,Account,Name,State,Rep,Manager,Current_Price,Quantity,New_Product_Price
0,714466,Trantow-Barrows,MN,Craig Booker,Debra Henley,500,100,550
2,146832,Kiehn-Spinka,TX,Daniel Hilton,Debra Henley,300,475,255
4,412290,Jerde-Hilpert,WI,John Smith,Debra Henley,375,400,400
9,307599,"Kassulke, Ondricka and Metz",NV,Wendy Yule,Fred Anderson,300,450,300
10,688981,Keeling LLC,NV,Wendy Yule,Fred Anderson,300,250,350


In [122]:
# Clip the price, then append the per-(State, Rep) average and the
# quantity-weighted standard deviation of the clipped price.
pipeline = Pipeline([
    ('clip_price', ClipTransformer(300, 500, col='Current_Price')),
    # The class defined in this notebook is Groupby_Avg_Featurizer; the old
    # name Groupby_Weighted_Avg_Featurizer only existed in a stale kernel.
    ('avg_current_price_by_state_rep', Groupby_Avg_Featurizer(
        group_col=['State', 'Rep'],
        value_col='Current_Price',
        output_col='State_Rep_Current_Price_Avg',
    )),
    ('std_current_price_by_state_rep', Groupby_Std_Featurizer(
        group_col=['State', 'Rep'],
        value_col='Current_Price',
        weight_col='Quantity',
        output_col='State_Rep_Current_Price_Std',
    )),
])
# Fit before transforming: calling transform() on a pipeline that was never
# fitted is not supported by scikit-learn.
train2 = pipeline.fit_transform(train)
train2

Unnamed: 0,Account,Name,State,Rep,Manager,Current_Price,Quantity,New_Product_Price,State_Rep_Current_Price_Avg,State_Rep_Current_Price_Std
0,714466,Trantow-Barrows,MN,Craig Booker,Debra Henley,500,100,550,500.0,0.0
1,146832,Kiehn-Spinka,TX,Daniel Hilton,Debra Henley,300,475,255,300.0,0.0
2,412290,Jerde-Hilpert,WI,John Smith,Debra Henley,375,400,400,375.0,0.0
3,307599,"Kassulke, Ondricka and Metz",NV,Wendy Yule,Fred Anderson,300,450,300,300.0,0.0
4,688981,Keeling LLC,NV,Wendy Yule,Fred Anderson,300,250,350,300.0,0.0


In [55]:
# Toy panel: two (shop, item) pairs observed at three time steps each.
s = pd.DataFrame({
    'Time': [1, 2, 3, 1, 2, 3],
    'Shop_id': [6, 6, 6, 7, 7, 7],
    'Item_id': [9, 9, 9, 10, 10, 10],
    'Sale': np.arange(6),
})
s

Unnamed: 0,Time,Shop_id,Item_id,Sale
0,1,6,9,0
1,2,6,9,1
2,3,6,9,2
3,1,7,10,3
4,2,7,10,4
5,3,7,10,5


In [67]:
# Add 1-step and 2-step lags of Sale for every (Shop_id, Item_id) pair.
lag_transformer = Lag_Value_Transformer(
    value_col='Sale',
    time_col='Time',
    index_col=['Shop_id', 'Item_id'],
    period=[1, 2],
)
s2 = lag_transformer.transform(s)

In [68]:
# Show the lagged frame; rows without an earlier observation hold NaN.
s2

Unnamed: 0,Time,Shop_id,Item_id,Sale,Sale_lag_1,Sale_lag_2
0,1,6,9,0,,
1,2,6,9,1,0.0,
2,3,6,9,2,1.0,0.0
3,1,7,10,3,,
4,2,7,10,4,3.0,
5,3,7,10,5,4.0,3.0


In [65]:
# `s3` was never defined in this notebook (it only existed in a stale kernel).
# Rebuild it as the intermediate frame of a 2-step lag: the key columns with
# Time shifted forward by 2 and Sale renamed to Sale_lag_2.
s3 = s[['Shop_id', 'Item_id', 'Time', 'Sale']].copy()
s3['Time'] = s3['Time'] + 2
s3 = s3.rename(columns={'Sale': 'Sale_lag_2'})
s3

Unnamed: 0,Shop_id,Item_id,Time,Sale_lag_2
0,6,9,3,0
1,6,9,4,1
2,6,9,5,2
3,7,10,3,3
4,7,10,4,4
5,7,10,5,5


In [72]:
# Per-shop weighted sum of Item_id * Sale.
gb_sum_transformer = Groupby_Sum_Transformer(
    group_col=['Shop_id'],
    value_col='Item_id',
    output_col='revenue',
    weight_col='Sale',
)
# The group sums are computed in fit(), so the transformer must be fitted
# before transform() can merge them; fit_transform does both on `s`.
s4 = gb_sum_transformer.fit_transform(s)
s4

Unnamed: 0,Time,Shop_id,Item_id,Sale,revenue
0,1,6,9,0,27
1,2,6,9,1,27
2,3,6,9,2,27
3,1,7,10,3,120
4,2,7,10,4,120
5,3,7,10,5,120


In [85]:
# NOTE(review): this rebinds `test`, which earlier in the notebook holds the
# held-out DataFrame; looks like leftover scratch work — consider removing it
# or using a different name.
test = False
not test

True