# Aggregations

In [2]:
import pandas as pd
# Create a dictionary with dates
data = {'date': ['2016-09-01','2017-04-01','2017-08-01','2017-12-01','2017-09-01','2016-09-01','2017-04-01','2017-08-01'], 
        'customer_id': [146361,180838,157857,159772,80014,157857,159772,80014],
        'cat1':[2,4,3,5,3,4,3,5],
        'cat2':[2,1,3,1,2,1,3,1],
        'cat3':[0,0,1,1,1,0,0,1],
        'num1':[-0.518679,0.415853,-2.061687,-0.276558,-1.456827,-0.518679,0.415853,-2.061687]}

# Create a pandas dataframe from the dictionary
df = pd.DataFrame.from_dict(data)

# Convert Date to Datetime format.
df['date'] = pd.to_datetime(df.date, format='%Y-%m-%d %H:%M:%S')

# Print the dataframe
df

def generate_features(df):  
    # create a bunch of features using the date column  
    df.loc[:, 'year'] = df['date'].dt.year  
    df.loc[:, 'weekofyear'] = df['date'].dt.isocalendar().week 
    df.loc[:, 'month'] = df['date'].dt.month  
    df.loc[:, 'dayofweek'] = df['date'].dt.dayofweek  
    df.loc[:, 'weekend'] = (df['date'].dt.weekday >=5).astype(int)  
    
    # create an aggregate dictionary  
    aggs = {}  
    # for aggregation by month, we calculate the  
    # number of unique month values and also the mean  
    aggs['month'] = ['nunique', 'mean']  
    aggs['weekofyear'] = ['nunique', 'mean']  
    # we aggregate by num1 and calculate sum, max, min  
    # and mean values of this column  
    aggs['num1'] = ['sum','max','min','mean']  
    # for customer_id, we calculate the total count  
    aggs['customer_id'] = ['size']  
    # again for customer_id, we calculate the total unique  
    aggs['customer_id'] = ['nunique']  
    # we group by customer_id and calculate the aggregates  
    agg_df = df.groupby('customer_id').agg(aggs)  
    agg_df = agg_df.reset_index()  
    return agg_df

# Run Aggregation
agg_df = generate_features(df)

# Print and sort by column and aggregation metric.
agg_df.sort_values(by=('month','nunique'), ascending=False)

Unnamed: 0_level_0,customer_id,month,month,weekofyear,weekofyear,num1,num1,num1,num1,customer_id
Unnamed: 0_level_1,Unnamed: 1_level_1,nunique,mean,nunique,mean,sum,max,min,mean,nunique
0,80014,2,8.5,2,33.0,-3.518514,-1.456827,-2.061687,-1.759257,1
2,157857,2,8.5,2,33.0,-2.580366,-0.518679,-2.061687,-1.290183,1
3,159772,2,8.0,2,30.5,0.139295,0.415853,-0.276558,0.069647,1
1,146361,1,9.0,1,35.0,-0.518679,-0.518679,-0.518679,-0.518679,1
4,180838,1,4.0,1,13.0,0.415853,0.415853,0.415853,0.415853,1


In [9]:
import numpy as np  
# Create a dictionary with dates
data = {'num1':[-0.518679,0.415853,-2.061687,-0.276558,-1.456827,-0.518679,0.415853,-2.061687]}

# Create a pandas dataframe from the dictionary
x = pd.DataFrame.from_dict(data)


feature_dict = {}  
#calculate mean  
feature_dict['mean'] = np.mean(x.num1)  
#calculate max  
feature_dict['max'] = np.max(x.num1)  
#calculate min 
feature_dict['min'] = np.min(x.num1)
#calculate standard deviation
feature_dict['std'] = np.std(x.num1)  
#calculate variance  
feature_dict['var'] = np.var(x.num1)  
#peak-to-peak  
feature_dict['ptp'] = np.ptp(x.num1) 
#percentile features  
feature_dict['percentile_10'] = np.percentile(x.num1, 10)  
feature_dict['percentile_60'] = np.percentile(x.num1, 60)  
feature_dict['percentile_90'] = np.percentile(x.num1, 90)  
#quantile features  
feature_dict['quantile_5'] = np.quantile(x.num1, 0.05)  
feature_dict['quantile_95'] = np.quantile(x.num1, 0.95)  
feature_dict['quantile_99'] = np.quantile(x.num1, 0.99)   


feature_dict

{'mean': -0.757801375,
 'max': 0.415853,
 'min': -2.061687,
 'std': 0.9347296173838102,
 'var': 0.8737194576144842,
 'ptp': 2.47754,
 'percentile_10': -2.061687,
 'percentile_60': -0.4702548,
 'percentile_90': 0.415853,
 'quantile_5': -2.061687,
 'quantile_95': 0.415853,
 'quantile_99': 0.415853}

In [13]:
from tsfresh.feature_extraction import feature_calculators as fc  
#tsfresh based features  
feature_dict['abs_energy'] = fc.abs_energy(x.num1)  
feature_dict['count_above_mean'] = fc.count_above_mean(x.num1)  
feature_dict['count_below_mean'] = fc.count_below_mean(x.num1)  
feature_dict['mean_abs_change'] = fc.mean_abs_change(x.num1)  
feature_dict['mean_change'] = fc.mean_change(x.num1)


feature_dict

{'mean': -0.757801375,
 'max': 0.415853,
 'min': -2.061687,
 'std': 0.9347296173838102,
 'var': 0.8737194576144842,
 'ptp': 2.47754,
 'percentile_10': -2.061687,
 'percentile_60': -0.4702548,
 'percentile_90': 0.415853,
 'quantile_5': -2.061687,
 'quantile_95': 0.415853,
 'quantile_99': 0.415853,
 'abs_energy': 11.583859052531,
 'count_above_mean': 5,
 'count_below_mean': 3,
 'mean_abs_change': 1.5325271428571428,
 'mean_change': -0.22042971428571428}

In [15]:
import numpy as np  
#generate a random dataframe with  
# #2 columns and 100 rows  
df = pd.DataFrame(  np.random.rand(100, 2),  columns=[f"f_{i}" for i in range(1, 3)]  ) 
df

Unnamed: 0,f_1,f_2
0,0.709723,0.205054
1,0.326757,0.816310
2,0.813957,0.589622
3,0.034470,0.894133
4,0.237896,0.816150
...,...,...
95,0.921310,0.194957
96,0.288137,0.253670
97,0.728482,0.830922
98,0.727618,0.533084


In [17]:
from sklearn import preprocessing  
#initialize polynomial features class object  
# #for two-degree polynomial features  
pf = preprocessing.PolynomialFeatures( degree=2,  
                                      interaction_only=False,  
                                      include_bias=False  )  
#fit to the features  
pf.fit(df)  
#create polynomial features  
poly_feats = pf.transform(df)  
#create a dataframe with all the features  
num_feats = poly_feats.shape[1]  
df_transformed = pd.DataFrame(  
                              poly_feats,  
                              columns=[f"f_{i}" for i in range(1, num_feats + 1)] 
                              ) 


df_transformed

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.709723,0.205054,0.503707,0.145532,0.042047
1,0.326757,0.816310,0.106770,0.266735,0.666363
2,0.813957,0.589622,0.662527,0.479928,0.347655
3,0.034470,0.894133,0.001188,0.030821,0.799475
4,0.237896,0.816150,0.056595,0.194159,0.666101
...,...,...,...,...,...
95,0.921310,0.194957,0.848813,0.179616,0.038008
96,0.288137,0.253670,0.083023,0.073092,0.064349
97,0.728482,0.830922,0.530685,0.605312,0.690432
98,0.727618,0.533084,0.529428,0.387881,0.284178
