### Feature engineering (numerical + categorical)

In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import seaborn as sns 

from sklearn import datasets
from sklearn import manifold

%matplotlib inline

In [3]:
# Short timestamp series (one value every 10 hours) used to demonstrate
# datetime-derived features. The lowercase "h" alias is used because the
# uppercase "H" hour alias is deprecated in recent pandas releases.
s = pd.date_range("2020-01-06", "2020-01-10", freq="10h").to_series()
s

2020-01-06 00:00:00   2020-01-06 00:00:00
2020-01-06 10:00:00   2020-01-06 10:00:00
2020-01-06 20:00:00   2020-01-06 20:00:00
2020-01-07 06:00:00   2020-01-07 06:00:00
2020-01-07 16:00:00   2020-01-07 16:00:00
2020-01-08 02:00:00   2020-01-08 02:00:00
2020-01-08 12:00:00   2020-01-08 12:00:00
2020-01-08 22:00:00   2020-01-08 22:00:00
2020-01-09 08:00:00   2020-01-09 08:00:00
2020-01-09 18:00:00   2020-01-09 18:00:00
Freq: 10H, dtype: datetime64[ns]

In [4]:
# Derive calendar features from the timestamp series `s`.
# `Series.dt.weekofyear` was deprecated and later removed from pandas;
# `dt.isocalendar().week` is its replacement. That accessor returns a
# nullable UInt32 column, so it is cast to int64 to keep a plain integer
# column as before.
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    # NOTE(review): the key is spelled "is_help_year" but holds the leap-year
    # flag; the spelling is kept so the resulting column name is unchanged.
    "is_help_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    "weekofyear": s.dt.isocalendar().week.astype("int64").values,
}
df = pd.DataFrame(features)
df

Unnamed: 0,dayofweek,dayofyear,hour,is_help_year,quarter,weekofyear
0,0,6,0,True,1,2
1,0,6,10,True,1,2
2,0,6,20,True,1,2
3,1,7,6,True,1,2
4,1,7,16,True,1,2
5,2,8,2,True,1,2
6,2,8,12,True,1,2
7,2,8,22,True,1,2
8,3,9,8,True,1,2
9,3,9,18,True,1,2


In [5]:
# Generate a random dataset: one row per calendar day with a customer id,
# three categorical codes and one numerical column.
dates = pd.date_range("2016-01-01", "2022-12-31")
# Row count is derived from the date range instead of repeating a
# hard-coded 2557 that has to match it.
n_rows = len(dates)

# Seeded generator so the synthetic data is identical on every run.
rng = np.random.default_rng(42)

df = pd.DataFrame(
    {
        "date": dates,
        "customer_id": rng.integers(1, 500, size=n_rows),
        "cat1": rng.integers(1, 500, size=n_rows),
        "cat2": rng.integers(1, 300, size=n_rows),
        "cat3": rng.integers(0, 1000, size=n_rows),
        "num1": 5 * rng.random(n_rows) - 5,  # uniform in [-5, 0)
    }
)
df


Unnamed: 0,date,customer_id,cat1,cat2,cat3,num1
0,2016-01-01,46,110,41,838,-2.196268
1,2016-01-02,163,197,168,708,-3.995432
2,2016-01-03,324,425,224,739,-0.553405
3,2016-01-04,92,60,108,59,-0.318557
4,2016-01-05,252,56,74,82,-0.960805
...,...,...,...,...,...,...
2552,2022-12-27,486,466,70,10,-0.124645
2553,2022-12-28,165,362,191,790,-1.522886
2554,2022-12-29,342,92,55,78,-1.591211
2555,2022-12-30,46,254,80,161,-3.405937


In [6]:
def generate_features(df):
    """Add calendar columns to ``df`` and aggregate features per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``date`` column plus ``customer_id``
        and ``num1`` columns. The frame is modified in place: ``year``,
        ``weekofyear``, ``month``, ``dayofweek`` and ``weekend`` columns
        are added to it.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` (as a regular column after
        ``reset_index``) with two-level columns of the form
        ``(source column, aggregation)``.
    """
    df.loc[:, "year"] = df["date"].dt.year

    # `Series.dt.weekofyear` was deprecated and later removed from pandas;
    # `dt.isocalendar().week` is the replacement. It yields a nullable
    # UInt32 column, so cast back to a plain NumPy dtype: int64 normally,
    # float64 (NaN) if any date is missing.
    iso_week = df["date"].dt.isocalendar().week
    week_dtype = "float64" if iso_week.isna().any() else "int64"
    df.loc[:, "weekofyear"] = iso_week.astype(week_dtype)

    df.loc[:, "month"] = df["date"].dt.month
    df.loc[:, "dayofweek"] = df["date"].dt.dayofweek
    # weekday is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday/Sunday
    df.loc[:, "weekend"] = (df["date"].dt.weekday >= 5).astype(int)

    # column -> list of aggregations to compute within each customer group
    aggs = {}
    aggs["month"] = ["nunique", "mean"]
    aggs["weekofyear"] = ["nunique", "mean"]
    aggs["num1"] = ["sum", "max", "min", "mean"]
    aggs["customer_id"] = ["size", "nunique"]

    # echo the aggregation spec; it appears in the cell output
    print(aggs)
    agg_df = df.groupby("customer_id").agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df
# Build the per-customer aggregate table and display it.
customer_features = generate_features(df)
customer_features

{'month': ['nunique', 'mean'], 'weekofyear': ['nunique', 'mean'], 'num1': ['sum', 'max', 'min', 'mean'], 'customer_id': ['size', 'nunique']}


Unnamed: 0_level_0,customer_id,month,month,weekofyear,weekofyear,num1,num1,num1,num1,customer_id,customer_id
Unnamed: 0_level_1,Unnamed: 1_level_1,nunique,mean,nunique,mean,sum,max,min,mean,size,nunique
0,1,6,5.500000,6,21.833333,-21.198948,-1.912629,-4.763381,-3.533158,6,1
1,2,2,10.500000,2,44.500000,-6.030904,-1.100564,-4.930340,-3.015452,2,1
2,3,3,5.250000,4,22.250000,-10.579084,-0.408044,-4.273609,-2.644771,4,1
3,4,5,8.142857,7,33.857143,-28.587544,-1.911096,-4.957964,-4.083935,7,1
4,5,3,10.333333,3,43.333333,-8.933433,-2.134440,-4.439832,-2.977811,3,1
...,...,...,...,...,...,...,...,...,...,...,...
486,495,6,7.142857,6,28.285714,-26.028893,-0.202470,-4.815407,-3.718413,7,1
487,496,2,5.000000,2,19.000000,-6.082913,-1.767039,-4.315874,-3.041456,2,1
488,497,4,8.600000,5,35.800000,-11.114167,-0.268023,-3.603460,-2.222833,5,1
489,498,2,3.500000,2,12.500000,-3.165302,-0.650408,-2.514895,-1.582651,2,1


In [None]:
# Summary-statistic features for a 1-D numeric sequence `x`.
# `x` was not defined anywhere before this cell, so the cell raised a
# NameError on a fresh kernel; a small seeded placeholder series is used
# so the cell runs top to bottom.
# NOTE(review): swap in the real time series if a specific one was intended.
x = np.random.default_rng(0).random(100)

feature_dict = {
    # basic moments and range
    "mean": np.mean(x),
    "max": np.max(x),
    "min": np.min(x),
    "std": np.std(x),
    "var": np.var(x),
    "ptp": np.ptp(x),  # peak-to-peak: max - min
    # percentiles take a 0-100 scale
    "percentile_10": np.percentile(x, 10),
    "percentile_60": np.percentile(x, 60),
    "percentile_90": np.percentile(x, 90),
    # quantiles take a 0-1 scale
    "quantile_5": np.quantile(x, 0.05),
    "quantile_95": np.quantile(x, 0.95),
    "quantile_99": np.quantile(x, 0.99),
}


# Note: the tsfresh Python library is a good choice for computing time-series features.
from tsfresh.feature_extraction import feature_calculators as fc 

# Add time-series descriptors from tsfresh's feature calculators; each
# dictionary key is the name of the calculator function applied to `x`.
for calculator_name in (
    "abs_energy",
    "count_above_mean",
    "count_below_mean",
    "mean_abs_change",
    "mean_change",
):
    feature_dict[calculator_name] = getattr(fc, calculator_name)(x)

In [7]:
# Toy frame: 100 rows of uniform-random values in two columns, f_1 and f_2.
feature_names = ["f_1", "f_2"]
df = pd.DataFrame(np.random.rand(100, len(feature_names)), columns=feature_names)
df

Unnamed: 0,f_1,f_2
0,0.171261,0.252327
1,0.351842,0.882917
2,0.559954,0.067583
3,0.286110,0.039049
4,0.362532,0.202197
...,...,...
95,0.456257,0.123792
96,0.069169,0.399490
97,0.300704,0.104529
98,0.538832,0.246170


In [10]:
from sklearn import  preprocessing

# Expand the two input columns with every degree-2 polynomial term
# (squares and the pairwise product); the constant bias column is left out.
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly_feats = pf.fit(df).transform(df)

# Name the generated columns f_1 .. f_n in output order.
num_feats = poly_feats.shape[1]
poly_columns = ["f_" + str(position + 1) for position in range(num_feats)]
df_transformed = pd.DataFrame(poly_feats, columns=poly_columns)
df_transformed

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.171261,0.252327,0.029330,0.043214,0.063669
1,0.351842,0.882917,0.123793,0.310647,0.779542
2,0.559954,0.067583,0.313549,0.037843,0.004567
3,0.286110,0.039049,0.081859,0.011172,0.001525
4,0.362532,0.202197,0.131430,0.073303,0.040884
...,...,...,...,...,...
95,0.456257,0.123792,0.208170,0.056481,0.015324
96,0.069169,0.399490,0.004784,0.027632,0.159592
97,0.300704,0.104529,0.090423,0.031432,0.010926
98,0.538832,0.246170,0.290340,0.132644,0.060600


In [15]:
# Discretise f_1 into equal-width bins at two resolutions; labels=False
# stores the integer bin index rather than an interval label.
for n_bins in (10, 100):
    df[f"f_bin_{n_bins}"] = pd.cut(df["f_1"], bins=n_bins, labels=False)
df

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.171261,0.252327,1,15
1,0.351842,0.882917,3,34
2,0.559954,0.067583,5,56
3,0.286110,0.039049,2,27
4,0.362532,0.202197,3,35
...,...,...,...,...
95,0.456257,0.123792,4,45
96,0.069169,0.399490,0,4
97,0.300704,0.104529,2,28
98,0.538832,0.246170,5,54


In [17]:
from sklearn import  impute

# 10x6 matrix of random integers in [1, 14], stored as floats so that NaN
# can be written into it.
x = np.random.randint(low=1, high=15, size=(10, 6)).astype(float)
x

array([[11.,  4., 10.,  4., 12.,  1.],
       [ 3.,  8.,  1., 13., 10.,  9.],
       [ 7.,  3., 10.,  4., 11., 12.],
       [10.,  1.,  5.,  6., 14., 10.],
       [ 6.,  8.,  1.,  5.,  7.,  3.],
       [10., 10.,  5.,  8.,  8., 14.],
       [11.,  8., 12.,  9., 10., 10.],
       [12.,  2., 10.,  1.,  5., 12.],
       [14., 11., 12.,  5., 14., 10.],
       [ 6.,  8.,  9.,  9., 14., 10.]])

In [18]:
# Blank out 10 distinct, randomly chosen cells to simulate missing values.
n_missing = 10
missing_positions = np.random.choice(x.size, n_missing, replace=False)
x.ravel()[missing_positions] = np.nan
x

array([[11., nan, 10., nan, 12.,  1.],
       [ 3.,  8.,  1., nan, nan,  9.],
       [ 7.,  3., 10.,  4., 11., 12.],
       [nan,  1., nan,  6., 14., 10.],
       [nan,  8., nan,  5.,  7.,  3.],
       [10., nan,  5.,  8.,  8., 14.],
       [11.,  8., 12.,  9., 10., 10.],
       [nan,  2., 10.,  1.,  5., 12.],
       [14., 11., 12.,  5., 14., 10.],
       [ 6.,  8.,  9.,  9., 14., 10.]])

In [19]:
# Fill the missing entries of `x` with a k-nearest-neighbours imputer (k=2).
knn_imputer = impute.KNNImputer(n_neighbors=2)
x_imputed = knn_imputer.fit_transform(x)
x_imputed

array([[11. ,  8. , 10. ,  7. , 12. ,  1. ],
       [ 3. ,  8. ,  1. ,  7. , 10.5,  9. ],
       [ 7. ,  3. , 10. ,  4. , 11. , 12. ],
       [ 6.5,  1. ,  9.5,  6. , 14. , 10. ],
       [ 7. ,  8. ,  5.5,  5. ,  7. ,  3. ],
       [10. ,  5.5,  5. ,  8. ,  8. , 14. ],
       [11. ,  8. , 12. ,  9. , 10. , 10. ],
       [ 8.5,  2. , 10. ,  1. ,  5. , 12. ],
       [14. , 11. , 12. ,  5. , 14. , 10. ],
       [ 6. ,  8. ,  9. ,  9. , 14. , 10. ]])