In [31]:
import pandas as pd
import numpy as np
from tsfresh.feature_extraction import feature_calculators as fc
from sklearn import preprocessing
from sklearn import impute

In [1]:
def datetime_feature(df, datatime_columns):
    """Add calendar features derived from one datetime column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich. It is modified in place.
    datatime_columns : str
        Name of the column in ``df`` to read through the ``.dt`` accessor
        (the parameter keeps its original spelling so keyword callers
        continue to work).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the extra columns ``year``,
        ``weekofyear``, ``month``, ``dayofweek``, ``weekend`` and ``hour``.
    """
    stamps = df[datatime_columns].dt

    df.loc[:, "year"] = stamps.year
    # `.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week is
    # its replacement.
    df.loc[:, "weekofyear"] = stamps.isocalendar().week
    df.loc[:, "month"] = stamps.month
    df.loc[:, "dayofweek"] = stamps.dayofweek
    # weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    df.loc[:, "weekend"] = (stamps.weekday >= 5).astype(int)
    df.loc[:, "hour"] = stamps.hour
    return df

In [4]:
# Timestamps spaced 10 hours apart between 2020-01-06 and 2020-01-10.
# The step is given as a Timedelta so the cell does not depend on the
# spelling of pandas' hourly frequency alias.
s = pd.date_range(
    "2020-01-06", "2020-01-10", freq=pd.Timedelta(hours=10)
).to_series()


# Calendar features read from the datetime accessor, each as a NumPy array.
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    # `.dt.weekofyear` was removed in pandas 2.0; use the ISO calendar week
    # and convert it back to a plain int64 array like the other entries.
    "weekofyear": s.dt.isocalendar().week.to_numpy(dtype="int64"),
}

In [7]:
def generate_features(df):
    """Build per-customer aggregate features.

    Calendar columns (``year``, ``weekofyear``, ``month``, ``dayofweek``,
    ``weekend``) are first derived from ``df["date"]`` and written onto
    ``df`` itself, then the rows are grouped by ``customer_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date`` (datetime-like), ``customer_id``
        and ``num1``. It is modified in place by the added calendar columns.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` with two-level column labels
        ``(source column, statistic)``; the group key is restored as a
        regular column by ``reset_index``.
    """
    dates = df["date"].dt

    df.loc[:, "year"] = dates.year
    # `.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week is
    # its replacement.
    df.loc[:, "weekofyear"] = dates.isocalendar().week
    df.loc[:, "month"] = dates.month
    df.loc[:, "dayofweek"] = dates.dayofweek
    # weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    df.loc[:, "weekend"] = (dates.weekday >= 5).astype(int)

    # Column -> list of statistics to compute for each customer.
    aggs = {
        "month": ["nunique", "mean"],
        "weekofyear": ["nunique", "mean"],
        "num1": ["sum", "max", "min", "mean"],
        # Both statistics go in one list: assigning the key twice would
        # silently keep only the last one.
        "customer_id": ["size", "nunique"],
    }

    agg_df = df.groupby("customer_id").agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df

In [13]:
def generate_dataframe(x):
    """Summarise a sequence of values as a dictionary of scalar features.

    Parameters
    ----------
    x : array-like
        Values to summarise. Assumed to be a 1-D numeric series, since it is
        passed directly to NumPy reductions and tsfresh feature
        calculators -- TODO confirm against callers.

    Returns
    -------
    dict
        Maps feature name to value: basic moments and range, selected
        percentiles and quantiles, and a few tsfresh time-series features.
    """
    feature_dict = {}

    # Location, spread and range.
    feature_dict["mean"] = np.mean(x)
    feature_dict["max"] = np.max(x)
    feature_dict["min"] = np.min(x)
    feature_dict["std"] = np.std(x)
    feature_dict["var"] = np.var(x)
    feature_dict["ptp"] = np.ptp(x)

    # Percentiles take a 0-100 rank.
    feature_dict["percentile_10"] = np.percentile(x, 10)
    feature_dict["percentile_60"] = np.percentile(x, 60)
    feature_dict["percentile_90"] = np.percentile(x, 90)

    # Quantiles take a 0-1 fraction.
    feature_dict["quantile_5"] = np.quantile(x, 0.05)
    feature_dict["quantile_95"] = np.quantile(x, 0.95)
    feature_dict["quantile_99"] = np.quantile(x, 0.99)

    # Time-series features from tsfresh's feature_calculators module.
    feature_dict["abs_energy"] = fc.abs_energy(x)
    feature_dict["count_above_mean"] = fc.count_above_mean(x)
    feature_dict["count_below_mean"] = fc.count_below_mean(x)
    feature_dict["mean_abs_change"] = fc.mean_abs_change(x)
    feature_dict["mean_change"] = fc.mean_change(x)

    return feature_dict

In [14]:
# Toy frame: 100 rows of random values in two columns, f_1 and f_2.
df = pd.DataFrame(np.random.rand(100, 2), columns=[f"f_{i}" for i in (1, 2)])

In [16]:
# Preview the first rows of the random frame.
df.head()

Unnamed: 0,f_1,f_2
0,0.048764,0.987769
1,0.341648,0.332173
2,0.945558,0.351221
3,0.076532,0.820681
4,0.479469,0.863153


In [19]:
# Degree-2 polynomial expansion: interaction_only=False and
# include_bias=False are passed explicitly.
pf = preprocessing.PolynomialFeatures(
    include_bias=False, interaction_only=False, degree=2
)

In [21]:
# Fit the expansion on df and apply it in a single call.
poly_feats = pf.fit_transform(df)

# Name the generated columns generically: f_1 ... f_<num_feats>.
num_feats = poly_feats.shape[1]
generic_names = [f"f_{i + 1}" for i in range(num_feats)]
df_transformed = pd.DataFrame(poly_feats, columns=generic_names)

In [27]:
# Preview the first rows of the polynomial-feature frame.
df_transformed.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.048764,0.987769,0.002378,0.048167,0.975687
1,0.341648,0.332173,0.116724,0.113486,0.110339
2,0.945558,0.351221,0.894079,0.3321,0.123356
3,0.076532,0.820681,0.005857,0.062808,0.673517
4,0.479469,0.863153,0.229891,0.413855,0.745033


In [24]:
# Bin f_1 into equal-width intervals at two resolutions; labels=False stores
# the integer bin index rather than an interval label.
for n_bins in (10, 100):
    df[f"f_bin_{n_bins}"] = pd.cut(df["f_1"], bins=n_bins, labels=False)

In [25]:
# Show the frame, now including the two bin-index columns.
# NOTE(review): this renders the whole frame; df.head() would be lighter.
df

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.048764,0.987769,0,4
1,0.341648,0.332173,3,34
2,0.945558,0.351221,9,95
3,0.076532,0.820681,0,7
4,0.479469,0.863153,4,48
...,...,...,...,...
95,0.215611,0.454393,2,21
96,0.577666,0.567087,5,57
97,0.034610,0.372627,0,3
98,0.097178,0.755363,0,9


In [28]:
# Variance of column f_3 before any transformation.
df_transformed.f_3.var()

0.0967203195962463

In [29]:
# Variance of f_3 after a log(1 + x) transform. np.log1p works on the whole
# column at once instead of calling a Python lambda per row, and is more
# accurate for values close to zero.
np.log1p(df_transformed.f_3).var()

0.04977657881847403

In [34]:
# 10x6 matrix of random integers from randint(1, 15), stored as floats so
# cells can hold NaN.
X = np.random.randint(1, 15, (10, 6)).astype(float)

# Blank out 10 distinct cells, addressed by their flat position.
missing_idx = np.random.choice(X.size, 10, replace=False)
X.flat[missing_idx] = np.nan

# Impute the missing cells with a KNN imputer configured with 2 neighbours.
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)

array([[ 7. , 12.5, 14. ,  8.5, 10. ,  7. ],
       [12. ,  9. , 12. ,  7. ,  7.5,  2. ],
       [12. ,  4. ,  6. , 10. , 10. ,  1. ],
       [11. , 14. , 13. , 10. ,  5. ,  4.5],
       [12. , 12. ,  7.5,  2. ,  2. , 13. ],
       [11. , 11. , 14. ,  7. , 14. , 10. ],
       [13. , 10. ,  2. ,  9. ,  5. ,  7. ],
       [ 6. ,  2. ,  6. ,  7. , 14. ,  3. ],
       [ 6. , 14. , 10. ,  8.5, 10. , 10. ],
       [13. ,  2. , 10. , 10. , 11. , 10. ]])