In [6]:
import pandas as pd 
# Build a series of timestamps spaced 10 hours apart.
# The lowercase "h" alias is used because the uppercase "H" alias is
# deprecated in recent pandas releases.
s = pd.date_range('2020-01-06', '2020-01-10', freq='10h').to_series()

# Derive calendar features from the datetime series; every entry is an
# array aligned with `s`.
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    # `Series.dt.weekofyear` is no longer available in pandas 2.x;
    # `isocalendar().week` is its replacement. It returns a nullable UInt32
    # column, so cast it to int64 to keep a plain NumPy integer array like
    # the other features.
    "weekofyear": s.dt.isocalendar().week.astype("int64").values,
}
print(s)

2020-01-06 00:00:00   2020-01-06 00:00:00
2020-01-06 10:00:00   2020-01-06 10:00:00
2020-01-06 20:00:00   2020-01-06 20:00:00
2020-01-07 06:00:00   2020-01-07 06:00:00
2020-01-07 16:00:00   2020-01-07 16:00:00
2020-01-08 02:00:00   2020-01-08 02:00:00
2020-01-08 12:00:00   2020-01-08 12:00:00
2020-01-08 22:00:00   2020-01-08 22:00:00
2020-01-09 08:00:00   2020-01-09 08:00:00
2020-01-09 18:00:00   2020-01-09 18:00:00
Freq: 10H, dtype: datetime64[ns]


In [7]:
def generate_features(df):
    """Add calendar columns to ``df`` and aggregate them per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime-like ``date`` column (accessed through
        ``.dt``) together with ``customer_id`` and ``num1`` columns.
        The frame is modified in place: the columns ``year``,
        ``weekofyear``, ``month``, ``dayofweek`` and ``weekend`` are
        written onto it.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id``. Columns are two-level
        ``(source column, statistic)`` pairs; ``customer_id`` is turned
        back into a regular column by ``reset_index``.
    """
    # create a bunch of features using the date column
    df.loc[:, 'year'] = df['date'].dt.year
    # `.dt.weekofyear` is no longer available in pandas 2.x;
    # `isocalendar().week` is the replacement (nullable UInt32 dtype).
    df.loc[:, 'weekofyear'] = df['date'].dt.isocalendar().week
    df.loc[:, 'month'] = df['date'].dt.month
    df.loc[:, 'dayofweek'] = df['date'].dt.dayofweek
    # weekday 5 and 6 are Saturday and Sunday
    df.loc[:, 'weekend'] = (df['date'].dt.weekday >= 5).astype(int)

    # map each column to the list of statistics computed for it
    aggs = {}
    # for month and week of year: number of unique values and the mean
    aggs['month'] = ['nunique', 'mean']
    aggs['weekofyear'] = ['nunique', 'mean']
    # for num1: sum, max, min and mean
    aggs['num1'] = ['sum', 'max', 'min', 'mean']
    # for customer_id: total row count and number of unique values.
    # Both statistics must live in ONE list; assigning the key twice would
    # make the second assignment silently discard the first.
    aggs['customer_id'] = ['size', 'nunique']

    # group by customer_id and calculate the aggregates
    agg_df = df.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df

In [9]:
# Example: derive a set of summary-statistic features from x, a list of values.
import numpy as np

x = [1, 2, 3, 4, 5, 6]

# Central tendency, extremes and spread; ptp is peak-to-peak (max - min).
feature_dict = {
    'mean': np.mean(x),
    'max': np.max(x),
    'min': np.min(x),
    'std': np.std(x),
    'var': np.var(x),
    'ptp': np.ptp(x),
}

# 10th, 60th and 90th percentiles.
feature_dict.update(
    {f'percentile_{pct}': np.percentile(x, pct) for pct in (10, 60, 90)}
)
# The "quantile_*" entries are the 5th, 95th and 99th percentiles,
# also computed with np.percentile.
feature_dict.update(
    {f'quantile_{pct}': np.percentile(x, pct) for pct in (5, 95, 99)}
)

In [12]:
# Example: generate a random dataframe with 2 columns and 100 rows.
# A seeded generator is used so the values are identical on every
# Restart & Run All.

import numpy as np
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random((100, 2)),
    columns=[f"f_{i}" for i in range(1, 3)])
df

Unnamed: 0,f_1,f_2
0,0.351790,0.619431
1,0.370736,0.003406
2,0.873181,0.828888
3,0.050860,0.099718
4,0.595553,0.674058
...,...,...
95,0.242844,0.922599
96,0.289261,0.637347
97,0.928571,0.619644
98,0.730984,0.266064


In [20]:
# Build degree-2 polynomial features with scikit-learn's PolynomialFeatures.
from sklearn import preprocessing

pf = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)
# Fit on the features and expand them in a single step.
poly_feats = pf.fit_transform(df)

# Wrap the expanded array in a dataframe whose columns are numbered
# f_1 ... f_<num_feats>.
num_feats = poly_feats.shape[1]
print('num_feats = ',num_feats)
df_transformed = pd.DataFrame(
    poly_feats,
    columns=["f_" + str(i) for i in range(1, num_feats + 1)],
)
df_transformed


num_feats =  5


Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.351790,0.619431,0.123757,0.217910,0.383694
1,0.370736,0.003406,0.137445,0.001263,0.000012
2,0.873181,0.828888,0.762445,0.723769,0.687055
3,0.050860,0.099718,0.002587,0.005072,0.009944
4,0.595553,0.674058,0.354683,0.401437,0.454354
...,...,...,...,...,...
95,0.242844,0.922599,0.058973,0.224048,0.851189
96,0.289261,0.637347,0.083672,0.184360,0.406211
97,0.928571,0.619644,0.862245,0.575384,0.383959
98,0.730984,0.266064,0.534338,0.194489,0.070790


**Handle missing/NaN values**
- Categorical features: treat a missing value as a new category
- Numerical data: choose a value that does not appear in the specific feature and fill with that value.
    For example:
    - If the value 0 does not occur in the feature, the NaN values can be replaced with 0. This is simple but not very effective.
    - Instead of filling with 0, use the mean or median of all the values in the feature.
    - Use k-nearest neighbours to fill in the missing values.
    - Train a regression model to predict the missing values in one column from the other columns.
    

In [23]:
# KNNImputer: fill each missing value from the nearest neighbouring rows.

import numpy as np
from sklearn import impute

# Seeded generator so the matrix and the positions of the missing values
# are identical on every run.
rng = np.random.default_rng(42)
# 10 samples x 6 features of integers from 1 to 14
# (the upper bound 15 is exclusive)
X = rng.integers(1, 15, size=(10, 6))
# convert the array to float, since NaN cannot be stored in an integer array
X = X.astype(float)
# randomly set 10 distinct elements to NaN (missing)
X.ravel()[rng.choice(X.size, 10, replace=False)] = np.nan
# use the 2 nearest neighbours to fill the NaN values
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)

array([[ 6. ,  8. ,  9. ,  4. ,  8. ,  6. ],
       [ 3. ,  9. ,  5. ,  2. ,  9.5,  1. ],
       [ 2. , 12. ,  7. ,  3.5, 13. ,  6. ],
       [ 9. ,  8. ,  7. ,  2. ,  8. ,  2. ],
       [ 5. , 10. , 13. ,  1. ,  3. ,  2. ],
       [ 3. ,  9. ,  7. ,  6. ,  6. ,  3.5],
       [14. ,  3. , 11. ,  2. ,  8. ,  3. ],
       [11. ,  4. , 14. ,  2. , 11. , 11. ],
       [ 2.5, 13. ,  4. ,  5. ,  8. , 12. ],
       [ 2.5,  7. ,  3. , 12. , 10.5, 10. ]])

**Note** 
- Look at the data and see what fits and create features accordingly 
- Scale or normalize your features if using linear models (logistic regression, SVM, ...). 
  Tree-based models will work fine without any normalization of features