In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd

from fastai.structured import *

In [2]:
pd.to_datetime("3/11/2000")

Timestamp('2000-03-11 00:00:00')

In [3]:
pd.to_datetime("201409112300")

Timestamp('2014-09-11 23:00:00')

In [4]:
def to_datetime(time):
    return "20" + str(time) + "00"

In [5]:
pd.to_datetime(to_datetime(14091123))

Timestamp('2014-09-11 23:00:00')

In [6]:
df = pd.DataFrame({ 'A' : pd.to_datetime([to_datetime(14091123)]) })

In [7]:
df

Unnamed: 0,A
0,2014-09-11 23:00:00


In [8]:
add_datepart(df, "A", drop=False)

In [9]:
df

Unnamed: 0,A,AYear,AMonth,AWeek,ADay,ADayofweek,ADayofyear,AIs_month_end,AIs_month_start,AIs_quarter_end,AIs_quarter_start,AIs_year_end,AIs_year_start,AElapsed
0,2014-09-11 23:00:00,2014,9,37,11,3,254,False,False,False,False,False,False,1410476400


In [10]:
df = pd.DataFrame({ 'c1' : ['a','b','c','a'] })

In [11]:
df

Unnamed: 0,c1
0,a
1,b
2,c
3,a


In [17]:
df["B"] = df["c1"].astype('category')

In [22]:
df['B']

0    a
1    b
2    c
3    a
Name: B, dtype: category
Categories (3, object): [a, b, c]

In [21]:
len(df["B"].cat.categories)

3

## target encode

In [33]:
import pandas as pd
import category_encoders as ce

In [24]:
from sklearn.datasets import load_boston

In [25]:
bunch = load_boston()

In [26]:
y = bunch.target

In [28]:
X = pd.DataFrame(bunch.data, columns=bunch.feature_names)

In [29]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [38]:
X['RAD'].unique()

array([ 1.,  2.,  3.,  5.,  4.,  8.,  6.,  7., 24.])

In [35]:
enc = ce.TargetEncoder(cols=['CHAS', 'RAD']).fit(X, y)

In [36]:
numeric_dataset = enc.transform(X)

In [41]:
numeric_dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,CHAS,RAD
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,296.0,15.3,396.9,4.98,22.093843,24.365
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,9.14,22.093843,26.833333
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,4.03,22.093843,26.833333
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,2.94,22.093843,27.928947
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,5.33,22.093843,27.928947


## mean encode

In [85]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from itertools import product

class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode

        :param n_splits: the number of splits used in mean encoding

        :param target_type: str, 'regression' or 'classification'

        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """

        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
            
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        ## 先验概率
        prior = X_train['pred_temp'].mean()

        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        # 先验 + 后验
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)
        # 取出了train中每个值的编码
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        # 如果test中出现了没有特征类别，则使用先验概率填充
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values

        return nf_train, nf_test, prior, col_avg_y
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()

        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new

## mean encode  使用

In [69]:
c1 = np.random.randint(0,10,1000)
y = np.random.randint(0,2,1000)

In [72]:
df = pd.DataFrame({"c1":c1,"y":y})

In [97]:
df

Unnamed: 0,c1,y
0,6,1
1,4,1
2,3,0
3,5,0
4,1,0
5,0,0
6,7,0
7,4,0
8,2,1
9,9,1


In [101]:
df.groupby(by='c1',axis=0)['y'].agg({'size':'size'})

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,size
c1,Unnamed: 1_level_1
0,100
1,96
2,115
3,94
4,91
5,119
6,98
7,101
8,92
9,94


In [86]:
encode = MeanEncoder(['c1'],2)

In [87]:
encode.fit_transform(df,df['y'])

    c1_pred_0
c1           
0    0.562500
1    0.475000
2    0.472727
3    0.488889
4    0.469388
5    0.540984
6    0.526316
7    0.672727
8    0.461538
9    0.509804
    c1_pred_0
c1           
0    0.538462
1    0.625000
2    0.400000
3    0.469388
4    0.571429
5    0.534483
6    0.536585
7    0.586957
8    0.490566
9    0.488372
    c1_pred_1
c1           
0    0.437500
1    0.525000
2    0.527273
3    0.511111
4    0.530612
5    0.459016
6    0.473684
7    0.327273
8    0.538462
9    0.490196
    c1_pred_1
c1           
0    0.461538
1    0.375000
2    0.600000
3    0.530612
4    0.428571
5    0.465517
6    0.463415
7    0.413043
8    0.509434
9    0.511628


is deprecated and will be removed in a future version


Unnamed: 0,c1,y,c1_pred_0,c1_pred_1
0,6,1,0.526316,0.473684
1,4,1,0.469388,0.530612
2,3,0,0.488889,0.511111
3,5,0,0.540984,0.459016
4,1,0,0.475000,0.525000
5,0,0,0.562500,0.437500
6,7,0,0.672727,0.327273
7,4,0,0.469388,0.530612
8,2,1,0.472727,0.527273
9,9,1,0.509804,0.490196


In [75]:
skf = StratifiedKFold(5)

In [78]:
for l,s in skf.split(y,y):
    print(l.shape,s.shape)

(799,) (201,)
(799,) (201,)
(800,) (200,)
(801,) (199,)
(801,) (199,)


In [79]:
col_avg_y = df.groupby(by='c1', axis=0)['y'].agg({'mean': 'mean', 'beta': 'size'})
col_avg_y

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,mean,beta
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.45,100
1,0.4375,96
2,0.565217,115
3,0.521277,94
4,0.483516,91
5,0.462185,119
6,0.469388,98
7,0.366337,101
8,0.521739,92
9,0.5,94


In [76]:
{'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(['c1'], [0,1])}

{'c1_pred_0': [], 'c1_pred_1': []}

In [None]:
encode.

In [43]:
iris =  sklearn.datasets.load_iris()

In [46]:
iris.target # 有3个类别

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [49]:
(iris.data.shape, iris.feature_names) # 150 行数据，4组数据

((150, 4),
 ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'])

In [50]:
wine = sklearn.datasets.load_wine()

In [51]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [52]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [54]:
wine.data[0]

array([  14.23,    1.71,    2.43,   15.6 ,  127.  ,    2.8 ,    3.06,    0.28,    2.29,    5.64,    1.04,
          3.92, 1065.  ])

In [55]:
breast_cancer = sklearn.datasets.load_breast_cancer()

In [57]:
breast_cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness',
       'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error',
       'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error',
       'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry',
       'worst fractal dimension'], dtype='<U23')

In [59]:
df = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)

In [63]:
df.iloc[0]

mean radius                  17.990000
mean texture                 10.380000
mean perimeter              122.800000
mean area                  1001.000000
mean smoothness               0.118400
mean compactness              0.277600
mean concavity                0.300100
mean concave points           0.147100
mean symmetry                 0.241900
mean fractal dimension        0.078710
radius error                  1.095000
texture error                 0.905300
perimeter error               8.589000
area error                  153.400000
smoothness error              0.006399
compactness error             0.049040
concavity error               0.053730
concave points error          0.015870
symmetry error                0.030030
fractal dimension error       0.006193
worst radius                 25.380000
worst texture                17.330000
worst perimeter             184.600000
worst area                 2019.000000
worst smoothness              0.162200
worst compactness        

In [60]:
list(zip(df.columns, df.dtypes))

[('mean radius', dtype('float64')),
 ('mean texture', dtype('float64')),
 ('mean perimeter', dtype('float64')),
 ('mean area', dtype('float64')),
 ('mean smoothness', dtype('float64')),
 ('mean compactness', dtype('float64')),
 ('mean concavity', dtype('float64')),
 ('mean concave points', dtype('float64')),
 ('mean symmetry', dtype('float64')),
 ('mean fractal dimension', dtype('float64')),
 ('radius error', dtype('float64')),
 ('texture error', dtype('float64')),
 ('perimeter error', dtype('float64')),
 ('area error', dtype('float64')),
 ('smoothness error', dtype('float64')),
 ('compactness error', dtype('float64')),
 ('concavity error', dtype('float64')),
 ('concave points error', dtype('float64')),
 ('symmetry error', dtype('float64')),
 ('fractal dimension error', dtype('float64')),
 ('worst radius', dtype('float64')),
 ('worst texture', dtype('float64')),
 ('worst perimeter', dtype('float64')),
 ('worst area', dtype('float64')),
 ('worst smoothness', dtype('float64')),
 ('worst 