In [16]:
from contextlib import contextmanager
from datetime import datetime
from glob import glob
from time import time, sleep
import feather
import logging
import numpy as np
import os
import pandas as pd
import platform
import sys

In [17]:
def start(fname):
    """Start the module-wide stopwatch and print a run banner.

    The current wall-clock time is stored in the global ``st_time`` (read
    later by ``elapsed_time``), then the date, the given label, the process
    id, the host name and the platform string are printed between two
    80-character rules.

    Args:
        fname: Label shown on the ``FILE:`` line of the banner.
    """
    global st_time
    st_time = time()

    rule = "=" * 80
    print(rule)
    print("DATE: {}".format(datetime.now()))
    # The remaining banner fields are cheap lookups; emit them in a loop.
    for template, value in (
        ("FILE: {}", fname),
        ("PID: {}", os.getpid()),
        ("HOST: {}", platform.node()),
        ("ENV: {}", platform.platform()),
    ):
        print(template.format(value))
    print(rule)


def end():
    """Print the elapsed-time footer.

    Writes an 80-character rule, the number of seconds reported by
    ``elapsed_time()`` formatted with two decimals, and a closing rule.
    """
    rule = "=" * 80
    print(rule)
    seconds = elapsed_time()
    print("ELAPSED: {:.2f} (sec)".format(seconds))
    print(rule)


def elapsed_time():
    """Return the seconds elapsed since the global ``st_time`` was last set.

    ``st_time`` is assigned by ``start()`` and ``reset_time()``; calling this
    before either of them raises ``NameError``.
    """
    now = time()
    return now - st_time


def reset_time():
    """Restart the stopwatch by storing the current time in the global ``st_time``."""
    global st_time
    now = time()
    st_time = now

In [18]:
def to_feather(df, path="./"):
    """Write every column of ``df`` to its own Feather file ``<path>/<column>.f``.

    The index of ``df`` is reset **in place** (the old index is dropped)
    before writing. The target directory is created when missing.

    Existing files are never overwritten: as soon as a target file is found
    the function prints a warning and terminates via ``sys.exit()``. Columns
    written before that point stay on disk.

    Args:
        df: DataFrame to persist; its index is reset as a side effect.
        path: Directory that receives one ``.f`` file per column.

    Raises:
        SystemExit: If a target file already exists.
    """
    df.reset_index(inplace=True, drop=True)
    os.makedirs(path, exist_ok=True)
    print(f"write to {path}")
    for c in df.columns:
        path_file = os.path.join(path, f"{c}.f")
        # Check the literal path. A glob() lookup would treat characters such
        # as "[", "*" or "?" in a column name as a pattern, miss the real file
        # and let it be overwritten.
        if os.path.lexists(path_file):
            print(f"WARNIG: {path_file} is exists!")
            sys.exit()
        df[[c]].to_feather(path_file)

            
def read_feather(path="./", col=None):
    """Read column files written by ``to_feather`` back into a DataFrame.

    Args:
        path: Directory that holds the ``<column>.f`` files.
        col: ``None`` to load every ``*.f`` file in ``path`` (concatenated
            column-wise in sorted file-name order), or the name of a single
            column. The name may be given with or without the ``.f`` suffix.

    Returns:
        DataFrame with the requested column(s).

    Raises:
        SystemExit: If ``col`` is given and no matching file exists.
    """
    if col is None:
        path_file = os.path.join(path, "*.f")
        print(f"read {path_file}")
        frames = [pd.read_feather(f) for f in sorted(glob(path_file))]
        return pd.concat(frames, axis=1)

    # to_feather stores a column as "<name>.f"; also accept a file name that
    # was passed verbatim.
    candidates = (
        os.path.join(path, f"{col}.f"),
        os.path.join(path, str(col)),
    )
    for path_file in candidates:
        if os.path.isfile(path_file):
            print(f"read {path_file}")
            return pd.read_feather(path_file)

    print(f"WARNING: {candidates[0]} does not exist!")
    sys.exit()

In [19]:
def to_pickles(df, path="./"):
    """Pickle every column of ``df`` to its own file ``<path>/<column>.f``.

    The index of ``df`` is reset **in place** (the old index is dropped)
    before writing. The target directory is created when missing.

    Existing files are never overwritten: as soon as a target file is found
    the function prints a warning and terminates via ``sys.exit()``. Columns
    written before that point stay on disk.

    Args:
        df: DataFrame to persist; its index is reset as a side effect.
        path: Directory that receives one ``.f`` pickle file per column.

    Raises:
        SystemExit: If a target file already exists.
    """
    df.reset_index(inplace=True, drop=True)
    os.makedirs(path, exist_ok=True)
    print(f"write to {path}")
    for c in df.columns:
        path_file = os.path.join(path, f"{c}.f")
        # Check the literal path. A glob() lookup would treat characters such
        # as "[", "*" or "?" in a column name as a pattern, miss the real file
        # and let it be overwritten.
        if os.path.lexists(path_file):
            print(f"WARNIG: {path_file} is exists!")
            sys.exit()
        df[[c]].to_pickle(path_file)


def read_pickles(path="./", col=None):
    """Read column files written by ``to_pickles`` back into a DataFrame.

    Args:
        path: Directory that holds the ``<column>.f`` pickle files.
        col: ``None`` to load every ``*.f`` file in ``path`` (concatenated
            column-wise in sorted file-name order), or the name of a single
            column. The name may be given with or without the ``.f`` suffix.

    Returns:
        DataFrame with the requested column(s).

    Raises:
        SystemExit: If ``col`` is given and no matching file exists.
    """
    # NOTE(security): unpickling executes arbitrary code; only point this at
    # directories whose files you wrote yourself.
    if col is None:
        path_file = os.path.join(path, "*.f")
        print(f"read {path_file}")
        frames = [pd.read_pickle(f) for f in sorted(glob(path_file))]
        return pd.concat(frames, axis=1)

    # to_pickles stores a column as "<name>.f"; also accept a file name that
    # was passed verbatim.
    candidates = (
        os.path.join(path, f"{col}.f"),
        os.path.join(path, str(col)),
    )
    for path_file in candidates:
        if os.path.isfile(path_file):
            print(f"read {path_file}")
            return pd.read_pickle(path_file)

    print(f"WARNING: {candidates[0]} does not exist!")
    sys.exit()

In [20]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are touched. Integer columns become the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive); float columns become float16, float32 or float64 by
    the same range test.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the final size in MB and the reduction in
            percent.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): the choice is made on value range only, so casting
            # to float16/float32 can silently round the stored values.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that occupies zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [21]:
@contextmanager
def timer(name, logger=None, level=logging.DEBUG):
    """Context manager that reports when a block starts and how long it took.

    Args:
        name: Label placed in square brackets in both messages.
        logger: Object with a ``log(level, msg)`` method. When ``None`` the
            messages are written with ``print``.
        level: Level passed to ``logger.log``; unused when printing.

    The "done" message is emitted only when the block finishes without
    raising; the duration is rounded to whole seconds.
    """
    def emit(msg):
        if logger is None:
            print(msg)
        else:
            logger.log(level, msg)

    began = time()
    emit(f'[{name}] start')
    yield
    emit(f'[{name}] done in {time() - began:.0f} s')

In [22]:
def get_dummies(df):
    """One-hot encode every object-dtype column of ``df``.

    Object columns with exactly two distinct non-null values are encoded as
    a single indicator column (``drop_first=True``); all other object columns
    get one indicator column per value.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame in which the object columns are replaced by their
        indicator columns (multi-valued columns first, binary ones last).
    """
    object_cols = df.select_dtypes('O').columns.tolist()
    level_counts = df[object_cols].nunique()
    two_level = level_counts[level_counts == 2].index.tolist()
    multi_level = [c for c in object_cols if c not in two_level]

    encoded = pd.get_dummies(df, columns=multi_level)
    return pd.get_dummies(encoded, columns=two_level, drop_first=True)

In [39]:
def _target_mean_encoding(df, src_col, tgt_col):
    """
    target mean encoding
    target が数値のときに利用できる
    """
    target_mean = df.groupby(src_col)[tgt_col].mean().astype("float16")
    df = df[src_col].map(target_mean).copy()
    df.name = src_col+"_"+tgt_col+"_target_mean"
    return df


def target_mean_encoding(df, tgt_col):
    """
    Target mean encoding.
    Applies target mean encoding to every categorical (object-dtype) column.

    ``tgt_col`` itself is skipped when it is an object column. Returns a
    DataFrame holding one encoded column per source column.
    """
    categorical = [
        name
        for name in df.select_dtypes("O").columns.tolist()
        if name != tgt_col
    ]
    encoded_parts = []
    for name in categorical:
        encoded_parts.append(_target_mean_encoding(df, name, tgt_col))
    return pd.concat(encoded_parts, axis=1)


def _bin_counting(df, src_col, tgt_col):
    """
    bin counting
    target がカテゴリのときに利用できる
    """
    cross_df         = pd.crosstab(df[src_col], df[tgt_col], normalize="index")
    labels           = [src_col+"_"+tgt_col+"_"+str(col) for col in cross_df.columns]
    cross_df.columns = labels
    df  = pd.merge(df, cross_df, left_on=src_col, right_index=True, how="left")
    return df[labels]


def bin_counting(df, tgt_col):
    """
    Bin counting.
    Applies bin counting between every categorical (object-dtype) column and
    ``tgt_col``.

    ``tgt_col`` itself is skipped when it is an object column. Returns a
    DataFrame with the concatenated ratio columns of all source columns.
    """
    categorical = [
        name
        for name in df.select_dtypes('O').columns.tolist()
        if name != tgt_col
    ]
    ratio_parts = []
    for name in categorical:
        ratio_parts.append(_bin_counting(df, name, tgt_col))
    return pd.concat(ratio_parts, axis=1)

In [58]:
# バックオフ：少数派カテゴリ値を抽出
def _backoff_cand(df, cutoff=0.01):
    col = df.select_dtypes('O').columns.tolist()
    cand = {c: [idx for idx, val in df[c].value_counts(normalize=True, dropna=False).iteritems() if val < cutoff] for c in col}
    cand = {key: val for key, val in cand.items() if val!=[]} # 空リストを削除
    return cand


# バックオフ：少数派カテゴリ値をダミーで置き換え
def _backoff_replace(df, cand, inplace, prfx, rep):
    _df = pd.DataFrame({})
    col = []
    for key, val in cand.items():
        tmp = df[key]
        col_name = prfx+str(key)
        for idx in val:
            tmp = tmp.replace(idx, rep)
        _df[col_name] = tmp
        col.append(col_name)
    return _df


def backoff(df, prfx="@", rep="@backoff", inplace=False, cutoff=0.01):
    """Collapse rare category values of the object columns into one dummy value.

    Args:
        df: Input DataFrame.
        prfx: Prefix of the added columns when ``inplace`` is false.
        rep: Value that replaces every rare category value.
        inplace: When true, overwrite the affected columns of ``df`` and
            return ``df``; otherwise return a new frame made of ``df`` plus
            the prefixed replacement columns.
        cutoff: Relative-frequency threshold below which a value counts as
            rare. Defaults to the 0.01 that was previously fixed.

    Returns:
        ``df`` itself (``inplace=True``) or a new concatenated DataFrame.
    """
    cand = _backoff_cand(df, cutoff)
    _df = _backoff_replace(df, cand, inplace, prfx, rep)
    if not inplace:
        return pd.concat([df, _df], axis=1)
    for key in cand:
        df[key] = _df[prfx + str(key)]
    return df

In [61]:
# Smoke test of the run banner: start() sets the stopwatch and prints the
# header, end() prints the elapsed seconds.
start("hoge")
end()

DATE: 2019-04-01 18:00:42.306797
FILE: hoge
PID: 9016
HOST: Y-project-11
ENV: Windows-10-10.0.17134-SP0
ELAPSED: 0.00 (sec)


In [28]:
# Small sample frame (two integer columns, one string column) used by the
# round-trip and memory-reduction demos that follow.
df = pd.DataFrame({"a":[1,2,3,4,5], "b":[6,7,8,9,10], "c":["a","b","c","d","e"]})
df

Unnamed: 0,a,b,c
0,1,6,a
1,2,7,b
2,3,8,c
3,4,9,d
4,5,10,e


In [10]:
# Feather round trip: write one file per column, then read them all back.
# NOTE: to_feather() exits when a target file already exists, so this cell
# fails on a second run unless "new_folder00" is removed first.
to_feather(df, "new_folder00")
df00 = read_feather("new_folder00")
df00

write to new_folder00
read new_folder00\*.f


Unnamed: 0,a,b,c
0,1,6,a
1,2,7,b
2,3,8,c
3,4,9,d
4,5,10,e


In [11]:
# Pickle round trip: write one file per column, then read them all back.
# NOTE: to_pickles() exits when a target file already exists, so this cell
# fails on a second run unless "new_folder01" is removed first.
to_pickles(df, "new_folder01")
df01 = read_pickles("new_folder01")
df01

write to new_folder01
read new_folder01\*.f


Unnamed: 0,a,b,c
0,1,6,a
1,2,7,b
2,3,8,c
3,4,9,d
4,5,10,e


In [12]:
# Time the dtype downcast; reduce_mem_usage() modifies df in place and also
# returns it, so the reassignment keeps the same object.
with timer("reduce mem usage"):
    df = reduce_mem_usage(df)

[reduce mem usage] start
Mem. usage decreased to  0.00 Mb (35.0% reduction)
[reduce mem usage] done in 0 s


In [13]:
# Time the one-hot encoding.
# NOTE(review): the timer label says "reduce" although the timed call is
# get_dummies(), and the encoded frame it returns is discarded.
with timer("reduce"):
    get_dummies(df)

[reduce] start
[reduce] done in 0 s


In [15]:
# Sanity check of timer(): ten 0.1 s sleeps should be reported as about 1 s.
with timer("timer"):
    for _ in range(10):
        sleep(0.1)

[timer] start
[timer] done in 1 s


In [63]:
# Toy data set with two categorical columns, one float column and an integer
# label, used to try out the encoders defined earlier.
a = {
    "cat1": ["a", "b", "a", "a", "c", "c", "c"],
    "cat2": ["A", "A", "B", "B", "B", "C", "C"],
    "numeric":  [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    "label":    [1, 1, 1, 2, 2, 3, 3]
}
with timer("proprocessing"):
    df = pd.DataFrame(a)
    # Alternative encoders, left disabled:
    # get_dummies(df)
    # bin_counting(df, "label")
    # target_mean_encoding(df, "numeric")
    # NOTE(review): with 7 rows the rarest category has a share of 1/7, which
    # is above the default cutoff of 0.01, so this call replaces nothing.
    backoff(df, inplace=True)

[proprocessing] start
[proprocessing] done in 0 s
