# Feature Engineering.

- Adjustment using power transformation to increase variance.

- Transformation to unsigned integer of 32 bits (`uint32`).

### Why `uint32`?

- Possible reduction in processing consumption.

- Reduced memory usage.

- Unsigned integer: for mathematical reasons it is good to avoid values equal to zero (or numerically close to zero).

- 32 bits: needed to preserve enough numerical precision. With `uint16` or `uint8` there would be a large loss of precision.

#### Author: Israel Oliveira [\[e-mail\]](mailto:prof.israel@gmail.com)

In [1]:
%load_ext watermark

In [2]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer
import numpy as np
from sys import float_info

In [3]:
from tqdm import tqdm

# from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

# from IPython.display import Markdown, display
# def md(arg):
#     display(Markdown(arg))

from pandas_profiling import ProfileReport
#report = ProfileReport(#DataFrame here#, minimal=True)
#report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import json
# def open_file_json(path,mode='r',var=None):
#     if mode == 'w':
#         with open(path,'w') as f:
#             json.dump(var, f)
#     if mode == 'r':
#         with open(path,'r') as f:
#             return json.load(f)

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])

# import json
# from glob import glob
# from typing import NewType


# DictsPathType = NewType("DictsPath", str)


# def open_file_json(path):
#     with open(path, "r") as f:
#         return json.load(f)

# class LoadDicts:
#     def __init__(self, dict_path: DictsPathType = "./data"):
#         Dicts_glob = glob(f"{dict_path}/*.json")
#         self.List = []
#         self.Dict = {}
#         for path_json in Dicts_glob:
#             name = path_json.split("/")[-1].replace(".json", "")
#             self.List.append(name)
#             self.Dict[name] = open_file_json(path_json)
#             setattr(self, name, self.Dict[name])


In [4]:
# Run this cell before closing the notebook: it records the environment the
# results were produced in.
# watermark report; the recorded output below lists interpreter and machine
# details, the git hash/repo/branch and the versions of imported packages.
%watermark -d --iversion -b -r -g -m -v
# First 'model name' line of /proc/cpuinfo, with the label rewritten to 'CPU'.
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
# `free -h` output with each line cut at its first 'i' and the header line
# (the one containing 'total') dropped.
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.25.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 5.11.0-7620-generic
Machine     : x86_64
Processor   : 
CPU cores   : 4
Architecture: 64bit

Git hash: 6cc65848e7eeaa692037ac36e1b5c7b4e28c7212

Git repo: https://github.com/ysraell/creditcardfraud.git

Git branch: main

pandas: 1.3.1
numpy : 1.19.5

CPU	: Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz
Mem:           15G
Swap:         4.0G


In [5]:
# Read the raw transactions CSV into `df` and display it as the cell output.
raw_csv_path = '/work/data/creditcard.csv'
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


# Transformation to increase the variance.

In [6]:
# Box-Cox power transform of every column whose name contains 'V', written
# back into `df` in place. Box-Cox only accepts strictly positive input, so
# each column is shifted before fitting; `standardize=True` asks the
# transformer to standardize its output.
pt = PowerTransformer(method='box-cox', standardize=True)

# Per column: (distinct values after the transform) / (distinct values before).
# Each entry is the one-element Series (index 0) returned by
# DataFrame.nunique(), divided by a scalar.
unique_rel = {}
norm_cols = [col for col in df.columns if 'V' in col]
for col in tqdm(norm_cols):
    vals = df[col].to_numpy()
    # Shift so every value is strictly positive (the smallest becomes 0.1 when
    # the column minimum is <= 0). ndarray.min() replaces the builtin min(),
    # which walks the array element by element in Python.
    vals = vals + abs(vals.min()) + 0.1
    # The same transformer object is refit on each column; it expects 2-D input.
    trans_vals = pt.fit_transform(vals.reshape(-1, 1))
    unique_rel[col] = pd.DataFrame(trans_vals).nunique() / df[col].nunique()
    # NOTE: this overwrites the original column, so re-running the cell without
    # reloading `df` transforms already-transformed data.
    df[col] = trans_vals

100%|██████████| 28/28 [00:11<00:00,  2.47it/s]


In [7]:
#report = ProfileReport(df.query('Class == 1'), minimal=True)
#report.to_notebook_iframe()

In [8]:
#df.to_csv('/work/data/creditcard_trans_float.csv', index=False)

Fraction of unique values kept after the transformation (mean over the transformed columns):

In [9]:
# Mean, over the transformed columns, of the kept-unique-values ratio; each
# entry of `unique_rel` is a one-element Series, hence the `.loc[0]`.
sum(ratio.loc[0] for ratio in unique_rel.values()) / len(norm_cols)

1.0

# Transformation float to `uint32`

In [10]:
# Convert each column in `norm_cols` from float to uint32, in place.
# For every column, increasing decimal scales (10**3 .. 10**15) are tried: the
# scaled values are shifted by |min| + 1 (so the smallest value is 1, never 0,
# when the column minimum is <= 0) and rounded to integers. The search stops at
# the first scale that keeps at least `th` of the distinct values, or just
# before the scaled values would exceed the uint32 range.
th = 0.9999               # minimum fraction of distinct values to preserve
n_digits = range(3,16)    # candidate decimal scales (powers of ten)
max_int = np.iinfo(np.uint32).max - 1
unique_rel = {}           # column -> fraction of distinct values kept (float)
for col in tqdm(norm_cols):
    vals_ori = df[col].to_numpy()
    unique_vals = df[col].nunique()
    # Fallback values: if even the first scale overflows, the column is left as
    # an unchanged float copy and its kept fraction is recorded as 0.0.
    vals_rc = vals_ori.copy()
    nunique_vals_rc = 0.0
    for n in n_digits:
        vals = vals_ori*10**n
        vals = (vals + abs(vals.min()) + 1).round(0)
        if vals.max() >= max_int:
            # This scale no longer fits in uint32: keep the previous result.
            break
        # `vals` is already rounded, so it can be cast directly.
        vals_rc = vals.astype(np.uint32)
        # np.unique counts distinct values without building a Python set of
        # NumPy scalars.
        nunique_vals_rc = np.unique(vals_rc).size/unique_vals
        if nunique_vals_rc >= th:
            break
    unique_rel[col] = nunique_vals_rc
    # NOTE: this overwrites the column, so re-running the cell without
    # recomputing `df` rescales already-converted integers.
    df[col] = vals_rc

100%|██████████| 28/28 [00:10<00:00,  2.73it/s]


Fraction of unique values kept after the conversion to `uint32` (mean over the transformed columns):

In [12]:
# Mean kept-unique-values ratio across the converted columns.
sum(unique_rel.values()) / len(norm_cols)

0.9966083069337769

In [17]:
#df.to_csv('/work/data/creditcard_trans_int.csv', index=False)