# Part 3 - Data Prep

Course reference: https://www.udemy.com/course/feature-engineering-for-machine-learning

* Types and characteristics of data
* Missing data imputation
* Categorical encoding
* Variable transformation
* Discretization
* Outliers
* Datetime
* Scaling
* Feature creation

## Load Data

In [None]:
import pandas as pd

# Load the raw CSV into `df`, the frame the rest of the notebook works on.
df = pd.read_csv('.../created_raw_data.csv')
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
# Last expression of the cell: rendered as the cell's output.
df.head()

In [None]:
import preppy.utils as utils
from preppy.version import __version__

# Show which PrepPy release produced the report below.
print(__version__)

# Write the data-prep report for `df`. The captured output lists constant and
# quasi-constant columns, duplicate rows/columns and features on noticeably
# larger scales.
# NOTE(review): `thresh` is presumably the quasi-constant cut-off — confirm
# against preppy's write_report.
utils.report.write_report(df, thresh=.5)

PrepPy Version: 0.1.0
REPORT FOR DATA PREP

#################################################
Columns with Constant Values
#################################################
['constant_2', 'constant_1']

#################################################
Columns with Quasi-Constant Values
#################################################
['binary', 'class', 'constant_1', 'constant_2', 'pd qcut1', 'random choice 2']

#################################################
Duplicate Rows
#################################################
20

#################################################
Duplicate Columns
#################################################
['duplicate_1', 'constant_1', 'duplicate_2']

#################################################
Variables with Noticeably Higher Scales
#################################################
Features with Noticeably Higher Scales (Based on Standard Deviation):
standard scaling    10208.332456
Name: std, dtype: float64

Features with Noticeably High

In [None]:
import preppy.utils as preppy

# Run PrepPy's column checks on the raw frame; the three result lists are
# reused by later cells.
checks = preppy.functions
consts = checks.identify_consts(df)
quasi_consts = checks.identify_quasi_consts(df)
duplicates = checks.check_col_duplicates(df)

# One list per line: duplicates first, then constants, then quasi-constants.
print(duplicates, consts, quasi_consts, sep='\n')

['duplicate_1', 'constant_1', 'duplicate_2']
['constant_2', 'constant_1']
['constant_1', 'constant_2']


In [None]:
# Collect every column flagged as constant, quasi-constant or duplicate, once each.
# dict.fromkeys de-duplicates while keeping first-seen order, so the printed
# order is the same on every run (iterating a set of strings is not).
all_deletes = list(dict.fromkeys(consts + quasi_consts + duplicates))

# NOTE: df_numerical, df_object, df_discreet and df_categorical_features are
# assigned in a later cell of this notebook (from `var_types`); that cell has
# to run before this one.
for col in all_deletes:
  print(col, df[col].dtype)
  # Pick the bookkeeping lists that track this column, based on its dtype.
  if df[col].dtype in ['float64', 'int64']:
    tracked_lists = [df_numerical]
  elif df[col].dtype in ['object']:
    tracked_lists = [df_object, df_categorical_features]
  else:
    tracked_lists = [df_discreet]
  for names in tracked_lists:
    # list.remove raises ValueError for a missing item, so only remove
    # columns that are actually tracked (makes the cell safe to re-run).
    if col in names:
      names.remove(col)


duplicate_2 float64
duplicate_1 float64
constant_2 object
constant_1 object


## PrepPy Pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import preppy.utils as preppy

# Cleaning steps, applied in order by the scikit-learn Pipeline.
pipe = [
    ('constants', preppy.classes.RemoveConstants()),
    ('quasiconsts', preppy.classes.RemoveQuasiConstants(thresh=0.8)),
    ('duplicates', preppy.classes.DropDuplicates()),
    ('missing', preppy.classes.HandleMissingValues()),
    # ('encoding', HandleCatEncodeing())
]

pipe_model = Pipeline(pipe)
data = pipe_model.fit_transform(df)

# Build the removed-name lookup once instead of re-concatenating the three
# lists for every column in the comprehension.
removed = set(consts + quasi_consts + duplicates)

# Column labels are reconstructed from the lists computed in an earlier cell,
# not taken from the pipeline itself.
# NOTE(review): RemoveQuasiConstants runs with thresh=0.8 here, while
# identify_quasi_consts was called without a threshold, so the two may
# disagree about which columns are dropped — verify the labels line up.
cols = [col for col in df.columns if col not in removed]
nu_df = pd.DataFrame(data, columns=cols)
nu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   binary                 1000 non-null   object
 1   given_name             1000 non-null   object
 2   surname                1000 non-null   object
 3   date_of_birth          1000 non-null   object
 4   phone_number           1000 non-null   object
 5   email                  1000 non-null   object
 6   address                1000 non-null   object
 7   city                   1000 non-null   object
 8   state                  1000 non-null   object
 9   zipcode                1000 non-null   object
 10  correlated w target 2  1000 non-null   object
 11  standard scaling       1000 non-null   object
 12  pd qcut2               1000 non-null   object
 13  uniform corr 1         1000 non-null   object
 14  informative_1          1000 non-null   object
 15  semi_constant_2       

  dfx[feat] = df[feat].fillna(df[feat].mode()[0])


In [None]:
import pickle

# Load the pickled variable from the file
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# open pickle files that come from a trusted source.
with open('.../var_types.pkl', 'rb') as f:
    var_types = pickle.load(f)

# The printed value is a dict with the keys 'df_numerical', 'df_object',
# 'df_discreet' and 'df_categorical_features', each mapping to a list of
# column names.
print(var_types)

{'df_numerical': ['correlated w target 2', 'standard scaling', 'uniform corr 1', 'informative_1', 'outliers 1', 'correlated w target 1', 'min max scaling', 'target', 'multicollinearity 4', 'multicollinearity 2', 'multicollinearity 1', 'informative_2', 'corr_feature_class', 'uniform corr 2', 'outliers 2', 'multicollinearity 3', 'class'], 'df_object': ['binary', 'given_name', 'surname', 'date_of_birth', 'phone_number', 'email', 'address', 'city', 'state', 'zipcode', 'semi_constant_2', 'semi_constant_1', 'random label num 12', 'random label num 4', 'random choice 2', 'random choice 7', 'random choice 4'], 'df_discreet': ['pd qcut2', 'pd qcut1', 'pd qcut3'], 'df_categorical_features': ['binary', 'given_name', 'surname', 'date_of_birth', 'phone_number', 'email', 'address', 'city', 'state', 'zipcode', 'pd qcut2', 'semi_constant_2', 'pd qcut1', 'semi_constant_1', 'random label num 12', 'pd qcut3', 'random label num 4', 'random choice 2', 'random choice 7', 'random choice 4']}


In [None]:
# Unpack the four column-name lists from the loaded `var_types` mapping.
df_numerical, df_object, df_discreet, df_categorical_features = (
    var_types[key]
    for key in ('df_numerical', 'df_object', 'df_discreet', 'df_categorical_features')
)

In [None]:
# code along
# Convert the numerical feature columns to float, write them back into the
# frame, then re-inspect the dtypes.
numeric_as_float = nu_df[df_numerical].astype(float)
nu_df[df_numerical] = numeric_as_float
nu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   binary                 1000 non-null   object 
 1   given_name             1000 non-null   object 
 2   surname                1000 non-null   object 
 3   date_of_birth          1000 non-null   object 
 4   phone_number           1000 non-null   object 
 5   email                  1000 non-null   object 
 6   address                1000 non-null   object 
 7   city                   1000 non-null   object 
 8   state                  1000 non-null   object 
 9   zipcode                1000 non-null   object 
 10  correlated w target 2  1000 non-null   float64
 11  standard scaling       1000 non-null   float64
 12  pd qcut2               1000 non-null   object 
 13  uniform corr 1         1000 non-null   float64
 14  informative_1          1000 non-null   float64
 15  semi_

## Feature Engineering

### Feature Combination

In [None]:
# Add the two scaling features into one engineered column, then discard the
# source columns from `df` in place.
scaling_cols = ['standard scaling', 'min max scaling']
df['scaling_combined'] = df[scaling_cols[0]] + df[scaling_cols[1]]
df.drop(columns=scaling_cols, inplace=True)

### Categorical Encoding

In [None]:
# code along
import preppy.utils as utils

# Replace `df` with the result of PrepPy's do_OHE helper.
# NOTE(review): presumably one-hot encodes the categorical columns — confirm
# against preppy.utils.functions.do_OHE.
df = utils.functions.do_OHE(df)
# Inspect the resulting columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 45 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   binary                             1000 non-null   int64  
 1   given_name                         1000 non-null   float64
 2   surname                            1000 non-null   float64
 3   date_of_birth                      1000 non-null   float64
 4   phone_number                       1000 non-null   float64
 5   email                              1000 non-null   float64
 6   address                            1000 non-null   float64
 7   city                               1000 non-null   float64
 8   state                              1000 non-null   float64
 9   zipcode                            1000 non-null   float64
 10  correlated w target 2              1000 non-null   float64
 11  standard scaling                   1000 non-null   float6

In [None]:
# Write the prepared frame to CSV; index=False leaves the row index out of the file.
df.to_csv('.../prepared_data.csv', index=False)