## Import packages

In [None]:
# @title
# Manage data and statistics
import numpy as np
from numpy.random import default_rng, SeedSequence
import pandas as pd

from scipy import stats
from scipy.stats import norm, skewnorm, yeojohnson, boxcox, zscore
from scipy.stats.mstats import winsorize

# Plot data
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
sns.set(style="white")

# Scale variables
from sklearn.preprocessing import scale, StandardScaler
from sklearn.preprocessing import minmax_scale, MinMaxScaler
from sklearn.preprocessing import maxabs_scale, MaxAbsScaler
from sklearn.preprocessing import robust_scale, RobustScaler

# Transform variables
from sklearn.preprocessing import quantile_transform, QuantileTransformer
from sklearn.preprocessing import power_transform, PowerTransformer

# Encode categorical variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder

# Discretize continuous variables
from sklearn.preprocessing import KBinsDiscretizer

# Impute missing values
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## Read data

In [None]:
# Fixed seed so the random `measurement` column is identical on every
# Restart & Run All (an unseeded generator yields new values on each run).
SEED = 42

# Small illustrative dataset: seven rows mixing integer, datetime,
# categorical (string), boolean and float columns.
data = {
    'id': [1000, 1001, 1002, 1003, 1004, 1005, 1006],
    # ISO 8601 date string: unambiguous regardless of day/month conventions.
    'date': pd.date_range('2020-04-25', periods=7, freq='D'),
    'age': [21, 56, 33, 48, 27, 42, 32],
    'income': [67000, 220000, 97000, 166000, 81000, 157000, 96000],
    'gender': ['Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female'],
    'education': ['Bachelors', 'PhD', 'Masters', 'Masters', 'Bachelors', 'Bachelors', 'Bachelors'],
    'passed': [False, True, True, True, False, False, True],
    # Seven standard-normal draws, rounded to two decimals.
    'measurement': default_rng(SEED).standard_normal(7).round(2)
}

df = pd.DataFrame(data)
df

Unnamed: 0,id,date,age,income,gender,education,passed,measurement
0,1000,2020-04-25,21,67000,Male,Bachelors,False,-1.8
1,1001,2020-04-26,56,220000,Female,PhD,True,-1.27
2,1002,2020-04-27,33,97000,Female,Masters,True,0.79
3,1003,2020-04-28,48,166000,Male,Masters,True,0.17
4,1004,2020-04-29,27,81000,Male,Bachelors,False,-1.0
5,1005,2020-04-30,42,157000,Female,Bachelors,False,-1.28
6,1006,2020-05-01,32,96000,Female,Bachelors,True,-1.5


## Insert missing values into data

In [None]:
# Build a copy of `df` with missing values injected into selected cells;
# `df` itself is left untouched.
rows_with_gaps = [3, 5]
# Select columns by name rather than by position, so the cell keeps working
# if the column order of `df` ever changes.
cols_with_gaps = ['age', 'gender', 'passed', 'measurement']

# NaN cannot be stored in an int64 or bool column. Cast those columns up front
# (age -> float64, passed -> object) instead of relying on pandas silently
# upcasting during assignment, which recent pandas versions deprecate.
df2 = df.astype({'age': 'float64', 'passed': 'object'})
df2.loc[rows_with_gaps, cols_with_gaps] = np.nan
df2

Unnamed: 0,id,date,age,income,gender,education,passed,measurement
0,1000,2020-04-25,21.0,67000,Male,Bachelors,False,-1.8
1,1001,2020-04-26,56.0,220000,Female,PhD,True,-1.27
2,1002,2020-04-27,33.0,97000,Female,Masters,True,0.79
3,1003,2020-04-28,,166000,,Masters,,
4,1004,2020-04-29,27.0,81000,Male,Bachelors,False,-1.0
5,1005,2020-04-30,,157000,,Bachelors,,
6,1006,2020-05-01,32.0,96000,Female,Bachelors,True,-1.5


## 1. Deal with missing values

## 1a. Create indicator variable for missing values in `age` column

## 1b. Fill in missing values in `age` column using [`pandas`](https://www.statology.org/pandas-fillna-with-mean/)

## 1c. Fill in missing values in `age` column using [`scikit-learn`](https://scikit-learn.org/stable/modules/impute.html)

## 2. Handle categorical data

## 2a. Create dummy variables based on the `education` column using [`pandas`](https://www.statology.org/pandas-get-dummies/)

## 2b. Create dummy variables based on the `education` column using [`scikit-learn`](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features)

## 3. Bring features onto the same scale

## 3a. Center and scale (i.e. standardize) the `income` column using [`pandas`](https://www.geeksforgeeks.org/how-to-standardize-data-in-a-pandas-dataframe/)

In [None]:
# Summarise `income` with pandas alone: the mean and the population standard
# deviation (ddof=0), each rounded to one decimal place.
income = df2['income']
income_mean = income.mean().round(1)
income_std = income.std(ddof=0).round(1)
print(f"Mean of income: {income_mean}\nStd of income:   {income_std}")

Mean of income: 126285.7
Std of income:   51607.6


## 3b. Center and scale (i.e. standardize) the `income` column using [`scikit-learn`](https://scikit-learn.org/stable/modules/preprocessing.html)

In [None]:
# Reshape `income` into a single-column 2-D array and fit a StandardScaler on it.
income_2d = df2['income'].to_numpy().reshape(-1, 1)
scaler = StandardScaler()
scaler.fit(income_2d)

# Report the fitted mean_ and scale_ of that one feature, rounded to one decimal.
fitted_mean = np.round(scaler.mean_[0], 1)
fitted_std = np.round(scaler.scale_[0], 1)
print(f"Mean of income: {fitted_mean}\nStd of income:   {fitted_std}")

Mean of income: 126285.7
Std of income:   51607.6
