## Handle Missing Values

### Introduce Missing Values

In [None]:
from sklearn import datasets
import pandas as pd
import numpy as np
%matplotlib inline
# Load the iris measurements into a DataFrame with named feature columns.
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ["SEPAL_LENGTH", "SEPAL_WIDTH", "PETAL_LENGTH", "PETAL_WIDTH"]

# Work on a copy so the complete data in X stays untouched, then add a row
# labelled 150 whose four measurements are all missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X_incomplete = X.copy()
X_incomplete.loc[150] = [np.nan, np.nan, np.nan, np.nan]

X_incomplete.tail()

### Remove Rows with Missing Values

In [None]:
# dropna() returns a new frame without the rows that contain NaN, so the
# shared X_incomplete frame is left exactly as it was.
X_incomplete1 = X_incomplete.dropna()
X_incomplete1.tail()

### Imputation: Replace Missing Values with Mean

In [None]:
# Replace every NaN with the mean of its own column; fillna() returns a new
# frame, so X_incomplete itself is not modified.
X_incomplete1 = X_incomplete.fillna(X_incomplete.mean())
# Show the last rows, which include the row that was just imputed.
X_incomplete1.tail()

### Imputation: Use Scikit-Learn Preprocessing

In [None]:
# sklearn.preprocessing.Imputer was removed from scikit-learn (0.22);
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

X_incomplete1 = X_incomplete.copy()
imputer = SimpleImputer(strategy="median")  # default strategy is "mean"
# fit_transform returns a plain NumPy array, so the column labels and the
# index are passed back explicitly to keep the result readable.
transformed_values = pd.DataFrame(
    imputer.fit_transform(X_incomplete1),
    columns=X_incomplete1.columns,
    index=X_incomplete1.index,
)
transformed_values.tail()

## Handle Outliers

### Winsorize

In [None]:
X = [92,19,101,58,1053,91,26,78,10,13,-40,101,86,85,15,89,89,28,-5,41]
print("mean=%.2f"%np.mean(X))
from scipy.stats import mstats
X1 = mstats.winsorize(X, limits=0.05, inplace=True) #It is thresholding at 5th and 95th percentiles
print(X1)
print("After winsorizing, mean=%.2f"%np.mean(X1))

### Statistics Robust to Outliers

In [None]:
import matplotlib.pyplot as plt
from statsmodels import robust

# Seed the generator so the sample, the figure and the printed statistics are
# the same on every run.
np.random.seed(42)

mu, sigma = 0, 2  # mean and standard deviation
s = np.random.normal(mu, sigma, 1000)

# density=True normalises the histogram to a probability density so it can be
# compared with the theoretical curve (the old `normed` keyword was removed
# from matplotlib).
count, bins, ignored = plt.hist(s, 30, density=True)
plt.plot(
    bins,
    1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2)),
    linewidth=2,
    color='r',
)
plt.show()
print("mean=%.2f, sd=%.2f" % (np.mean(s), np.std(s)))

# Introduce an outlier
s[983] = 200
print("with outlier, mean=%.2f, sd=%.2f" % (np.mean(s), np.std(s)))

# robust.mad already divides the median absolute deviation by ~0.6745 (its
# default normalisation constant), so it is directly a robust estimate of the
# standard deviation; multiplying by 1.4826 again would inflate it.
print("with outlier, median=%.2f, sd=%.2f" % (np.median(s), robust.mad(s)))

In [None]:
from statsmodels import robust

## Binning and Scaling

### Binning the Continuous Variable into Bins

In [None]:
import numpy as np
# Twelve evenly spaced edges from 15 to 70 delimit eleven bins.
bins = np.linspace(start=15, stop=70, num=12)
print(bins)
# NOTE(review): `c` is not defined in this cell; presumably a DataFrame whose
# first column holds the values to bin -- confirm where it is loaded.
# Intervals are closed on the right and labelled with the integers of range(11).
age_binned = pd.cut(c.iloc[:, 0], bins=bins, right=True, labels=range(11))
# Spot-check two rows: original value next to the bin label it received.
for row in (10, 31):
    print("Original value=%d, bin label=%d" % (c.iloc[row, 0], age_binned[row]))

### z-transformation

In [None]:
from sklearn import preprocessing
# Standardise the columns at positions 0, 3 and 5 on a copy, leaving `data`
# itself unchanged.
data_z = data.copy()
zscore_cols = [0, 3, 5]
data_z.iloc[:, zscore_cols] = preprocessing.scale(data_z.iloc[:, zscore_cols])
data_z.describe()

### Scale to between 0 and 1

In [None]:
# Rescale the columns at positions 0, 3 and 5 with a MinMaxScaler on a copy,
# leaving `data` itself unchanged.
data_minmax = data.copy()
min_max_scaler = preprocessing.MinMaxScaler()
minmax_cols = [0, 3, 5]
data_minmax.iloc[:, minmax_cols] = min_max_scaler.fit_transform(data_minmax.iloc[:, minmax_cols])
data_minmax.describe()

In [None]:
# Display the first five rows of `data` for comparison with the scaled copies.
data.head(n=5)

## Handle Time Stamps

### TimeStamps in Strings

In [None]:
from datetime import datetime

datestring = "2017/11/06 19:00:13"  # A Monday

# Parse the text into a datetime, then print the calendar features derived
# from it. weekday() counts Monday as 0; %U is the week number of the year
# with Sunday as the first day of the week.
datetime_object = datetime.strptime(datestring, '%Y/%m/%d %H:%M:%S')
for template, value in [
    ("Year=%d", datetime_object.year),
    ("Month=%d", datetime_object.month),
    ("Day of Month=%d", datetime_object.day),
    ("Weekday=%d", datetime_object.weekday()),
    ("Week Number=%s", datetime_object.strftime("%U")),
    ("Hour=%d", datetime_object.hour),
    ("Minute=%d", datetime_object.minute),
]:
    print(template % value)

### Handle Unix Time Stamp

In [None]:
# Imported here so the cell does not depend on an earlier cell having run.
from datetime import datetime, timezone

ts = 1352068320
# Convert with an explicit UTC timezone: a naive fromtimestamp(ts) uses the
# machine's local timezone, so the extracted hour/day/weekday would differ
# from one computer to the next.
datetime_object = datetime.fromtimestamp(ts, tz=timezone.utc)
print(datetime_object)
print("Year=%d" % datetime_object.year)
print("Month=%d" % datetime_object.month)
print("Day of Month=%d" % datetime_object.day)
print("Weekday=%d" % datetime_object.weekday())  # Monday is 0
print("Week Number=%s" % datetime_object.strftime("%U"))  # weeks start on Sunday
print("Hour=%d" % datetime_object.hour)
print("Minute=%d" % datetime_object.minute)