***4.Handling Numerical Data***

In [1]:
#4.1 Rescaling a Feature
import numpy as np
from sklearn import preprocessing
feature=np.array([[-500.5],[-100.1],[0],[100.1],[900.9]])
minmax_scale=preprocessing.MinMaxScaler(feature_range=(0,1))
#xi'=xi-min(x)/max(x)-min(x)
scaled_feature=minmax_scale.fit_transform(feature)
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [2]:
#4.2 Standardizing a Feature
x=np.array([[-1000.1],[-200.2],[500.5],[600.6],[9000.9]])
scaler=preprocessing.StandardScaler()
#xi'=xi-x(mean)/std(x)
standardized=scaler.fit_transform(x)
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [3]:
#4.3 Normalizing Observations
from sklearn.preprocessing import Normalizer
features=np.array([[0.5,0.5],[1.1,3.4],[1.5,20.2],[1.63,34.2],[10.9,3.3]])
norm=Normalizer(norm="l2")#.transform(features)
#xi'=xi/sqrt(x1^2+x2^2+...+xn^2)
norm.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04760678, 0.99886615],
       [0.95709822, 0.28976368]])

In [4]:
#4.4 Generating Polynomial and Interaction features
from sklearn.preprocessing import PolynomialFeatures
features=np.array([[2,3],[2,3],[2,3]])
polynomail_interaction=PolynomialFeatures(degree=2,include_bias=False) #x1,x2,x1^2,x2^2
polynomail_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [5]:
#4.5 Tranforming Features
from sklearn.preprocessing import FunctionTransformer
features=np.array([[2,3],[2,3],[2,3]])
def add_ten(x):
    return x+10
ten_tranformer=FunctionTransformer(add_ten)
ten_tranformer.transform(features)    

array([[12, 13],
       [12, 13],
       [12, 13]])

In [6]:
#using pandas
import pandas as pd
df=pd.DataFrame(features,columns=['Feature_1','Feature_2'])
#apply function
df.apply(add_ten)

Unnamed: 0,Feature_1,Feature_2
0,12,13
1,12,13
2,12,13


In [7]:
#4.6 Detecting Outliers
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
feature,_=make_blobs(n_samples=10,n_features=2,centers=1,random_state=1)
feature[0,0]=1000
feature[0,1]=1000
outlier_detector=EllipticEnvelope(contamination=.1)
outlier_detector.fit(feature)
outlier_detector.predict(feature)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [8]:
feature=features[:]
def indicies_of_outliers(x):
    q1,q3=np.percentile(x,[25,75])
    iqr=q3-q1
    lower_bound=q1-(iqr*1.5)
    upper_bound=q3+(iqr*1.5)
    return np.where((x>upper_bound) | (x<lower_bound))
    indicies_of_outliers(feature)

In [9]:
#4.7 Handling Outliers
#Typically we have three strategies we can use to handle outliers. First,we can drop theme
houses=pd.DataFrame()
houses['Price']=[534433,392333,293222,4322032]
houses['Bathrooms']=[2,3,3.5,116]
houses['Square_feet']=[1500,2500,1500,48000]
houses[houses['Bathrooms']<20]

Unnamed: 0,Price,Bathrooms,Square_feet
0,534433,2.0,1500
1,392333,3.0,2500
2,293222,3.5,1500


In [10]:
#Sceond make them as outliers
houses["Outliers"]=np.where(houses["Bathrooms"]<20,0,1)
houses

Unnamed: 0,Price,Bathrooms,Square_feet,Outliers
0,534433,2.0,1500,0
1,392333,3.0,2500,0
2,293222,3.5,1500,0
3,4322032,116.0,48000,1


In [11]:
#dampen the effect 
houses["Log_of_Square_feet"]=[np.log(x) for x in houses["Square_feet"]]
houses

Unnamed: 0,Price,Bathrooms,Square_feet,Outliers,Log_of_Square_feet
0,534433,2.0,1500,0,7.31322
1,392333,3.0,2500,0,7.824046
2,293222,3.5,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


In [12]:
#4.8 Discretizating Features
from sklearn.preprocessing import Binarizer
age=np.array([[6],[12],[20],[36],[65]])
binarizer=Binarizer()
binarizer.fit_transform(age)

array([[1],
       [1],
       [1],
       [1],
       [1]])

In [13]:
np.digitize(age,bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [14]:
np.digitize(age,bins=[20,30,64],right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

In [15]:
#4.9 Grouping Observations Using Clustering 
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
features,_=make_blobs(n_samples=50,n_features=2,centers=3,random_state=1)
dataframe=pd.DataFrame(features,columns=["F1","F2"])
clusters=KMeans(3,random_state=0)
clusters.fit(features)
dataframe["group"]=clusters.predict(features)
dataframe.head(5)


Unnamed: 0,F1,F2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


In [16]:
#4.10 Deleting Observations with Missing Values
features=np.array([[1.1,11.1],[2.2,22.2],[3.3,33.3],[4.4,44.4],[np.nan,55]])
features[~np.isnan(features).any(axis=1)] #keep only observations that are not(denotes by ~) missing


array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [17]:
#using pandas
dataframe=pd.DataFrame(features,columns=["F1","F2"])
dataframe.dropna()


Unnamed: 0,F1,F2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


In [18]:
# Imputing Missing Values
#if you have a small amount of data,predict the missing values using KNN

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.datasets import make_blobs

# Create sample data
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

# Scale the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Store the original value, then replace it with NaN
true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

# Correctly instantiate and use KNNImputer
imputer = KNNImputer(n_neighbors=2) # Use n_neighbors
features_knn_imputed = imputer.fit_transform(standardized_features) # Use .fit_transform()

# Print the results
print("True value:", true_value)
print("Imputed value:", features_knn_imputed[0, 0])

True value: -0.5867361918316237
Imputed value: 0.2996679214579001


In [19]:
from sklearn.impute import SimpleImputer
mean_imputer=SimpleImputer(strategy="mean")
features_mean_imputed=mean_imputer.fit_transform(features)
print("True value:", true_value)
print("Imputed value:", features_knn_imputed[0, 0])

True value: -0.5867361918316237
Imputed value: 0.2996679214579001
