## Data Preprocessing Methods with Scikit-Learn

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

**1. Data Encoding**


Two widely used data encoding methods are **Label Encoding** and **One-Hot Encoding**. Let us go through these methods with brief explanations and Python examples.

**a) Label Encoding**

In [3]:
# Sample categorical data: 14 fruit names with repeats (8 distinct values).
basket = [
    'apple', 'orange', 'grape', 'strawberry', 'melon', 'plum', 'banana',
    'melon', 'plum', 'plum', 'grape', 'watermelon', 'melon', 'orange',
]

In [4]:
# Map each distinct fruit name to an integer code.
encoder = LabelEncoder()
# fit_transform() learns the classes and encodes the data in a single call,
# so a separate fit() beforehand would only repeat the same work.
labels = encoder.fit_transform(basket)
print(labels)  # expected: [0 4 2 6 3 5 1 3 5 5 2 7 3 4]

[0 4 2 6 3 5 1 3 5 5 2 7 3 4]


In [5]:
# Decode the integer codes back into the original fruit names.
decoded_basket = encoder.inverse_transform(labels)
decoded_basket

array(['apple', 'orange', 'grape', 'strawberry', 'melon', 'plum',
       'banana', 'melon', 'plum', 'plum', 'grape', 'watermelon', 'melon',
       'orange'], dtype='<U10')

**b) One-Hot Encoding**

In [6]:
# One-hot encode the fruit names directly: OneHotEncoder accepts string
# categories, so no intermediate LabelEncoder pass is required. Encoding the
# strings directly also leaves the 1-D `labels` array from the label-encoding
# example untouched, so that example can be re-run at any time.
fruit_column = np.asarray(basket).reshape(-1, 1)  # one row per sample, a single feature column
onehot_encoder = OneHotEncoder()
onehot_labels = onehot_encoder.fit_transform(fruit_column)
onehot_labels.toarray()  # convert the encoder's output to a dense array for display

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.]])

In [68]:
# The same one-hot encoding done with pandas: one indicator column per fruit.
basket_df = pd.Series(basket, name='Fruit').to_frame()
pd.get_dummies(basket_df)

Unnamed: 0,Fruit_apple,Fruit_banana,Fruit_grape,Fruit_melon,Fruit_orange,Fruit_plum,Fruit_strawberry,Fruit_watermelon
0,True,False,False,False,False,False,False,False
1,False,False,False,False,True,False,False,False
2,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,True,False
4,False,False,False,True,False,False,False,False
5,False,False,False,False,False,True,False,False
6,False,True,False,False,False,False,False,False
7,False,False,False,True,False,False,False,False
8,False,False,False,False,False,True,False,False
9,False,False,False,False,False,True,False,False


**2. Feature Scaling**

Feature scaling is a method for ‘normalizing’ the variables (features) of a dataset so that they share a comparable range. It may be necessary in machine learning for several reasons: it can make training faster, and it helps gradient descent converge more smoothly.


In [71]:
# Load scikit-learn's digits dataset and expose it as a DataFrame with one
# named column per pixel feature. (`load_digits` is imported in the notebook's
# top import cell so that all dependencies are declared in one place.)
digits = load_digits()
digits_df = pd.DataFrame(digits.data, columns=digits.feature_names)
digits_df

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


**a) StandardScaler()**

In [72]:
# Standardise every pixel column, then wrap the scaled array back into a
# DataFrame that carries the original feature names.
scaler = StandardScaler()
standard_digits = pd.DataFrame(
    scaler.fit_transform(digits_df),
    columns=digits.feature_names,
)
standard_digits

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,-0.335016,-0.043081,0.274072,-0.664478,-0.844129,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.757436,-0.209785,-0.023596,-0.299081,0.086719,0.208293,-0.366771,-1.146647,-0.505670,-0.196008
1,0.0,-0.335016,-1.094937,0.038648,0.268751,-0.138020,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.757436,-0.209785,-0.023596,-0.299081,-1.089383,-0.249010,0.849632,0.548561,-0.505670,-0.196008
2,0.0,-0.335016,-1.094937,-1.844742,0.735366,1.097673,-0.409724,-0.125023,-0.059078,-0.624009,...,0.259230,-0.209785,-0.023596,-0.299081,-1.089383,-2.078218,-0.164037,1.565686,1.695137,-0.196008
3,0.0,-0.335016,0.377661,0.744919,0.268751,-0.844129,-0.409724,-0.125023,-0.059078,1.879691,...,1.072563,-0.209785,-0.023596,-0.299081,0.282736,0.208293,0.241430,0.379040,-0.505670,-0.196008
4,0.0,-0.335016,-1.094937,-2.551014,-0.197863,-1.020657,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.757436,-0.209785,-0.023596,-0.299081,-1.089383,-2.306869,0.849632,-0.468564,-0.505670,-0.196008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,-0.335016,-0.253452,-0.432200,0.268751,0.038508,-0.409724,-0.125023,-0.059078,-0.311047,...,0.055897,-0.209785,-0.023596,-0.299081,-0.697349,0.436944,0.646898,0.379040,-0.505670,-0.196008
1793,0.0,-0.335016,0.167290,0.980343,0.268751,0.921145,-0.108958,-0.125023,-0.059078,-0.624009,...,-0.554103,-0.209785,-0.023596,-0.299081,0.086719,0.894246,0.444164,-0.129523,-0.505670,-0.196008
1794,0.0,-0.335016,-0.884566,-0.196776,0.735366,-0.844129,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.757436,-0.209785,-0.023596,-0.299081,-0.697349,-0.706312,0.241430,-0.129523,-0.505670,-0.196008
1795,0.0,-0.335016,-0.674195,-0.432200,-1.131092,-1.020657,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.350769,-0.209785,-0.023596,-0.299081,-0.109298,-0.020358,0.849632,0.887602,-0.505670,-0.196008


In [22]:
# Per-column mean after standardisation; values should be zero up to
# floating-point rounding error.
feature_means = standard_digits.mean()
feature_means

pixel_0_0    0.000000e+00
pixel_0_1   -2.560865e-16
pixel_0_2   -2.347717e-16
pixel_0_3   -3.053268e-16
pixel_0_4    1.396892e-16
                 ...     
pixel_7_3   -6.114606e-16
pixel_7_4   -5.190926e-16
pixel_7_5    1.695299e-16
pixel_7_6    1.787972e-16
pixel_7_7   -6.838961e-16
Length: 64, dtype: float64

In [23]:
# Per-column variance after standardisation.
# NOTE(review): values slightly above 1 are presumably due to pandas' .var()
# defaulting to the sample variance (ddof=1) while StandardScaler divides by
# the population standard deviation (ddof=0) -- confirm against the docs.
# A constant column (e.g. an always-zero pixel) keeps a variance of 0.
feature_variances = standard_digits.var()
feature_variances

pixel_0_0    0.000000
pixel_0_1    1.000557
pixel_0_2    1.000557
pixel_0_3    1.000557
pixel_0_4    1.000557
               ...   
pixel_7_3    1.000557
pixel_7_4    1.000557
pixel_7_5    1.000557
pixel_7_6    1.000557
pixel_7_7    1.000557
Length: 64, dtype: float64

**b) MinMaxScaler()**

In [25]:
# Rescale every pixel column with MinMaxScaler, then wrap the scaled array
# back into a DataFrame that carries the original feature names.
minmax = MinMaxScaler()
minmax_digits = pd.DataFrame(
    minmax.fit_transform(digits_df),
    columns=digits.feature_names,
)
minmax_digits

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,0.3125,0.8125,0.5625,0.0625,0.0000,0.0,0.0,0.0000,...,0.0000,0.0,0.0,0.000000,0.3750,0.8125,0.6250,0.0000,0.0000,0.0
1,0.0,0.0,0.0000,0.7500,0.8125,0.3125,0.0000,0.0,0.0,0.0000,...,0.0000,0.0,0.0,0.000000,0.0000,0.6875,1.0000,0.6250,0.0000,0.0
2,0.0,0.0,0.0000,0.2500,0.9375,0.7500,0.0000,0.0,0.0,0.0000,...,0.3125,0.0,0.0,0.000000,0.0000,0.1875,0.6875,1.0000,0.5625,0.0
3,0.0,0.0,0.4375,0.9375,0.8125,0.0625,0.0000,0.0,0.0,0.5000,...,0.5625,0.0,0.0,0.000000,0.4375,0.8125,0.8125,0.5625,0.0000,0.0
4,0.0,0.0,0.0000,0.0625,0.6875,0.0000,0.0000,0.0,0.0,0.0000,...,0.0000,0.0,0.0,0.000000,0.0000,0.1250,1.0000,0.2500,0.0000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,0.2500,0.6250,0.8125,0.3750,0.0000,0.0,0.0,0.0625,...,0.2500,0.0,0.0,0.000000,0.1250,0.8750,0.9375,0.5625,0.0000,0.0
1793,0.0,0.0,0.3750,1.0000,0.8125,0.6875,0.0625,0.0,0.0,0.0000,...,0.0625,0.0,0.0,0.000000,0.3750,1.0000,0.8750,0.3750,0.0000,0.0
1794,0.0,0.0,0.0625,0.6875,0.9375,0.0625,0.0000,0.0,0.0,0.0000,...,0.0000,0.0,0.0,0.000000,0.1250,0.5625,0.8125,0.3750,0.0000,0.0
1795,0.0,0.0,0.1250,0.6250,0.4375,0.0000,0.0000,0.0,0.0,0.0000,...,0.1250,0.0,0.0,0.000000,0.3125,0.7500,1.0000,0.7500,0.0000,0.0


In [27]:
# Sanity check of the rescaled range: per-column minimum and maximum.
feature_minima = minmax_digits.min()
feature_maxima = minmax_digits.max()
feature_minima, feature_maxima

(pixel_0_0    0.0
 pixel_0_1    0.0
 pixel_0_2    0.0
 pixel_0_3    0.0
 pixel_0_4    0.0
             ... 
 pixel_7_3    0.0
 pixel_7_4    0.0
 pixel_7_5    0.0
 pixel_7_6    0.0
 pixel_7_7    0.0
 Length: 64, dtype: float64,
 pixel_0_0    0.0
 pixel_0_1    1.0
 pixel_0_2    1.0
 pixel_0_3    1.0
 pixel_0_4    1.0
             ... 
 pixel_7_3    1.0
 pixel_7_4    1.0
 pixel_7_5    1.0
 pixel_7_6    1.0
 pixel_7_7    1.0
 Length: 64, dtype: float64)

## Task 31

#### 1. Loading the Titanic Dataset
#### 2.  Preprocessing and Modeling Workflow

In [8]:
# Load the Titanic dataset through seaborn and keep only the columns used in
# this task: four predictors plus the `survived` target.
# (`seaborn` and `pandas` are imported in the notebook's top import cell; the
# duplicate mid-notebook imports were removed.)
titanic_raw = sns.load_dataset('titanic')
# Keep the untouched table under its own name and take an explicit copy of the
# subset, so `df` is bound exactly once and is independent of `titanic_raw`.
df = titanic_raw[['age', 'fare', 'sex', 'embarked', 'survived']].copy()
df


Unnamed: 0,age,fare,sex,embarked,survived
0,22.0,7.2500,male,S,0
1,38.0,71.2833,female,C,1
2,26.0,7.9250,female,S,1
3,35.0,53.1000,female,S,1
4,35.0,8.0500,male,S,0
...,...,...,...,...,...
886,27.0,13.0000,male,S,0
887,19.0,30.0000,female,S,1
888,,23.4500,female,S,0
889,26.0,30.0000,male,C,1


#### 3. Preprocessing

**Define the features and target**

In [9]:

# Separate the target column from the predictor columns.
y = df['survived']
X = df.drop(columns='survived')

**Numerical features**

In [10]:

# Numeric columns: fill missing values with the column mean, then standardise
# (remove the mean and scale to unit variance).
numerical_features = ['age', 'fare']
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

**Categorical features**

In [11]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform).
categorical_features = ['sex', 'embarked']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

**Combine preprocessing steps**

In [12]:

# Route each group of columns through its own preprocessing pipeline.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


**Create a preprocessing and modeling pipeline**

In [13]:
# Chain preprocessing and a logistic-regression classifier into one estimator,
# so the preprocessing steps are fitted together with the model.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=model_steps)

#### 4. Train-Test Split

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### 5. Train and Evaluate Models

In [15]:
# Fit on the training split and predict the held-out rows in one chained call
# (fit() returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Fraction of test rows whose label was predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.78


#### 6. Hyperparameter Tuning

In [None]:
# Hyperparameter search space for the random forest. The `classifier__` prefix
# addresses parameters of the pipeline step named 'classifier'.
# Grid size: 3 * 4 * 4 * 3 * 3 = 432 candidates, each fitted on 5 CV folds
# (2160 fits in total), so this cell can take a while to run.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__max_features': ['sqrt', 'log2', 0.2, 0.5],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Build a separate pipeline around a RandomForestClassifier. It is deliberately
# NOT named `pipeline`: rebinding that name would replace the logistic-regression
# pipeline, and re-running the earlier train/evaluate cell would then silently
# fit and score a different model.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Exhaustive grid search with 5-fold cross-validation, scored by accuracy and
# run in parallel on all available cores.
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the search on the training data only; the test split stays untouched
# until the final evaluation.
grid_search.fit(X_train, y_train)

In [90]:
# Report the winning hyperparameter combination and its mean cross-validated
# accuracy, one per line.
print(
    f'Best Parameters: {grid_search.best_params_}',
    f'Best Score: {grid_search.best_score_:.2f}',
    sep='\n',
)

Best Parameters: {'classifier': RandomForestClassifier(max_depth=10, max_features='log2'), 'classifier__max_depth': 10, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best Score: 0.80


In [89]:
# Take the best pipeline found by the grid search and score it on the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Fraction of test rows whose label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Final Model Accuracy: {accuracy:.2f}')


Final Model Accuracy: 0.74
