# Pipelines

In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

In [2]:
df = pd.read_csv('new_features_data.csv')

In [3]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Loyalty Member,Recency,Frequency,Monetary,Churn,Favorite Product Type,Product Diversity,Total Orders,Cancellation Rate,Average Rating,Add-on Frequency,Preferred Payment Method,Preferred Shipping Type
0,1000,53,Male,No,229,2,6279.42,1,Smartphone,2,2,50.0,2.5,1.0,Credit Card,Overnight
1,1002,41,Male,Yes,118,2,5020.6,0,Laptop,2,2,0.0,2.5,0.5,Cash,Express
2,1003,75,Male,Yes,198,1,41.5,1,Smartphone,1,1,0.0,5.0,1.0,Cash,Express
3,1004,41,Female,No,193,1,83.0,1,Smartphone,1,1,0.0,5.0,1.0,Credit Card,Standard
4,1005,25,Female,No,164,2,11779.11,0,Laptop,2,2,0.0,3.0,0.5,Debit Card,Overnight


In [4]:
df_num = df.select_dtypes(exclude=['object'])
df_cat = df.select_dtypes(include=['object'])

In [5]:
numerical_features = list(df_num.columns)
categorical_features = list(df_cat.columns)

In [6]:
numerical_features

['Customer ID',
 'Age',
 'Recency',
 'Frequency',
 'Monetary',
 'Churn',
 'Product Diversity',
 'Total Orders',
 'Cancellation Rate',
 'Average Rating',
 'Add-on Frequency']

In [7]:
categorical_features

['Gender',
 'Loyalty Member',
 'Favorite Product Type',
 'Preferred Payment Method',
 'Preferred Shipping Type']

In [8]:
binary_features = ['Gender', 'Loyalty Member']
one_hot_features = ['Favorite Product Type', 'Preferred Payment Method', 'Preferred Shipping Type']

In [9]:
df_cat.nunique()

Gender                      2
Loyalty Member              2
Favorite Product Type       5
Preferred Payment Method    6
Preferred Shipping Type     5
dtype: int64

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

In [11]:
def binary_encode(df):
    """Binary encoding for Gender and Loyalty Member"""
    return df.replace({'Male': 0, 'Female': 1, 'Yes': 1, 'No': 0})

In [12]:
binary_pipeline = Pipeline(steps=[
    ('binary_encoder', FunctionTransformer(binary_encode, validate=False))
])

one_hot_pipeline = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(drop='first', sparse_output=False))
])

num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', binary_pipeline, binary_features),
        ('one_hot', one_hot_pipeline, one_hot_features),
        ('numerical', num_pipeline, numerical_features)
    ],
    remainder='drop'
)

In [14]:
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [15]:
processed_data = final_pipeline.fit_transform(df)

In [16]:
one_hot_encoder = preprocessor.named_transformers_['one_hot']['one_hot_encoder']
one_hot_feature_names = one_hot_encoder.get_feature_names_out(one_hot_features)

In [17]:
processed_df = pd.DataFrame(
    processed_data,
    columns=binary_features + list(one_hot_feature_names) + numerical_features
)

In [18]:
for col in processed_df.columns:
    processed_df[col] = processed_df[col].astype(float)

In [19]:
processed_df.head()

Unnamed: 0,Gender,Loyalty Member,Favorite Product Type_Laptop,Favorite Product Type_Smartphone,Favorite Product Type_Smartwatch,Favorite Product Type_Tablet,Preferred Payment Method_Cash,Preferred Payment Method_Credit Card,Preferred Payment Method_Debit Card,Preferred Payment Method_PayPal,...,Age,Recency,Frequency,Monetary,Churn,Product Diversity,Total Orders,Cancellation Rate,Average Rating,Add-on Frequency
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.213943,0.290005,0.41068,0.236632,0.952983,0.832837,0.41068,0.416733,-0.572888,0.65599
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.44828,-0.915062,0.41068,-0.050176,-1.049337,0.832837,0.41068,-0.811092,-0.572888,-0.691107
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.428018,-0.046545,-0.755989,-1.184609,0.952983,-0.695795,-0.755989,-0.811092,1.804859,0.65599
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.44828,-0.100827,-0.755989,-1.175153,0.952983,-0.695795,-0.755989,-0.811092,1.804859,0.65599
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.331244,-0.415665,0.41068,1.489675,-1.049337,0.832837,0.41068,-0.811092,-0.097339,-0.691107


In [20]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12136 entries, 0 to 12135
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Gender                                12136 non-null  float64
 1   Loyalty Member                        12136 non-null  float64
 2   Favorite Product Type_Laptop          12136 non-null  float64
 3   Favorite Product Type_Smartphone      12136 non-null  float64
 4   Favorite Product Type_Smartwatch      12136 non-null  float64
 5   Favorite Product Type_Tablet          12136 non-null  float64
 6   Preferred Payment Method_Cash         12136 non-null  float64
 7   Preferred Payment Method_Credit Card  12136 non-null  float64
 8   Preferred Payment Method_Debit Card   12136 non-null  float64
 9   Preferred Payment Method_PayPal       12136 non-null  float64
 10  Preferred Payment Method_Paypal       12136 non-null  float64
 11  Preferred Shipp

In [21]:
processed_df.to_csv('processed_data.csv')