In [None]:
!pip install kagglehub

In [33]:
import kagglehub
import os
import pandas as pd
import seaborn as sns
import numpy as np
from typing import List, Optional
from scipy import stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
    OrdinalEncoder
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Transformers:

In [34]:
class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : label or list of labels
        Column label(s) passed to ``DataFrame.drop`` in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` without the configured columns."""
        return X.drop(columns=self.columns)

In [35]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode selected DataFrame columns, one ``OrdinalEncoder`` per column.

    Parameters
    ----------
    columns : list of str
        Columns whose values are replaced by their ordinal codes.
    order : dict, optional
        Maps a column name to the ordered list of its categories. Columns
        without an entry use a default ``OrdinalEncoder()``, which determines
        the categories itself during ``fit``.

    Attributes
    ----------
    encoders : dict
        Column name -> fitted ``OrdinalEncoder``; rebuilt on every ``fit``.
    """

    def __init__(self, columns: List[str], order: Optional[dict] = None) -> None:
        self.columns = columns
        # Constructor parameters are stored exactly as passed; ``None`` is
        # resolved inside ``fit`` so that ``set_params(order=None)`` keeps
        # working instead of failing on ``column in None``.
        self.order = order
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one encoder per configured column and return ``self``."""
        order = self.order if self.order is not None else {}
        # Start from a clean slate so a re-fit never keeps encoders for
        # columns that are no longer configured.
        self.encoders = {}
        for column in self.columns:
            if column in order:
                encoder = OrdinalEncoder(categories=[order[column]])
            else:
                encoder = OrdinalEncoder()
            # Double brackets: the encoder is fitted on a one-column frame.
            self.encoders[column] = encoder.fit(X[[column]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column replaced by its codes."""
        X_transformed = X.copy()
        for column in self.columns:
            X_transformed[column] = self.encoders[column].transform(X[[column]])
        return X_transformed


In [36]:
class OutlierRemoveTransformer(BaseEstimator, TransformerMixin):
    """Drop rows whose absolute z-score exceeds ``threshold`` in any numeric column.

    ``fit`` learns the per-column mean and population standard deviation of
    the ``int64``/``float64`` columns; ``transform`` scores the frame it is
    given against those statistics. It therefore works on any frame that
    contains the fitted columns, not only on the frame passed to ``fit``.

    Note that ``transform`` returns fewer rows than it receives, so any
    separately held target must be filtered by the caller to stay aligned.

    Parameters
    ----------
    threshold : float, default=3
        A row is removed when ``abs(z) > threshold`` for at least one column.

    Attributes
    ----------
    numerical_cols_ : list
        Columns used for outlier detection (selected during ``fit``).
    means_, stds_ : pandas.Series
        Per-column statistics learned during ``fit``.
    outlier_indices : numpy.ndarray of bool
        Outlier mask of the frame passed to ``fit`` (``None`` before fitting).
    """

    def __init__(self, threshold=3):
        self.threshold = threshold
        self.outlier_indices = None

    def fit(self, X, y=None):
        """Learn column means/stds from ``X`` and record its outlier mask."""
        numeric = X.select_dtypes(include=["int64", "float64"])
        self.numerical_cols_ = list(numeric.columns)
        # pandas reductions skip NaN, so a single missing value no longer
        # turns every z-score of its column into NaN (which silently
        # disabled outlier detection for that column).
        self.means_ = numeric.mean()
        # ddof=0 is the population standard deviation. A constant column has
        # std 0; NaN keeps its z-scores NaN (never flagged) instead of +/-inf.
        self.stds_ = numeric.std(ddof=0).replace(0, np.nan)
        self.outlier_indices = self._outlier_mask(X)
        return self

    def _outlier_mask(self, X):
        """Return a boolean array marking the rows of ``X`` that are outliers."""
        z_scores = (X[self.numerical_cols_] - self.means_) / self.stds_
        # NaN z-scores compare as False, so rows with missing values are kept.
        return (z_scores.abs() > self.threshold).any(axis=1).to_numpy()

    def transform(self, X):
        """Return the rows of ``X`` that are not outliers under the fitted statistics."""
        return X[~self._outlier_mask(X)]

In [37]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Encode selected DataFrame columns with one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : list of str
        Columns whose values are replaced by the encoder output.

    Attributes
    ----------
    encoders : dict
        Column name -> ``LabelEncoder`` fitted on that column.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit a new ``LabelEncoder`` on every configured column of ``X``."""
        for name in self.columns:
            encoder = LabelEncoder()
            self.encoders[name] = encoder
            encoder.fit(X[name])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column label-encoded."""
        encoded = X.copy()
        for name in self.columns:
            encoder = self.encoders[name]
            encoded[name] = encoder.transform(X[name])
        return encoded

In [38]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns, one ``OneHotEncoder`` per column.

    Each configured column is dropped and replaced by the encoder's output
    columns, named via ``get_feature_names_out`` and appended at the end of
    the frame.

    Parameters
    ----------
    columns : list of str
        Columns to expand.

    Attributes
    ----------
    encoders : dict
        Column name -> ``OneHotEncoder`` fitted on that column.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit a dense-output ``OneHotEncoder`` on every configured column."""
        for name in self.columns:
            encoder = OneHotEncoder(sparse_output=False)
            self.encoders[name] = encoder
            # Double brackets: the encoder is fitted on a one-column frame.
            encoder.fit(X[[name]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns one-hot expanded."""
        result = X.copy()
        for name in self.columns:
            encoder = self.encoders[name]
            # Reuse X's index so the concat below lines up row by row.
            dummies = pd.DataFrame(
                encoder.transform(X[[name]]),
                index=X.index,
                columns=encoder.get_feature_names_out([name]),
            )
            remaining = result.drop(columns=name)
            result = pd.concat([remaining, dummies], axis=1)
        return result

In [39]:
class CustomStandardScaler(BaseEstimator, TransformerMixin):
    """Apply a single ``StandardScaler`` to a subset of DataFrame columns.

    Parameters
    ----------
    columns : list of str
        Columns to scale; all other columns pass through unchanged.

    Attributes
    ----------
    scaler : StandardScaler
        The scaler fitted on ``X[columns]``.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the scaler on the configured columns of ``X``."""
        selected = X[self.columns]
        self.scaler.fit(selected)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns hold the scaled values."""
        scaled_values = self.scaler.transform(X[self.columns])
        scaled = X.copy()
        scaled[self.columns] = scaled_values
        return scaled

In [40]:
class FeatureEngineeringTransformer:
    """Add one derived column to a DataFrame.

    Exposes ``fit``/``transform`` but is a plain class.
    NOTE(review): unlike the other transformers in this notebook it does not
    inherit from ``BaseEstimator``/``TransformerMixin`` - confirm that this is
    intentional.

    Parameters
    ----------
    new_column : hashable
        Label of the column written by ``transform``.
    transformation : callable
        Called with the input frame; its result becomes the new column.
    """

    def __init__(self, new_column, transformation):
        self.new_column = new_column
        self.transformation = transformation

    def fit(self, X, y=None):
        """No fitting is required; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``new_column`` set to ``transformation(X)``."""
        augmented = X.copy()
        # The transformation sees the original frame, not the copy.
        derived = self.transformation(X)
        augmented[self.new_column] = derived
        return augmented

# YouTube Comments Dataset

Loading the dataset:

In [None]:
# Download the dataset; the returned path is used as the folder holding the CSV.
path = kagglehub.dataset_download("atifaliak/youtube-comments-dataset")
youtube_comments_csv = os.path.join(path, "YoutubeCommentsDataSet.csv")
youtube_comments_df = pd.read_csv(youtube_comments_csv)

# Preview the first rows.
youtube_comments_df.head()

In [None]:
# Shape first, then the number of missing values per column.
print(youtube_comments_df.shape, youtube_comments_df.isna().sum(), sep="\n")

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
youtube_comments_df.duplicated().any()

In [44]:
# Cleaning for this dataset is just removal of duplicated rows.
youtube_comments_df_cleaning = make_pipeline(
    FunctionTransformer(pd.DataFrame.drop_duplicates),
)

youtube_comments_df_cleaned = youtube_comments_df_cleaning.fit_transform(
    youtube_comments_df
)

In [45]:
# Explicit category order for the ordinal encoding of the "Sentiment" column.
sentiment_order = ["negative", "neutral", "positive"]

youtube_comments_preprocessing_pipeline = make_pipeline(
    CustomOrdinalEncoder(
        columns=["Sentiment"],
        order={"Sentiment": sentiment_order},
    ),
)

In [46]:
# Fit the preprocessing pipeline on the cleaned comments and apply it in one pass.
youtube_comments_df_preprocessed = youtube_comments_preprocessing_pipeline.fit_transform(youtube_comments_df_cleaned)

In [None]:
# Display the preprocessed frame to inspect the encoded "Sentiment" column.
youtube_comments_df_preprocessed

# Flight Price Prediction

In [None]:
# Download the dataset; the returned path is used as the folder holding the CSV.
path = kagglehub.dataset_download("shubhambathwal/flight-price-prediction")
flight_price_csv = os.path.join(path, "Clean_Dataset.csv")
flight_price_df = pd.read_csv(flight_price_csv)

# Preview the first rows.
flight_price_df.head()


In [None]:
# Shape first, then the number of missing values per column.
print(flight_price_df.shape, flight_price_df.isna().sum(), sep="\n")

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
flight_price_df.duplicated().any()

In [51]:
# Cleaning: drop the "Unnamed: 0" column (presumably the row index saved with
# the CSV - confirm), then remove rows flagged as outliers at a z-score
# threshold of 3.
flight_price_df_cleaning = make_pipeline(
    DropColumnTransformer(columns=["Unnamed: 0"]),
    OutlierRemoveTransformer(threshold=3)
)

# Fit and apply the cleaning steps on the raw frame in one pass.
flight_price_df_cleaned = flight_price_df_cleaning.fit_transform(flight_price_df)

In [52]:
# Departure and arrival time share the same category ordering.
time_of_day_order = ["Early_Morning", "Morning", "Afternoon", "Evening", "Night", "Late_Night"]

flight_price_preprocessing_pipeline = make_pipeline(
    # Unordered category: one indicator column per airline.
    CustomOneHotEncoder(columns=["airline"]),
    # Categories with an explicit order.
    CustomOrdinalEncoder(
        columns=["departure_time", "stops", "arrival_time", "class"],
        order={
            "departure_time": time_of_day_order,
            "stops": ["zero", "one", "two_or_more"],
            "arrival_time": time_of_day_order,
            "class": ["Economy", "Business"],
        },
    ),
    CustomLabelEncoder(columns=["source_city", "destination_city", "flight"]),
    CustomStandardScaler(columns=["duration", "price", "days_left"]),
)

In [53]:
# Fit the preprocessing pipeline on the cleaned flights and apply it in one pass.
flight_price_df_preprocessed = flight_price_preprocessing_pipeline.fit_transform(flight_price_df_cleaned)

In [None]:
# Display the preprocessed frame to inspect the encoded and scaled columns.
flight_price_df_preprocessed

# Credit Card Dataset

In [None]:
# Download the dataset; the returned path is used as the folder holding the CSV.
path = kagglehub.dataset_download("arjunbhasin/credit-card-dataset")
credit_card_csv = os.path.join(path, "credit_data_norm.csv")
credit_card_df = pd.read_csv(credit_card_csv)

# Preview the first rows.
credit_card_df.head()


In [None]:
# Shape first, then the number of missing values per column.
print(credit_card_df.shape, credit_card_df.isna().sum(), sep="\n")

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
credit_card_df.duplicated().any()

In [58]:
# Cleaning: drop the "itzsv_minimum_payments_14" column, then remove rows
# flagged as outliers at a z-score threshold of 3.
# NOTE(review): the reason for dropping the column is not stated in this
# notebook - presumably missing values; confirm against the null counts.
credit_card_df_cleaning = make_pipeline(
    DropColumnTransformer(columns=["itzsv_minimum_payments_14"]),
    OutlierRemoveTransformer(threshold=3)
)

# Fit and apply the cleaning steps on the raw frame in one pass.
credit_card_df_cleaned = credit_card_df_cleaning.fit_transform(credit_card_df)

In [None]:
# Display the cleaned frame (column dropped, outlier rows removed).
credit_card_df_cleaned

In [60]:
def _safe_ratio(numerator, denominator):
    """Build a function computing ``X[numerator] / X[denominator]`` row-wise.

    Zero denominators are replaced by NaN before dividing, so an undefined
    ratio is reported as NaN (missing) instead of ``+/-inf``.
    """

    def ratio(X):
        return X[numerator] / X[denominator].replace(0, np.nan)

    return ratio


# Derived features: three ratios between existing columns plus two columns
# copied under a new name. Each step appends one column to the frame.
credit_card_preprocessing_pipeline = make_pipeline(
    FeatureEngineeringTransformer(
        "purchases_ratio",
        _safe_ratio("pwnjx_purchases_2", "xslth_balance_0"),
    ),
    FeatureEngineeringTransformer(
        "installments_ratio",
        _safe_ratio("ojukq_installments_purchases_4", "pwnjx_purchases_2"),
    ),
    FeatureEngineeringTransformer(
        "cash_advance_usage_ratio",
        _safe_ratio("bvnag_cash_advance_5", "xslth_balance_0"),
    ),
    # NOTE(review): the two "*_percent" features copy their source column
    # unchanged - confirm whether a conversion (e.g. * 100) was intended.
    FeatureEngineeringTransformer("balance_frequency_percent", lambda X: X["fmeyv_balance_frequency_1"]),
    FeatureEngineeringTransformer("prc_full_payment_percent", lambda X: X["ubvma_prc_full_payment_15"]),
)

credit_card_df_preprocessed = credit_card_preprocessing_pipeline.fit_transform(credit_card_df_cleaned)

In [None]:
# Display the frame with the engineered feature columns appended.
credit_card_df_preprocessed