In [1]:
# Put packages in alphabetical order
import numpy as np
import pandas as pd

from preprocessing.categorical import OneHotEncoding
from preprocessing.numerical import Scaler
from preprocessing.selector import Selector

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



# Create dataframe

In [2]:
def create_df(N: int = 1000, seed: int = 42) -> pd.DataFrame:
    """
    Build a reproducible synthetic dataset with two numerical columns,
    two categorical columns and a binary label.

    Args:
        N: size of DataFrame we want to create (number of rows)
        seed: seed of the random stream; the default (42) reproduces the
            data this function has always returned
    Returns:
        Random DataFrame with columns num1, num2, cat1, cat2 and label
    """
    # A private RandomState produces the same stream as calling
    # np.random.seed(seed) followed by np.random.* draws, but it does not
    # reseed NumPy's global generator as a side effect of building the frame.
    rng = np.random.RandomState(seed)

    # Create numerical columns (draw order matters for reproducibility)
    num1 = rng.uniform(size=(N,))
    num2 = rng.normal(loc=100, scale=10, size=(N,))

    # Create categorical columns
    cat1 = rng.choice(["m", "f"], p=[0.8, 0.2], size=(N,))
    cat2 = rng.choice(["a", "b", "c", "d"], size=(N,))

    # Imbalanced binary target (roughly 70% zeros / 30% ones)
    label = rng.choice([0, 1], p=[0.7, 0.3], size=(N,))

    # Return dataframe
    df = pd.DataFrame(
        {"num1": num1, "num2": num2, "cat1": cat1, "cat2": cat2, "label": label}
    )
    return df


# Materialize the 2000-row demo dataset and peek at its last rows
df = create_df(N=2000)

df.tail(20)

Unnamed: 0,num1,num2,cat1,cat2,label
1980,0.186101,98.965688,m,d,0
1981,0.802643,87.609473,m,c,0
1982,0.458187,120.958435,m,b,0
1983,0.482969,115.944424,m,a,1
1984,0.13348,106.78947,m,a,1
1985,0.080602,91.875977,m,c,0
1986,0.727939,99.509707,f,b,0
1987,0.496461,98.396399,m,d,0
1988,0.436851,103.308513,m,b,0
1989,0.729508,114.510014,m,d,0


# Train test split

In [3]:
# Separate the target column from the remaining feature columns
y = df["label"]
X = df.drop(columns=["label"])

In [4]:
# Stratified hold-out split: 20% of the rows go to the test set, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessing

## Numerical features

In [5]:
# Learn the scaling statistics on the training split, then apply them to it
scaler = StandardScaler(with_std=True, with_mean=True)
scaler.fit(X_train[["num1", "num2"]])
X_train_scaled = scaler.transform(X_train[["num1", "num2"]])
X_train_scaled

array([[ 1.68087518,  1.22030441],
       [ 1.14230242,  0.18202383],
       [-1.19218411,  0.04376558],
       ...,
       [ 0.0229096 ,  1.59490283],
       [-0.67179381, -1.55402357],
       [-1.22968644, -0.42885218]])

## Categorical features

In [6]:
# One-hot encode the categorical columns and convert the sparse result
# into a dense ndarray so it can be concatenated with the scaled columns
ohe = OneHotEncoder()
X_train_ohe = ohe.fit_transform(X_train[["cat1", "cat2"]]).toarray()

In [7]:
# Place the scaled numerical columns and the one-hot columns side by side
X_train_prepared = np.hstack((X_train_scaled, X_train_ohe))

X_train_prepared

array([[ 1.68087518,  1.22030441,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.14230242,  0.18202383,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-1.19218411,  0.04376558,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.0229096 ,  1.59490283,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.67179381, -1.55402357,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-1.22968644, -0.42885218,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [8]:
# Reuse the transformers fitted on the training split (transform only, no refit)
X_test_scaled = scaler.transform(X_test[["num1", "num2"]])
X_test_ohe = ohe.transform(X_test[["cat1", "cat2"]]).toarray()

# Same column layout as the training matrix: numerical first, one-hot second
X_test_prepared = np.hstack((X_test_scaled, X_test_ohe))
X_test_prepared

array([[ 0.86019   ,  0.01704258,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.07393684, -0.43789244,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.30917087,  1.39920935,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.6284433 , -0.96293008,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.10661195,  1.51193631,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70634851,  1.90976143,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

# Classifier

In [9]:
# Baseline model trained on the manually prepared features (fixed seed)
rf = RandomForestClassifier(random_state=42)
rf.fit(X=X_train_prepared, y=y_train)

RandomForestClassifier(random_state=42)

In [10]:
# Predict on the held-out split and print the per-class metrics report
y_pred = rf.predict(X=X_test_prepared)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.91      0.81       288
           1       0.36      0.12      0.19       112

    accuracy                           0.69       400
   macro avg       0.54      0.52      0.50       400
weighted avg       0.63      0.69      0.64       400



# Better way of setting a preprocessing pipeline

In [11]:
# Numerical branch: select the numerical columns, then pass them to the Scaler
num_pipeline = Pipeline(
    [
        ("selector", Selector(["num1", "num2"])),
        ("scaler", Scaler(with_mean=True, with_std=True)),
    ]
)

# Categorical branch: select the categorical columns, then pass them to
# the OneHotEncoding step
cat_pipeline = Pipeline(
    [
        ("selector", Selector(["cat1", "cat2"])),
        ("ohe", OneHotEncoding()),
    ]
)


# Combine both branches; each one receives the same input and their
# outputs are concatenated (numerical branch first)
feature_union = FeatureUnion(
    [
        ("num_pipeline", num_pipeline),
        ("categorical_pipeline", cat_pipeline),
    ]
)

In [12]:
# Fit every branch of the union on the training split and transform it
X_train_prepared_2 = feature_union.fit_transform(X_train, y_train)

# Apply the already-fitted union to the test split (transform only, no refit)
X_test_prepared_2 = feature_union.transform(X_test)

In [13]:
# Train on the features produced by the FeatureUnion pipeline so the model is
# fitted and evaluated on the same preprocessing path. Fitting on the manually
# prepared matrix here would only work if both paths happened to produce
# identical column layouts.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_prepared_2, y_train)

# Evaluate on the pipeline-transformed test split
y_pred = rf.predict(X_test_prepared_2)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.73      0.91      0.81       288
           1       0.36      0.12      0.19       112

    accuracy                           0.69       400
   macro avg       0.54      0.52      0.50       400
weighted avg       0.63      0.69      0.64       400

