In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

from itertools import product

### **Load Data**

In [4]:
data = pd.read_csv("data/adult.csv.zip")

In [5]:
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [6]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


## **Data Quality**

In this part, let's check 2 things:
 - Missing values -> No missing values in this problem
 - Data types -> Some data types need to be changed
    - `capital_gain` -> `float`
    - `capital_loss` -> `float`
    - `age` -> `float`

In [7]:
data = data.astype({
    "fnlwgt": float,
    "capital-gain": float,
    "capital-loss": float,
    "hours-per-week": float,
    "age": float
})

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              48842 non-null  float64
 1   workclass        48842 non-null  object 
 2   fnlwgt           48842 non-null  float64
 3   education        48842 non-null  object 
 4   educational-num  48842 non-null  int64  
 5   marital-status   48842 non-null  object 
 6   occupation       48842 non-null  object 
 7   relationship     48842 non-null  object 
 8   race             48842 non-null  object 
 9   gender           48842 non-null  object 
 10  capital-gain     48842 non-null  float64
 11  capital-loss     48842 non-null  float64
 12  hours-per-week   48842 non-null  float64
 13  native-country   48842 non-null  object 
 14  income           48842 non-null  object 
dtypes: float64(5), int64(1), object(9)
memory usage: 5.6+ MB


## **Exercise:** Build the following pipeline

1. Preprocessing
   1. OHE to all columns except `workclass`
   2. OrdinalEncoder for `workclass`
   3. StandardScaler for all resulting columns + numerical ones
2. Feature Selection technique (SelectKBest)
3. Model training

**How to build the pipeline**

1. Pipeline - `final`
   1. Pipeline - `preprocessing`
      1. ColumnTransformer - `cat_preprocessing`
         1. OneHotEncoder
         1. OrdinalEncoder
      1. StandardScaler
   1. Feature Selection
   1. Logistic Regression

### **Prepare data**

In [9]:
X = data.drop(columns=["income"])
y = data["income"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

In [10]:
# Let's define our final pipeline

steps = []

### **Preprocessing**

1. Build a **ColumnTransformer** for the Categorical variables

In [11]:
categorical_columns = X.select_dtypes(["O","int"]).columns

In [12]:
cat_preprocessing = ColumnTransformer(
    [
        (
            "ohe",
            OneHotEncoder(sparse_output=False),
            categorical_columns.drop("workclass")
        ),
        (
            "ordinal",
            OrdinalEncoder(),
            ["workclass"]
        )
    ],
    remainder="passthrough"  # this can be "drop", "passthrough", or another Estimator
)

2. Build a **Pipeline** to scale all resulting features

In [13]:
preprocessing = Pipeline([
    ("cat_preprocessing", cat_preprocessing),
    ("scaler", StandardScaler())
])

Add the preprocessing step to the final pipeline

In [14]:
steps.append(("preprocessing", preprocessing))

### **Feature Selection**

In [15]:
fs = SelectKBest(score_func=f_classif, k=10)

Add the feature selection step to the final pipeline

In [16]:
steps.append(("feature_selection", fs))

### **Model**

In [17]:
lr = LogisticRegression(solver="liblinear")

search_space = {
    "C": np.logspace(-4,4,1000)
}

rs = RandomizedSearchCV(lr, search_space)

Add the model to the pipeline

In [18]:
steps.append(("model", rs))

### **Build the Pipeline**

In [20]:
final = Pipeline(
    steps=steps
)

In [21]:
final

### **Apply the whole process to original data**

In [22]:
%%time

final.fit(X_train, y_train)

CPU times: user 7.95 s, sys: 7.22 s, total: 15.2 s
Wall time: 2.12 s


### **Evaluate the model**

#### **Evaluate on training**

In [108]:
pred = final.predict(X_train)
probas = final.predict_proba(X_train)

In [109]:
precision_test = precision_score(y_train, pred, pos_label=">50K")
recall_test = recall_score(y_train, pred, pos_label=">50K")
f1_test = f1_score(y_train, pred, pos_label=">50K")
roc_auc_test = roc_auc_score(y_train, probas[:,1])

In [110]:
print(f"Train Precision: {round(precision_test,3)}")
print(f"Train Recall: {round(recall_test,3)}")
print(f"Train F1: {round(f1_test,3)}")
print(f"Train ROC_AUC: {round(roc_auc_test,3)}")

Train Precision: 0.685
Train Recall: 0.398
Train F1: 0.503
Train ROC_AUC: 0.86


#### **Evaluate on test**

In [111]:
pred = final.predict(X_test)
probas = final.predict_proba(X_test)

In [112]:
precision_test = precision_score(y_test, pred, pos_label=">50K")
recall_test = recall_score(y_test, pred, pos_label=">50K")
f1_test = f1_score(y_test, pred, pos_label=">50K")
roc_auc_test = roc_auc_score(y_test, probas[:,1])

In [113]:
print(f"Test Precision: {round(precision_test,3)}")
print(f"Test Recall: {round(recall_test,3)}")
print(f"Test F1: {round(f1_test,3)}")
print(f"Test ROC_AUC: {round(roc_auc_test,3)}")

Test Precision: 0.69
Test Recall: 0.398
Test F1: 0.505
Test ROC_AUC: 0.861


## Access the elements inside the Pipeline

In [119]:
final.named_steps

{'preprocessing': Pipeline(steps=[('cat_preprocessing',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('ohe',
                                                   OneHotEncoder(sparse_output=False),
                                                   Index(['education', 'educational-num', 'marital-status', 'occupation',
        'relationship', 'race', 'gender', 'native-country'],
       dtype='object')),
                                                  ('ordinal', OrdinalEncoder(),
                                                   ['workclass'])])),
                 ('scaler', StandardScaler())]),
 'feature_selection': SelectKBest(),
 'model': RandomizedSearchCV(estimator=LogisticRegression(solver='liblinear'),
                    param_distributions={'C': array([1.00000000e-04, 1.01861017e-04, 1.03756668e-04, 1.05687597e-04,
        1.07654461e-04, 1.09657929e-04, 1.11698682e-04, 1.13777413e-04,
        1.15894830e-04, 1.