# **Exercise 3: Predictive Modeling**

In [312]:
import json
from urllib.parse import quote_plus

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine
from xgboost import XGBClassifier

### 1. Function to load data from PostgreSQL, merge datasets and identify target column

In [313]:
def load_postgresql_data(db_url, claims_since="2024-01-01"):
    """Load the insurance tables from PostgreSQL and build the modeling frame.

    Reads the ``customers``, ``policies``, ``claims`` and ``risk_indicators``
    tables, left-joins ``customers`` and then ``risk_indicators`` onto
    ``policies`` using ``customer_id``, and appends a binary ``target`` column.

    Parameters
    ----------
    db_url : str
        SQLAlchemy connection URL passed to ``create_engine``.
    claims_since : str or datetime-like, default "2024-01-01"
        Lower bound (inclusive) compared against ``claims["claim_date"]``.
        A row gets ``target == 1`` when its ``customer_id`` appears in at
        least one claim on or after this date, otherwise ``0``.

    Returns
    -------
    pandas.DataFrame
        The merged frame, including the ``target`` column.
    """
    print("Connecting to PostgreSQL...")
    engine = create_engine(db_url)

    try:
        customers = pd.read_sql("SELECT * FROM customers", con=engine)
        policies = pd.read_sql("SELECT * FROM policies", con=engine)
        claims = pd.read_sql("SELECT * FROM claims", con=engine)
        risk_indicators = pd.read_sql("SELECT * FROM risk_indicators", con=engine)
    finally:
        # Release the pooled connections; the engine is local to this call.
        engine.dispose()

    print("Data loaded successfully!")

    # merge datasets: policies is the left table, so its rows drive the result
    df = policies.merge(customers, on="customer_id", how="left")
    df = df.merge(risk_indicators, on="customer_id", how="left")

    # our target => the customer has a claim on/after the cutoff date
    recent_claims = claims[claims["claim_date"] >= claims_since]
    df["target"] = df["customer_id"].isin(recent_claims["customer_id"]).astype(int)

    # DataFrame.info() prints its summary itself and returns None, so it must
    # not be wrapped in print() (that would emit a trailing "None").
    df.info()
    return df

### 2. Function to identify features, transform dataset and define transformers

In [314]:
def preprocess_data(df):
    """Drop unused columns and build the feature preprocessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged modeling frame. It is NOT modified; a reduced copy is returned.

    Returns
    -------
    tuple
        ``(df, preprocessor)`` where ``df`` is a new frame without the
        identifier/contact columns and ``preprocessor`` is an unfitted
        ``ColumnTransformer`` that standard-scales the numerical features and
        one-hot encodes the categorical features.
    """
    # separate categorical and numerical features
    categorical_features = ["policy_type", "state"]
    numerical_features = [
        "property_risk_score",
        "health_risk_score",
        "driving_violations",
    ]

    # identifier / contact columns that are never used as features
    unused_columns = [
        "customer_id",
        "policy_id",
        "created_at",
        "customer_name",
        "email",
        "phone_number",
    ]

    # Non-inplace drop: the caller's frame stays intact. errors="ignore" makes
    # the function safe to call again on an already-reduced frame (e.g. when
    # the notebook cell is re-run), where these columns no longer exist.
    df = df.drop(columns=unused_columns, errors="ignore")

    # define transformers; only the columns listed above are passed to them.
    # Any other column in the frame falls under ColumnTransformer's
    # `remainder` setting, which is left at its default here.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numerical_features),
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                categorical_features,
            ),
        ]
    )

    return df, preprocessor

### 3. Function to train and evaluate the models (with and without hyperparameter tuning)

In [315]:
def train_models(X_train, X_test, y_train, y_test, preprocessor, models, param_grids=None):
    """Fit each model inside a preprocessing pipeline and score it on the test set.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames passed to the pipeline.
    y_train, y_test : array-like
        Labels. The metrics are read from the ``"1"`` entry of the
        classification report, so the positive class must be labelled ``1``.
    preprocessor : transformer
        First pipeline step. The same object is reused for every model, and
        fitting a pipeline fits it in place.
    models : dict
        Maps a display name to a classifier that supports ``predict_proba``.
    param_grids : dict, optional
        Maps a model name to a ``GridSearchCV`` parameter grid (keys prefixed
        with ``classifier__``). Models with an entry here are tuned with
        ROC-AUC as the selection metric; the others are fitted as given.

    Returns
    -------
    dict
        ``{name: {"Precision", "Recall", "F1-Score", "ROC-AUC"}}`` with every
        value rounded to three decimals.
    """
    results = {}

    for name, model in models.items():
        pipeline = Pipeline(
            steps=[("preprocessor", preprocessor), ("classifier", model)]
        )
        if param_grids and name in param_grids:
            print(f"Tuning hyperparameters for {name}...")
            grid_search = GridSearchCV(
                pipeline, param_grids[name], scoring="roc_auc"
            )
            grid_search.fit(X_train, y_train)

            print(f"Best parameters for {name}: {grid_search.best_params_}")
            # continue with the pipeline that carries the winning parameters
            pipeline = grid_search.best_estimator_
        else:
            pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        # column 1 of predict_proba is used as the positive-class score
        y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

        # evaluate models using different metrics; build the report once and
        # read all positive-class metrics from it
        positive_class = classification_report(y_test, y_pred, output_dict=True)["1"]
        roc_auc = roc_auc_score(y_test, y_pred_prob)

        results[name] = {
            "Precision": round(positive_class["precision"], 3),
            "Recall": round(positive_class["recall"], 3),
            "F1-Score": round(positive_class["f1-score"], 3),
            "ROC-AUC": round(roc_auc, 3),
        }
    return results

### 4. Load PostgreSQL credentials from config file and load data

In [316]:
# Read the PostgreSQL connection settings; the file is only needed for parsing,
# so it is closed before the values are used.
with open("../config.json", "r") as f:
    config = json.load(f)

db_name = config["DB_NAME"]
db_user = config["DB_USER"]
db_password = config["DB_PASSWORD"]
db_host = config["DB_HOST"]
db_port = int(config["DB_PORT"])

# URL-encode the credentials: characters such as "@", ":" or "/" in a user name
# or password would otherwise be parsed as URL delimiters.
# NOTE(review): assumes config.json stores the raw (not pre-encoded) password.
DB_URL = (
    f"postgresql://{quote_plus(str(db_user))}:{quote_plus(str(db_password))}"
    f"@{db_host}:{db_port}/{db_name}"
)
# Module-level engine handle; load_postgresql_data builds its own engine from DB_URL.
engine = create_engine(DB_URL)
# load data, merge datasets and create target column
df = load_postgresql_data(DB_URL)

Connecting to PostgreSQL...
Data loaded successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   policy_id            995 non-null    object        
 1   customer_id          995 non-null    object        
 2   policy_type          995 non-null    object        
 3   created_at           995 non-null    datetime64[ns]
 4   customer_name        995 non-null    object        
 5   date_of_birth        995 non-null    datetime64[ns]
 6   phone_number         995 non-null    object        
 7   email                995 non-null    object        
 8   street_address       995 non-null    object        
 9   state                995 non-null    object        
 10  post_code            995 non-null    object        
 11  iban                 995 non-null    object        
 12  job                  995 non-null    o

### 5. Prepare dataset for training

In [317]:
# Remove the identifier/contact columns and build the (unfitted) ColumnTransformer.
# `df` is rebound to the returned frame, so later cells see the reduced column set.
df, preprocessor = preprocess_data(df)

### 6. Split data

In [318]:
# Separate the label from the candidate feature columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
# NOTE(review): rows come from the policies table; if one customer can hold
# several policies, the same customer may appear in both train and test --
# confirm, and consider a group-aware split if so.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

### 7. Train models (logistic regression, random forest and XGBoost) with fixed, untuned parameters

In [319]:
# Baseline candidates with hand-picked (untuned) settings.
# random_state is fixed on the stochastic learners (same seed as the
# train/test split) so the comparison table is reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, max_depth=15, random_state=5
    ),
    "XGBoost": XGBClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=6, random_state=5
    ),
}

# No param_grids argument: every model is fitted exactly as configured above.
model_results = train_models(X_train, X_test, y_train, y_test, preprocessor, models)
# One row per model, one column per metric.
results_df = pd.DataFrame(model_results).T
print("Model Comparison using default parameters:")
print(results_df)

Model Comparison using default parameters:
                     Precision  Recall  F1-Score  ROC-AUC
Logistic Regression      0.537   0.483     0.509    0.579
Random Forest            0.833   0.674     0.745    0.859
XGBoost                  0.758   0.843     0.798    0.864


### 8. Train best models (Random Forest and XGBoost) with tuned hyperparameters

In [320]:
# Only the two tree ensembles are tuned. random_state is fixed (same seed as
# the train/test split) so the grid search picks the same winner on every run.
models = {
    "Random Forest": RandomForestClassifier(random_state=5),
    "XGBoost": XGBClassifier(random_state=5),
}
# Grid keys carry the "classifier__" prefix because the estimator is the
# "classifier" step of the pipeline built in train_models.
# NOTE(review): in the recorded run the best values were the largest ones
# offered (RF max_depth=30 / n_estimators=200, XGBoost max_depth=15 /
# n_estimators=200) -- consider widening the grids.
param_grids = {
    "Random Forest": {
        "classifier__n_estimators": [100, 150, 200],
        "classifier__max_depth": [20, 25, 30],
    },
    "XGBoost": {
        "classifier__n_estimators": [100, 150, 200],
        "classifier__learning_rate": [0.1, 0.2, 0.3],
        "classifier__max_depth": [5, 10, 15],
    },
}
model_results = train_models(X_train, X_test, y_train, y_test, preprocessor, models, param_grids)
# One row per model, one column per metric.
results_df = pd.DataFrame(model_results).T
print("Model Comparison using tunned parameters:")
print(results_df)

Tuning hyperparameters for Random Forest...
Best parameters for Random Forest: {'classifier__max_depth': 30, 'classifier__n_estimators': 200}
Tuning hyperparameters for XGBoost...
Best parameters for XGBoost: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 15, 'classifier__n_estimators': 200}
Model Comparison using tunned parameters:
               Precision  Recall  F1-Score  ROC-AUC
Random Forest      0.793    0.82     0.807    0.918
XGBoost            0.880    0.91     0.895    0.949
