# Airline Customer Analytics

**Churn Modelling**

**Date:** 29 Jan, 2026

---

## Importing Required Packages

In [1]:
# --- Standard Library ---
import json
from pathlib import Path

# --- Data Analysis ---
import pandas as pd

# --- Machine Learning ---
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBClassifier

# --- File I/O ---
import joblib

---

## Load Data

In [2]:
# Read the curated customer feature table with the pyarrow engine,
# then preview its first five rows.
df = pd.read_parquet("../data/curated/customer_features.parquet", engine="pyarrow")
df.head(5)

Unnamed: 0,loyalty_number,province,city,gender,education,loyalty_card,clv,is_cancelled,tenure_months,recency,r_score,frequency,f_score,monetary,m_score,rfm_segment
0,480934,Ontario,Toronto,Female,Bachelor,Star,3839.14,False,34,0,5,37,3,54525,3,Loyal
1,549612,Alberta,Edmonton,Male,College,Star,3839.61,False,33,0,5,58,5,77487,5,Champions
2,429460,British Columbia,Vancouver,Male,College,Star,3839.75,True,42,11,2,18,2,24803,2,Dormant
3,608370,Ontario,Toronto,Male,College,Star,3839.75,False,70,0,5,35,3,48432,3,Loyal
4,530508,Quebec,Hull,Male,Bachelor,Star,3842.79,False,50,0,5,37,3,55515,4,Loyal


In [3]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16737 entries, 0 to 16736
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   loyalty_number  16737 non-null  int64  
 1   province        16737 non-null  string 
 2   city            16737 non-null  string 
 3   gender          16737 non-null  string 
 4   education       16737 non-null  string 
 5   loyalty_card    16737 non-null  string 
 6   clv             16737 non-null  float64
 7   is_cancelled    16737 non-null  bool   
 8   tenure_months   16737 non-null  int32  
 9   recency         16737 non-null  int64  
 10  r_score         16737 non-null  int64  
 11  frequency       16737 non-null  int64  
 12  f_score         16737 non-null  int64  
 13  monetary        16737 non-null  int64  
 14  m_score         16737 non-null  int64  
 15  rfm_segment     16737 non-null  string 
dtypes: bool(1), float64(1), int32(1), int64(7), string(6)
memory usage: 1.9 MB


## Churn Rule

In [4]:
# Churn rule: a customer is labelled 1 when the membership is cancelled OR the
# recency value has reached RECENCY_THRESHOLD; every other customer is 0.
RECENCY_THRESHOLD = 3

df["churn_label"] = (
    df["recency"].ge(RECENCY_THRESHOLD) | df["is_cancelled"]
).astype(int)

# Class balance of the new label, as proportions rounded to two decimals.
df["churn_label"].value_counts(normalize=True).round(2)


churn_label
0    0.78
1    0.22
Name: proportion, dtype: float64

In [5]:
# Mean churn label (i.e. churn rate) within each RFM segment, highest first.
(
    df.groupby("rfm_segment")["churn_label"]
    .mean()
    .sort_values(ascending=False)
)

rfm_segment
At Risk      1.000000
Dormant      1.000000
Potential    0.124236
Loyal        0.106682
Champions    0.039731
Name: churn_label, dtype: float64

## Train - Test Split

In [6]:
# Model inputs. `recency` and `is_cancelled` are not listed here; the churn
# label is computed directly from those two columns.
numeric_features = [
    "frequency",
    "monetary",
    "tenure_months",
    "clv",
]
categorical_features = [
    "province",
    "city",
    "gender",
    "education",
    "loyalty_card",
]

# Independent copy of the feature columns, and the target series.
X = df.loc[:, [*numeric_features, *categorical_features]].copy()
y = df["churn_label"]

In [7]:
# Preprocessing recipe used by the models below:
#   "num" - scale the numeric columns to unit variance without centring them
#   "cat" - one-hot encode the categoricals; categories unseen at fit time are
#           ignored at transform time instead of raising
preprocess = ColumnTransformer([
    ("num", StandardScaler(with_mean=False), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

In [8]:
# Hold out 20% of the customers for evaluation, stratified on the churn label,
# with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

## Logistic Regression

In [9]:
# Baseline churn model: preprocessing followed by a class-balanced logistic
# regression.
#
# clone() hands this pipeline its own unfitted copy of the preprocessing recipe.
# Pipeline.fit fits its steps in place, so passing `preprocess` itself would make
# every pipeline built from it share (and refit) one transformer object.
lr = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", LogisticRegression(max_iter=500, class_weight="balanced")),
])

lr.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive (churn) class;
# both metrics are computed on the held-out test split only.
proba_lr = lr.predict_proba(X_test)[:, 1]
print("LR+Cat ROC AUC:", round(roc_auc_score(y_test, proba_lr), 2))
print("LR+Cat PR AUC :", round(average_precision_score(y_test, proba_lr), 2))

LR+Cat ROC AUC: 0.84
LR+Cat PR AUC : 0.71


In [10]:
# Score every customer with the logistic-regression pipeline and weight the
# churn probability by customer lifetime value to get a priority score.
# NOTE(review): X contains the rows the model was fitted on as well as the test
# rows, so most of these probabilities are in-sample.
lr_df = df.assign(
    churn_prob=lr.predict_proba(X)[:, 1],
    priority_score=lambda scored: scored["churn_prob"] * scored["clv"],
)

# Keep only the reporting columns, ranked by priority score (highest first).
lr_preds = lr_df.loc[
    :,
    [
        "loyalty_number",
        "rfm_segment",
        "churn_label",
        "churn_prob",
        "clv",
        "priority_score",
    ],
].sort_values(by="priority_score", ascending=False)

lr_preds.head(15)

Unnamed: 0,loyalty_number,rfm_segment,churn_label,churn_prob,clv,priority_score
16736,652627,Dormant,1,0.86377,83325.38,71973.954499
9823,844145,Dormant,1,0.880172,74228.52,65333.830045
4150,838263,Dormant,1,0.869394,67907.27,59038.168066
4152,767366,Dormant,1,0.800713,73225.96,58633.003503
16730,672917,Dormant,1,0.874222,58166.55,50850.47194
9818,163405,Potential,0,0.823725,60556.19,49881.647683
4143,798869,Potential,0,0.800297,55277.45,44238.35484
9810,737027,Dormant,1,0.889051,49221.43,43760.38384
4138,529331,Dormant,1,0.880197,49221.43,43324.548269
9813,642031,Potential,0,0.838462,51337.91,43044.861404


In [None]:
# Summary statistics of the scored table. A notebook cell only renders its last
# expression automatically, so the describe() table is displayed explicitly
# (`display` is provided by the IPython kernel).
display(lr_preds.describe())

# Mean priority score per RFM segment, highest first.
lr_preds.groupby("rfm_segment")["priority_score"].mean().sort_values(ascending=False)

rfm_segment
Dormant      6780.347883
Potential    4696.249080
At Risk      2473.780566
Loyal        2250.274792
Champions    1209.277056
Name: priority_score, dtype: float64

In [12]:
# Persist the fitted LR pipeline, its test-split metrics and the scored table.
lr_artifact_dir = Path("../artifacts/churn/LR")
# Create the folder tree first so the writes below do not fail with
# FileNotFoundError when the directory is missing.
lr_artifact_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(lr, lr_artifact_dir / "model.pkl")

# Cast to built-in float/int so the values are JSON-serialisable.
metrics = {
    "recency_threshold_months": RECENCY_THRESHOLD,
    "roc_auc": float(roc_auc_score(y_test, proba_lr)),
    "pr_auc": float(average_precision_score(y_test, proba_lr)),
    "churn_rate": float(df["churn_label"].mean()),
    "n_customers": int(len(df)),
}

with open(lr_artifact_dir / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

lr_preds.to_parquet(lr_artifact_dir / "predictions.parquet", index=False)


## XGBoost

In [13]:
# Gradient-boosted churn model: preprocessing followed by an XGBoost classifier.
#
# clone() hands this pipeline its own unfitted copy of the preprocessing recipe.
# Pipeline.fit fits its steps in place, so passing `preprocess` itself would
# refit a transformer object that other pipelines may also hold.
xgb = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="aucpr",
        tree_method="hist",
        random_state=42
    ))
])

xgb.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive (churn) class;
# both metrics are computed on the held-out test split only.
proba_xgb = xgb.predict_proba(X_test)[:, 1]
print("XGB ROC AUC:", round(roc_auc_score(y_test, proba_xgb), 2))
print("XGB PR AUC :", round(average_precision_score(y_test, proba_xgb), 2))

XGB ROC AUC: 0.88
XGB PR AUC : 0.83


In [14]:
# Score every customer with the XGBoost pipeline and weight the churn
# probability by customer lifetime value to get a priority score.
# NOTE(review): X contains the rows the model was fitted on as well as the test
# rows, so most of these probabilities are in-sample.
xgb_df = df.assign(
    churn_prob=xgb.predict_proba(X)[:, 1],
    priority_score=lambda scored: scored["churn_prob"] * scored["clv"],
)

# Keep only the reporting columns, ranked by priority score (highest first).
xgb_preds = xgb_df.loc[
    :,
    [
        "loyalty_number",
        "rfm_segment",
        "churn_label",
        "churn_prob",
        "clv",
        "priority_score",
    ],
].sort_values(by="priority_score", ascending=False)

xgb_preds.head(15)

Unnamed: 0,loyalty_number,rfm_segment,churn_label,churn_prob,clv,priority_score
16736,652627,Dormant,1,0.99998,83325.38,83323.731096
9823,844145,Dormant,1,0.999964,74228.52,74225.847684
4152,767366,Dormant,1,0.974616,73225.96,71367.204675
4150,838263,Dormant,1,0.999983,67907.27,67906.136675
16730,672917,Dormant,1,0.99998,58166.55,58165.371221
5806,333051,Potential,1,0.985515,51426.25,50681.316166
9810,737027,Dormant,1,0.999994,49221.43,49221.15422
4138,529331,Dormant,1,0.99997,49221.43,49219.974822
9809,943393,Dormant,1,0.999986,48356.96,48356.291307
9805,257194,Dormant,1,0.99992,46770.95,46767.225545


In [15]:
# Persist the fitted XGBoost pipeline, its test-split metrics and the scored table.
xgb_artifact_dir = Path("../artifacts/churn/XGBoost")
# Create the folder tree first so the writes below do not fail with
# FileNotFoundError when the directory is missing.
xgb_artifact_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(xgb, xgb_artifact_dir / "model.pkl")

# Cast to built-in float/int so the values are JSON-serialisable.
# n_customers is taken from `df`, the same source the LR metrics cell uses.
metrics = {
    "recency_threshold_months": RECENCY_THRESHOLD,
    "roc_auc": float(roc_auc_score(y_test, proba_xgb)),
    "pr_auc": float(average_precision_score(y_test, proba_xgb)),
    "churn_rate": float(df["churn_label"].mean()),
    "n_customers": int(len(df)),
}

with open(xgb_artifact_dir / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

xgb_preds.to_parquet(xgb_artifact_dir / "predictions.parquet", index=False)