In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


 ### Load dataset ###
def Load_csv(path_in: str = "hotel_bookings_cleaned.csv") -> pd.DataFrame:
    """
    Read the CSV file at `path_in` and return its contents as a DataFrame.

    Side effect: prints the path that was read and the row/column counts.
    """
    frame = pd.read_csv(path_in)
    n_rows, n_cols = frame.shape
    # Short load report so the notebook reader can see what was picked up
    print(f"Loaded: {path_in}")
    print(f"Shape: {n_rows:,} rows × {n_cols} columns\n")
    return frame

## Feature: `cxl_risk_score` — Rule-Based Cancellation Risk (0–100)

**Goal.** Turn raw booking fields into a single, interpretable cancellation-risk score to guide pricing, overbooking buffers, and inventory allocation.

### Components and formula
Five drivers are each min-max normalised to [0,1], then combined as a normalised weighted sum × 100:
1. **Lead time** — bookings with longer lead times tend to cancel more (more time to reconsider the booking).
2. **Off-season flag** — bookings in defined off-season months are riskier (lower demand for rooms).
3. **Single-party flag** — `adults == 1` and `children == babies == 0` (lower commitment when travelling alone).
4. **Market historical cancel rate** — average `is_canceled` by `market_segment` (historical behaviour).
5. **Customer unpredictability** — unpredictable customer behaviour or environment (simulated with a uniform random draw).

**Conceptual formula**
$$
\text{cxl_risk_score}= \big(
w_1 \cdot \text{minmax(lead_time)} +
w_2 \cdot \text{minmax(off_season_flag)} +
w_3 \cdot \text{minmax(single_party_flag)} +
w_4 \cdot \text{minmax(market_cxl_rate)} +
w_5 \cdot \text{minmax(customer_randomness)}
\big) \div \sum_{i} |w_i| \times 100
$$

### Choosing `off_season_months`
- **Business rationale:** Lower demand often drives higher cancellation/no-show rates.
- **Assumption for this project:** The 3 months with the fewest records are treated as off-season: `{January, November, December}`.

### Setting the weights
- `lead_time`: **0.225** (overplanning tendency)
- `off_season_flag`: **0.225** (seasonality)
- `single_party_flag`: **0.225** (commitment proxy)
- `market_cxl_rate`: **0.225** (segment behaviour)
- `customer_randomness`: **0.1** (random risk)

**Why equal weights (except random risk)?**  
- Keeps the method transparent and defensible without modelling.  
- Avoids over-emphasising any single driver when business priors are uncertain.  

### Interpreting the score
- **0–20:** Very low risk — short lead time, in-season, families/parties, low-risk segment
- **20–50:** Low–moderate risk — monitor; modest overbooking buffer acceptable
- **50–80:** Elevated risk — consider deposits/prepayment or tighter allocation
- **80–100:** High risk — strong chance of cancellation; use stricter terms or larger buffer

In [38]:
import pandas as pd
import numpy as np

### 1) Config Dictionary ###
def config_dict(lead_time_w: float = 0.225,
                off_season_w: float = 0.225,
                single_party_w: float = 0.225,
                market_hist_w: float = 0.225,
                customer_randomness_w: float = 0.1,
                off_season_top_k: int = 3) -> dict:
    """
    Bundle the scoring parameters into a single configuration dictionary.

    Returns a dict with two entries:
      - "weights": mapping of component name -> weight
      - "off_season_top_k": number of least frequent months treated as off-season
    """
    weights = dict(
        lead_time=lead_time_w,
        off_season=off_season_w,
        single_party=single_party_w,
        market_hist=market_hist_w,
        customer_randomness=customer_randomness_w,
    )
    # off_season_top_k = how many of the least frequent months count as off-season
    return dict(weights=weights, off_season_top_k=off_season_top_k)

### 2) Create Min-max normalisation function ###
def _minmax(x: pd.Series) -> pd.Series:
    """
    Rescale a driver to [0, 1] with min-max normalisation.

    The input is first converted with `pd.to_numeric`. When the range is
    degenerate (min or max is NaN, or min equals max) a Series of zeros with
    the same index is returned instead of dividing by zero.
    """
    values = pd.to_numeric(x)
    lo = values.min()
    hi = values.max()
    degenerate = pd.isna(lo) or pd.isna(hi) or hi == lo
    if not degenerate:
        return (values - lo) / (hi - lo)
    # No spread to scale by: every entry becomes 0
    return pd.Series(np.zeros(len(values)), index=values.index)

### 3) Determine off-season months ###
def det_off_season_months(df: pd.DataFrame, k: int) -> set:
    """
    Pick the K months with the fewest records as off-season months.

    Parameters
    ----------
    df : DataFrame with an 'arrival_date_month' column.
    k  : number of least frequent months to select. Values <= 0 select nothing;
         if fewer than K distinct months exist, all of them are selected.

    Returns
    -------
    set of the selected month labels. Rows with a missing month are ignored.

    Ties in frequency are broken alphabetically by month label, so the result
    is deterministic regardless of row order.

    Side effect: prints the chosen months in ascending order of frequency.
    """
    months = df['arrival_date_month']

    counts = months.value_counts(dropna=True)  # counts by month
    # Order by label first, then apply a *stable* sort on frequency: months with
    # equal counts keep their alphabetical order. The default quicksort gives no
    # such guarantee, which made the selection at the K boundary arbitrary.
    counts = counts.sort_index().sort_values(ascending=True, kind="mergesort")
    n_pick = max(0, min(k, len(counts)))  # clamp K into [0, number of months]
    chosen = counts.index[:n_pick]
    print('Off-season months defined by demand: ', list(chosen))
    return set(chosen)

### 4) Create off-season flag ###
def off_season_component(df: pd.DataFrame, off_months: "int | set") -> pd.Series:
    """
    Create the off-season flag component (1 if the arrival month is off-season, else 0).

    Parameters
    ----------
    df         : DataFrame with an 'arrival_date_month' column.
    off_months : either
                   - an integer K: the K least frequent months are derived from
                     `df` via `det_off_season_months` (how the scoring pipeline
                     calls this function), or
                   - an explicit collection of month labels to treat as off-season.

    Returns
    -------
    Integer Series (0/1) on the same index as `df`. Rows with a missing month
    are flagged 0 because `isin` never matches NaN against the month labels.
    """
    if isinstance(off_months, (int, np.integer)):
        # Integer argument = "top K" request; derive the months from the data
        selected = det_off_season_months(df, off_months)
    else:
        # An explicit collection of month labels was supplied; use it directly
        selected = set(off_months)
    off_season_flag = df['arrival_date_month'].isin(selected).astype(int)
    return off_season_flag

### 5) Create market historical cancel rate ###
def market_cxl_component(df: pd.DataFrame) -> pd.Series:
    """
    Create the market historical cancel rate component
    (mean of `is_canceled` within each `market_segment`).

    Parameters
    ----------
    df : DataFrame with 'market_segment' and 'is_canceled' columns.

    Returns
    -------
    Float Series named 'market_cxl_rate', on the same index and in the same row
    order as `df`. Rows whose segment is missing get NaN. The input frame is
    not modified.

    Notes
    -----
    The per-segment rate is looked up with `Series.map` rather than
    `DataFrame.merge`: a merge resets the index (so the result would not align
    with a non-default index) and collides with an existing 'market_cxl_rate'
    column in `df`.
    """
    market_rate = df.groupby('market_segment')['is_canceled'].mean()
    if 'Undefined' in market_rate.index:
        market_rate.loc['Undefined'] = 1.0 # If `market_segment` is 'Undefined', default `market_cxl_rate = 1`
    rates = df['market_segment'].map(market_rate).astype(float)
    return rates.rename('market_cxl_rate')

### 6) Create single party flag ###
def single_party_component(df: pd.DataFrame) -> pd.Series:
    """
    Flag bookings made by a lone traveller: exactly one adult and no
    children or babies. Missing counts are treated as 0 before comparing.

    Returns an integer Series (1 = single party, 0 = otherwise).
    """
    adults, children, babies = (
        df[column].fillna(0) for column in ('adults', 'children', 'babies')
    )
    one_adult = adults == 1
    no_children = children == 0
    no_babies = babies == 0
    return (one_adult & no_children & no_babies).astype(int)

### 7) Customer unpredictability component ###
def customer_randomness_component(df: pd.DataFrame, seed: int | None = None) -> pd.Series:
    """
    Generate a customer unpredictability score in [0,1] randomly.
    """
    rng = np.random.default_rng(seed)
    return pd.Series(rng.random(len(df)), index=df.index, name="customer_randomness")

### 8) Compute cancellation risk score ###
def add_cxl_risk_score(df: pd.DataFrame, cfg: dict, rand_seed: int | None = None) -> pd.DataFrame:
    """
    Create rule-based cancellation risk score in [0,100] using:
      - lead_time            (minmax normalisation)
      - off_season_flag      (1 if month in the K least frequent months)
      - single_party_flag    (1 adult, no children/babies)
      - market_cxl_rate      (mean of is_canceled by market_segment)
      - customer_randomness  (Uniform[0,1])

    Parameters
    ----------
    df        : bookings frame with 'lead_time', 'arrival_date_month', 'adults',
                'children', 'babies', 'market_segment' and 'is_canceled' columns.
    cfg       : dict with a "weights" mapping (keys 'lead_time', 'off_season',
                'single_party', 'market_hist', 'customer_randomness') and an
                optional "off_season_top_k" (defaults to 3).
    rand_seed : seed forwarded to the random component; None gives fresh draws.

    Returns
    -------
    The same `df` object, modified in place with four added/overwritten columns:
    'market_cxl_rate', 'customer_randomness', 'cxl_risk_score', 'off_season_flag'.

    Raises
    ------
    KeyError if `cfg` lacks "weights" or one of the five weight keys.
    """

    w = cfg["weights"]

    off_season_flag = off_season_component(df, cfg.get("off_season_top_k", 3))
    # Hide any 'market_cxl_rate' left by a previous call so the lookup starts from
    # the raw columns; this keeps the function re-runnable on the same frame.
    market_input = df.drop(columns=["market_cxl_rate"], errors="ignore")
    # The component comes back in row order; assign by position so the result is
    # correct even when `df` does not use the default 0..n-1 index.
    df["market_cxl_rate"] = market_cxl_component(market_input).to_numpy()
    single_party_flag = single_party_component(df)
    df["customer_randomness"] = customer_randomness_component(df, seed=rand_seed).clip(0, 1)

    ### --- components --- ###
    comp_lead   = _minmax(df['lead_time']).fillna(1) # If `lead_time` is missing, default `comp_lead = 1`
    # If `arrival_date_month` is missing, default `comp_off = 1`. The flag itself is
    # 0 for missing months (isin never matches NaN), so the default has to be applied
    # from the month column; a plain fillna on the flag would never trigger.
    comp_off    = _minmax(off_season_flag).mask(df['arrival_date_month'].isna(), 1.0)
    comp_single = _minmax(single_party_flag).fillna(1) # Defensive only: the flag is computed as 0/1 and is never NaN
    comp_hist   = _minmax(df["market_cxl_rate"]).fillna(1) # If `market_segment` is missing, default `comp_hist = 1`
    comp_rand   = df["customer_randomness"] # Already in [0,1], no NaN possible

    ### --- weighted sum --- ###
    wsum = sum(abs(x) for x in w.values()) or 1.0 # fall back to 1.0 to avoid dividing by zero when all weights are 0
    raw_score = (
        w["lead_time"]    * comp_lead +
        w["off_season"]   * comp_off +
        w["single_party"] * comp_single +
        w["market_hist"]  * comp_hist +
        w["customer_randomness"] * comp_rand
    ) / wsum * 100.0
    # Restrict final score to [0,100]; a no-op for non-negative weights, but a
    # negative weight could otherwise push the score below zero.
    df["cxl_risk_score"] = raw_score.clip(0, 100)

    df["off_season_flag"] = off_season_flag  # For off-season months validation

    return df

### 9) Save enriched dataset ###
def Save_csv(df: pd.DataFrame, path_out: str = "hotel_bookings_credit_score.csv") -> pd.DataFrame:
    """
    Write `df` to `path_out` as CSV (index column omitted), print a
    confirmation line, and hand the same DataFrame back to the caller.
    """
    df.to_csv(path_out, index=False)
    confirmation = f"\nSaved cleaned dataset to: {path_out}"
    print(confirmation)
    return df


In [39]:
# End-to-end run: load the cleaned bookings, score them, save, then summarise.
df = Load_csv(path_in = "hotel_bookings_cleaned.csv")
# Default configuration: four drivers at 0.225, randomness at 0.1, K = 3 off-season months
cfg = config_dict()
# NOTE: add_cxl_risk_score writes its columns onto `df` and returns that same
# object, so `df2` and `df` refer to one frame after this line.
df2 = add_cxl_risk_score(df, cfg, rand_seed=1554262)  # fixed seed so the random component is reproducible
# Save_csv returns the frame it was given, so `enriched_df` is the same object as `df2`
enriched_df = Save_csv(df2, path_out="hotel_bookings_credit_score.csv")
# Summary statistics for the final score and the random component
df2[["cxl_risk_score","customer_randomness"]].describe()


Loaded: hotel_bookings_cleaned.csv
Shape: 87,396 rows × 32 columns

Off-season months defined by demand:  ['January', 'November', 'December']

Saved cleaned dataset to: hotel_bookings_credit_score.csv


Unnamed: 0,cxl_risk_score,customer_randomness
count,87396.0,87396.0
mean,19.334962,0.500649
std,12.519749,0.28887
min,0.076645,1.2e-05
25%,10.355578,0.251717
50%,14.922596,0.500681
75%,28.845671,0.749536
max,74.426682,0.999993


In [40]:
# Preview the first rows of the scored frame, including the columns added by add_cxl_risk_score
df2.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,market_cxl_rate,customer_randomness,cxl_risk_score,off_season_flag
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,0.147154,0.960654,20.714928,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,0.147154,0.773707,30.904478,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,0.147154,0.483812,28.219233,0
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,0.121083,0.778332,30.680204,0
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,Transient,98.0,0,1,Check-Out,2015-07-03,0.353462,0.060194,6.978186,0
