In [None]:
import os
import json
import joblib
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.preprocessing import OneHotEncoder


In [4]:
# Read the price-sensitivity dataset from the working directory, then preview
# the first rows to sanity-check the columns.
data_file = "price_sensitivity_data.csv"
df = pd.read_csv(data_file)
df.head()


Unnamed: 0,customer_id,total_spent,avg_order_value,avg_purchase_frequency,days_since_last_purchase,discount_behavior,loyalty_program_member,days_in_advance,flight_type,cabin_class,will_buy_after_price_increase
0,1,238.3,123.55,0.97,40,0.75,0,24,international,economy,0
1,2,207.45,136.67,0.65,2,0.27,1,62,domestic,economy,0
2,3,119.12,67.15,1.51,22,0.59,1,59,international,economy,0
3,4,679.57,148.34,2.86,36,0.64,0,25,international,business,0
4,5,131.6,163.9,0.64,50,0.74,0,66,domestic,economy,0


In [5]:
# Class balance of the target: number of rows per label.
target_counts = df["will_buy_after_price_increase"].value_counts()
target_counts


will_buy_after_price_increase
0    6802
1    3198
Name: count, dtype: int64

In [6]:
# Input columns, split into the two groups that are preprocessed differently.
numeric_features = [
    "total_spent", "avg_order_value", "avg_purchase_frequency",
    "days_since_last_purchase", "discount_behavior",
    "loyalty_program_member", "days_in_advance",
]
categorical_features = ["flight_type", "cabin_class"]

# Feature frame (numeric columns first, then categorical) and integer target.
feature_columns = [*numeric_features, *categorical_features]
X = df[feature_columns]
y = df["will_buy_after_price_increase"].astype(int)

In [7]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [12]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
preprocess = ColumnTransformer(
    transformers=[
        # NOTE: the step name "numurical" (sic) is kept unchanged because it is
        # part of the pipeline's parameter and output-feature names.
        ("numurical", StandardScaler(), numeric_features),
        (
            "categorical",
            # Deliberately no `drop="first"`: together with
            # handle_unknown="ignore" an unseen category is encoded as all
            # zeros, which is exactly the encoding of the dropped reference
            # level, so unknown values would silently be scored as that level.
            # The classifier below is L2-penalised, so keeping every level does
            # not cause a collinearity problem.
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ]
)

# Logistic regression on the preprocessed features.
# - class_weight="balanced" compensates for the roughly 2:1 imbalance between
#   the negative and positive class.
# - random_state pins the data shuffling done by the liblinear solver so that
#   refitting the notebook gives the same coefficients (same seed as the split).
model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        (
            "clf",
            LogisticRegression(
                max_iter=1000,
                class_weight="balanced",
                solver="liblinear",
                random_state=42,
            ),
        ),
    ]
)

# Fit preprocessing and classifier together on the training split only, so the
# scaler statistics and encoder categories never see the test rows.
model.fit(X_train, y_train)

In [None]:
# Score the held-out split: hard labels feed the per-class report, while the
# probability of the positive class (second column) feeds ROC-AUC.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

report_text = classification_report(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_proba)

print("Validation report\n", report_text)
print("ROC‑AUC:", round(auc_score, 3))

Validation report
               precision    recall  f1-score   support

           0       0.81      0.67      0.73      1360
           1       0.49      0.68      0.57       640

    accuracy                           0.67      2000
   macro avg       0.65      0.67      0.65      2000
weighted avg       0.71      0.67      0.68      2000

ROC‑AUC: 0.733


In [14]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.

# Create the output folder if it is not there yet; an existing one is fine.
output_dir = Path("models")
output_dir.mkdir(exist_ok=True)

joblib.dump(model, "models/model.pkl")


['models/model.pkl']